//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
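//
// As a rough illustrative sketch (the generated IR is more involved, and a, b
// and n are placeholders), a loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1.0f;
//
// is widened with VF = 4 into something shaped like
//
//   for (i = 0; i < n - n % 4; i += 4)
//     a[i..i+3] = b[i..i+3] + 1.0f;   // one wide iteration, <4 x float> ops
//   for (/* remainder */; i < n; ++i)
//     a[i] = b[i] + 1.0f;             // scalar remainder (epilogue) loop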
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and lists the available options.
// I.e., the vectorizer will try to fold the tail loop (epilogue) into the
// vector body and predicate the instructions accordingly.
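//
// As a rough illustrative sketch: with VF = 4 and a trip count of 10, tail
// folding replaces the 2-iteration scalar remainder with a third masked
// vector iteration whose lanes for i >= 10 are disabled by the predicate,
// e.g. masked loads/stores with mask <1, 1, 0, 0> on the last iteration.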
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable
// verification of the H-CFGs built.
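//
// An illustrative (untested) invocation combining these flags might look like:
//   opt -S -loop-vectorize -enable-vplan-native-path \
//       -vplan-build-stress-test -vplan-verify-hcfg input.ll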
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
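  ///
  /// As a rough sketch of the scalar-to-vector case (illustrative names and
  /// values only): if a scalarized definition produced lanes %s0..%s3 for
  /// VF = 4, a vector use would be satisfied by packing them on demand:
  ///
  ///   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  ///   %v1 = insertelement <4 x i32> %v0,   i32 %s1, i32 1
  ///   %v2 = insertelement <4 x i32> %v1,   i32 %s2, i32 2
  ///   %v3 = insertelement <4 x i32> %v2,   i32 %s3, i32 3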
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The loop exit block may have single-value PHI nodes with some incoming
  /// value. While vectorizing, we only handle real values that were defined
  /// inside the loop, and we should have one value for each predecessor of
  /// its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable, then we extend it to N, N+1,
  /// ...; this is needed because each iteration in the loop corresponds to a
  /// SIMD element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We have already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks.
  /// Use this for instructions that are *cloned* into the vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle block between the vector and the scalar loops.
  BasicBlock *LoopMiddleBlock;

  /// The exit block of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
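///
/// A rough sketch of the intended structure (illustrative only; block names
/// and the exact placement of the checks are simplified):
///
///   iteration-count / safety checks
///           |
///           v
///     main vector loop
///           |
///           v
///   epilogue iteration-count check
///        /          \
///       v            v
///   vector epilogue loop --> scalar epilogue loop
///
/// The first pass emits the checks and the main vector loop; the second pass
/// completes this skeleton with the vector epilogue and the scalar remainder.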
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetLibraryInfo *TLI,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 OptimizationRemarkEmitter *ORE,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationLegality *LVL,
                                 llvm::LoopVectorizationCostModel *CM,
                                 BlockFrequencyInfo *BFI,
                                 ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark.
/// \return the remark object that can be streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedups/slowdowns due to the supported instruction set. We use
/// the TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for the user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; its
  /// form after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
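  ///
  /// As a rough illustration (not an exhaustive rule set): a load of a[i]
  /// inside the loop is typically mapped to CM_Widen (unit stride), a load of
  /// a[n - i] to CM_Widen_Reverse, members of a strided group such as a[2*i]
  /// and a[2*i+1] to CM_Interleave, a load of a[b[i]] to CM_GatherScatter,
  /// and an access that is cheaper as scalars to CM_Scalarize.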
1273 void setCostBasedWideningDecision(ElementCount VF); 1274 1275 /// A struct that represents some properties of the register usage 1276 /// of a loop. 1277 struct RegisterUsage { 1278 /// Holds the number of loop invariant values that are used in the loop. 1279 /// The key is ClassID of target-provided register class. 1280 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1281 /// Holds the maximum number of concurrent live intervals in the loop. 1282 /// The key is ClassID of target-provided register class. 1283 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1284 }; 1285 1286 /// \return Returns information about the register usages of the loop for the 1287 /// given vectorization factors. 1288 SmallVector<RegisterUsage, 8> 1289 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1290 1291 /// Collect values we want to ignore in the cost model. 1292 void collectValuesToIgnore(); 1293 1294 /// Split reductions into those that happen in the loop, and those that happen 1295 /// outside. In loop reductions are collected into InLoopReductionChains. 1296 void collectInLoopReductions(); 1297 1298 /// \returns The smallest bitwidth each instruction can be represented with. 1299 /// The vector equivalents of these instructions should be truncated to this 1300 /// type. 1301 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1302 return MinBWs; 1303 } 1304 1305 /// \returns True if it is more profitable to scalarize instruction \p I for 1306 /// vectorization factor \p VF. 1307 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1308 assert(VF.isVector() && 1309 "Profitable to scalarize relevant only for VF > 1."); 1310 1311 // Cost model is not run in the VPlan-native path - return conservative 1312 // result until this changes. 1313 if (EnableVPlanNativePath) 1314 return false; 1315 1316 auto Scalars = InstsToScalarize.find(VF); 1317 assert(Scalars != InstsToScalarize.end() && 1318 "VF not yet analyzed for scalarization profitability"); 1319 return Scalars->second.find(I) != Scalars->second.end(); 1320 } 1321 1322 /// Returns true if \p I is known to be uniform after vectorization. 1323 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1324 if (VF.isScalar()) 1325 return true; 1326 1327 // Cost model is not run in the VPlan-native path - return conservative 1328 // result until this changes. 1329 if (EnableVPlanNativePath) 1330 return false; 1331 1332 auto UniformsPerVF = Uniforms.find(VF); 1333 assert(UniformsPerVF != Uniforms.end() && 1334 "VF not yet analyzed for uniformity"); 1335 return UniformsPerVF->second.count(I); 1336 } 1337 1338 /// Returns true if \p I is known to be scalar after vectorization. 1339 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1340 if (VF.isScalar()) 1341 return true; 1342 1343 // Cost model is not run in the VPlan-native path - return conservative 1344 // result until this changes. 1345 if (EnableVPlanNativePath) 1346 return false; 1347 1348 auto ScalarsPerVF = Scalars.find(VF); 1349 assert(ScalarsPerVF != Scalars.end() && 1350 "Scalar values are not calculated for VF"); 1351 return ScalarsPerVF->second.count(I); 1352 } 1353 1354 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1355 /// for vectorization factor \p VF. 
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1510 bool isScalarWithPredication(Instruction *I, 1511 ElementCount VF = ElementCount::getFixed(1)); 1512 1513 // Returns true if \p I is an instruction that will be predicated either 1514 // through scalar predication or masked load/store or masked gather/scatter. 1515 // Superset of instructions that return true for isScalarWithPredication. 1516 bool isPredicatedInst(Instruction *I) { 1517 if (!blockNeedsPredication(I->getParent())) 1518 return false; 1519 // Loads and stores that need some form of masked operation are predicated 1520 // instructions. 1521 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1522 return Legal->isMaskRequired(I); 1523 return isScalarWithPredication(I); 1524 } 1525 1526 /// Returns true if \p I is a memory instruction with consecutive memory 1527 /// access that can be widened. 1528 bool 1529 memoryInstructionCanBeWidened(Instruction *I, 1530 ElementCount VF = ElementCount::getFixed(1)); 1531 1532 /// Returns true if \p I is a memory instruction in an interleaved-group 1533 /// of memory accesses that can be vectorized with wide vector loads/stores 1534 /// and shuffles. 1535 bool 1536 interleavedAccessCanBeWidened(Instruction *I, 1537 ElementCount VF = ElementCount::getFixed(1)); 1538 1539 /// Check if \p Instr belongs to any interleaved access group. 1540 bool isAccessInterleaved(Instruction *Instr) { 1541 return InterleaveInfo.isInterleaved(Instr); 1542 } 1543 1544 /// Get the interleaved access group that \p Instr belongs to. 1545 const InterleaveGroup<Instruction> * 1546 getInterleavedAccessGroup(Instruction *Instr) { 1547 return InterleaveInfo.getInterleaveGroup(Instr); 1548 } 1549 1550 /// Returns true if an interleaved group requires a scalar iteration 1551 /// to handle accesses with gaps, and there is nothing preventing us from 1552 /// creating a scalar epilogue. 1553 bool requiresScalarEpilogue() const { 1554 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); 1555 } 1556 1557 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1558 /// loop hint annotation. 1559 bool isScalarEpilogueAllowed() const { 1560 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1561 } 1562 1563 /// Returns true if all loop blocks should be masked to fold tail loop. 1564 bool foldTailByMasking() const { return FoldTailByMasking; } 1565 1566 bool blockNeedsPredication(BasicBlock *BB) { 1567 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1568 } 1569 1570 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1571 /// nodes to the chain of instructions representing the reductions. Uses a 1572 /// MapVector to ensure deterministic iteration order. 1573 using ReductionChainMap = 1574 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1575 1576 /// Return the chain of instructions representing an inloop reduction. 1577 const ReductionChainMap &getInLoopReductionChains() const { 1578 return InLoopReductionChains; 1579 } 1580 1581 /// Returns true if the Phi is part of an inloop reduction. 1582 bool isInLoopReduction(PHINode *Phi) const { 1583 return InLoopReductionChains.count(Phi); 1584 } 1585 1586 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1587 /// with factor VF. Return the cost of the instruction, including 1588 /// scalarization overhead if it's needed. 1589 unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1590 1591 /// Estimate cost of a call instruction CI if it were vectorized with factor 1592 /// VF. 
Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
                             bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration.
Examples of 1732 /// uniform instructions include pointer operands of consecutive or 1733 /// interleaved memory accesses. Note that although uniformity implies an 1734 /// instruction will be scalar, the reverse is not true. In general, a 1735 /// scalarized instruction will be represented by VF scalar values in the 1736 /// vectorized loop, each corresponding to an iteration of the original 1737 /// scalar loop. 1738 void collectLoopUniforms(ElementCount VF); 1739 1740 /// Collect the instructions that are scalar after vectorization. An 1741 /// instruction is scalar if it is known to be uniform or will be scalarized 1742 /// during vectorization. Non-uniform scalarized instructions will be 1743 /// represented by VF values in the vectorized loop, each corresponding to an 1744 /// iteration of the original scalar loop. 1745 void collectLoopScalars(ElementCount VF); 1746 1747 /// Keeps cost model vectorization decision and cost for instructions. 1748 /// Right now it is used for memory instructions only. 1749 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1750 std::pair<InstWidening, unsigned>>; 1751 1752 DecisionList WideningDecisions; 1753 1754 /// Returns true if \p V is expected to be vectorized and it needs to be 1755 /// extracted. 1756 bool needsExtract(Value *V, ElementCount VF) const { 1757 Instruction *I = dyn_cast<Instruction>(V); 1758 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1759 TheLoop->isLoopInvariant(I)) 1760 return false; 1761 1762 // Assume we can vectorize V (and hence we need extraction) if the 1763 // scalars are not computed yet. This can happen, because it is called 1764 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1765 // the scalars are collected. That should be a safe assumption in most 1766 // cases, because we check if the operands have vectorizable types 1767 // beforehand in LoopVectorizationLegality. 1768 return Scalars.find(VF) == Scalars.end() || 1769 !isScalarAfterVectorization(I, VF); 1770 }; 1771 1772 /// Returns a range containing only operands needing to be extracted. 1773 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1774 ElementCount VF) { 1775 return SmallVector<Value *, 4>(make_filter_range( 1776 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1777 } 1778 1779 /// Determines if we have the infrastructure to vectorize loop \p L and its 1780 /// epilogue, assuming the main loop is vectorized by \p VF. 1781 bool isCandidateForEpilogueVectorization(const Loop &L, 1782 const ElementCount VF) const; 1783 1784 /// Returns true if epilogue vectorization is considered profitable, and 1785 /// false otherwise. 1786 /// \p VF is the vectorization factor chosen for the original loop. 1787 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1788 1789 public: 1790 /// The loop that we evaluate. 1791 Loop *TheLoop; 1792 1793 /// Predicated scalar evolution analysis. 1794 PredicatedScalarEvolution &PSE; 1795 1796 /// Loop Info analysis. 1797 LoopInfo *LI; 1798 1799 /// Vectorization legality. 1800 LoopVectorizationLegality *Legal; 1801 1802 /// Vector target information. 1803 const TargetTransformInfo &TTI; 1804 1805 /// Target Library Info. 1806 const TargetLibraryInfo *TLI; 1807 1808 /// Demanded bits analysis. 1809 DemandedBits *DB; 1810 1811 /// Assumption cache. 1812 AssumptionCache *AC; 1813 1814 /// Interface to emit optimization remarks. 
1815 OptimizationRemarkEmitter *ORE; 1816 1817 const Function *TheFunction; 1818 1819 /// Loop Vectorize Hint. 1820 const LoopVectorizeHints *Hints; 1821 1822 /// The interleave access information contains groups of interleaved accesses 1823 /// with the same stride and close to each other. 1824 InterleavedAccessInfo &InterleaveInfo; 1825 1826 /// Values to ignore in the cost model. 1827 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1828 1829 /// Values to ignore in the cost model when VF > 1. 1830 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1831 1832 /// Profitable vector factors. 1833 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1834 }; 1835 1836 } // end namespace llvm 1837 1838 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1839 // vectorization. The loop needs to be annotated with #pragma omp simd 1840 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1841 // vector length information is not provided, vectorization is not considered 1842 // explicit. Interleave hints are not allowed either. These limitations will be 1843 // relaxed in the future. 1844 // Please, note that we are currently forced to abuse the pragma 'clang 1845 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1846 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1847 // provides *explicit vectorization hints* (LV can bypass legal checks and 1848 // assume that vectorization is legal). However, both hints are implemented 1849 // using the same metadata (llvm.loop.vectorize, processed by 1850 // LoopVectorizeHints). This will be fixed in the future when the native IR 1851 // representation for pragma 'omp simd' is introduced. 1852 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1853 OptimizationRemarkEmitter *ORE) { 1854 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1855 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1856 1857 // Only outer loops with an explicit vectorization hint are supported. 1858 // Unannotated outer loops are ignored. 1859 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1860 return false; 1861 1862 Function *Fn = OuterLp->getHeader()->getParent(); 1863 if (!Hints.allowVectorization(Fn, OuterLp, 1864 true /*VectorizeOnlyWhenForced*/)) { 1865 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1866 return false; 1867 } 1868 1869 if (Hints.getInterleave() > 1) { 1870 // TODO: Interleave support is future work. 1871 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1872 "outer loops.\n"); 1873 Hints.emitRemarkWithHints(); 1874 return false; 1875 } 1876 1877 return true; 1878 } 1879 1880 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1881 OptimizationRemarkEmitter *ORE, 1882 SmallVectorImpl<Loop *> &V) { 1883 // Collect inner loops and outer loops without irreducible control flow. For 1884 // now, only collect outer loops that have explicit vectorization hints. If we 1885 // are stress testing the VPlan H-CFG construction, we collect the outermost 1886 // loop of every loop nest. 1887 if (L.isInnermost() || VPlanBuildStressTest || 1888 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1889 LoopBlocksRPO RPOT(&L); 1890 RPOT.perform(LI); 1891 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1892 V.push_back(&L); 1893 // TODO: Collect inner loops inside marked outer loops in case 1894 // vectorization fails for the outer loop. 
Do not invoke 1895 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1896 // already known to be reducible. We can use an inherited attribute for 1897 // that. 1898 return; 1899 } 1900 } 1901 for (Loop *InnerL : L) 1902 collectSupportedLoops(*InnerL, LI, ORE, V); 1903 } 1904 1905 namespace { 1906 1907 /// The LoopVectorize Pass. 1908 struct LoopVectorize : public FunctionPass { 1909 /// Pass identification, replacement for typeid 1910 static char ID; 1911 1912 LoopVectorizePass Impl; 1913 1914 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1915 bool VectorizeOnlyWhenForced = false) 1916 : FunctionPass(ID), 1917 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1918 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1919 } 1920 1921 bool runOnFunction(Function &F) override { 1922 if (skipFunction(F)) 1923 return false; 1924 1925 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1926 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1927 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1928 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1929 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1930 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1931 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1932 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1933 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1934 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1935 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1936 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1937 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1938 1939 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1940 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1941 1942 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1943 GetLAA, *ORE, PSI).MadeAnyChange; 1944 } 1945 1946 void getAnalysisUsage(AnalysisUsage &AU) const override { 1947 AU.addRequired<AssumptionCacheTracker>(); 1948 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1949 AU.addRequired<DominatorTreeWrapperPass>(); 1950 AU.addRequired<LoopInfoWrapperPass>(); 1951 AU.addRequired<ScalarEvolutionWrapperPass>(); 1952 AU.addRequired<TargetTransformInfoWrapperPass>(); 1953 AU.addRequired<AAResultsWrapperPass>(); 1954 AU.addRequired<LoopAccessLegacyAnalysis>(); 1955 AU.addRequired<DemandedBitsWrapperPass>(); 1956 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1957 AU.addRequired<InjectTLIMappingsLegacy>(); 1958 1959 // We currently do not preserve loopinfo/dominator analyses with outer loop 1960 // vectorization. Until this is addressed, mark these analyses as preserved 1961 // only for non-VPlan-native path. 1962 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1963 if (!EnableVPlanNativePath) { 1964 AU.addPreserved<LoopInfoWrapperPass>(); 1965 AU.addPreserved<DominatorTreeWrapperPass>(); 1966 } 1967 1968 AU.addPreserved<BasicAAWrapperPass>(); 1969 AU.addPreserved<GlobalsAAWrapperPass>(); 1970 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1971 } 1972 }; 1973 1974 } // end anonymous namespace 1975 1976 //===----------------------------------------------------------------------===// 1977 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1978 // LoopVectorizationCostModel and LoopVectorizationPlanner. 
1979 //===----------------------------------------------------------------------===// 1980 1981 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1982 // We need to place the broadcast of invariant variables outside the loop, 1983 // but only if it's proven safe to do so. Else, broadcast will be inside 1984 // vector loop body. 1985 Instruction *Instr = dyn_cast<Instruction>(V); 1986 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1987 (!Instr || 1988 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1989 // Place the code for broadcasting invariant variables in the new preheader. 1990 IRBuilder<>::InsertPointGuard Guard(Builder); 1991 if (SafeToHoist) 1992 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1993 1994 // Broadcast the scalar into all locations in the vector. 1995 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1996 1997 return Shuf; 1998 } 1999 2000 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2001 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 2002 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2003 "Expected either an induction phi-node or a truncate of it!"); 2004 Value *Start = II.getStartValue(); 2005 2006 // Construct the initial value of the vector IV in the vector loop preheader 2007 auto CurrIP = Builder.saveIP(); 2008 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2009 if (isa<TruncInst>(EntryVal)) { 2010 assert(Start->getType()->isIntegerTy() && 2011 "Truncation requires an integer type"); 2012 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2013 Step = Builder.CreateTrunc(Step, TruncType); 2014 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2015 } 2016 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2017 Value *SteppedStart = 2018 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2019 2020 // We create vector phi nodes for both integer and floating-point induction 2021 // variables. Here, we determine the kind of arithmetic we will perform. 2022 Instruction::BinaryOps AddOp; 2023 Instruction::BinaryOps MulOp; 2024 if (Step->getType()->isIntegerTy()) { 2025 AddOp = Instruction::Add; 2026 MulOp = Instruction::Mul; 2027 } else { 2028 AddOp = II.getInductionOpcode(); 2029 MulOp = Instruction::FMul; 2030 } 2031 2032 // Multiply the vectorization factor by the step using integer or 2033 // floating-point arithmetic as appropriate. 2034 Value *ConstVF = 2035 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2036 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2037 2038 // Create a vector splat to use in the induction update. 2039 // 2040 // FIXME: If the step is non-constant, we create the vector splat with 2041 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2042 // handle a constant vector splat. 2043 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2044 Value *SplatVF = isa<Constant>(Mul) 2045 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2046 : Builder.CreateVectorSplat(VF, Mul); 2047 Builder.restoreIP(CurrIP); 2048 2049 // We may need to add the step a number of times, depending on the unroll 2050 // factor. The last of those goes into the PHI. 
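  // For illustration, assuming an i32 induction starting at 0 with step 1,
  // VF = 4 and UF = 2 (the value names below are only a sketch), the code
  // built here has the shape:
  //   %vec.ind      = phi <4 x i32> [ <0,1,2,3>, %preheader ], [ %vec.ind.next, %body ]
  //   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>   ; part 1
  //   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>  ; back-edge value
  // Part 0 uses %vec.ind directly, and %vec.ind.next is moved to the latch to
  // feed the PHI.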
2051 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2052 &*LoopVectorBody->getFirstInsertionPt()); 2053 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2054 Instruction *LastInduction = VecInd; 2055 for (unsigned Part = 0; Part < UF; ++Part) { 2056 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 2057 2058 if (isa<TruncInst>(EntryVal)) 2059 addMetadata(LastInduction, EntryVal); 2060 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 2061 2062 LastInduction = cast<Instruction>(addFastMathFlag( 2063 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 2064 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2065 } 2066 2067 // Move the last step to the end of the latch block. This ensures consistent 2068 // placement of all induction updates. 2069 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2070 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2071 auto *ICmp = cast<Instruction>(Br->getCondition()); 2072 LastInduction->moveBefore(ICmp); 2073 LastInduction->setName("vec.ind.next"); 2074 2075 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2076 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2077 } 2078 2079 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2080 return Cost->isScalarAfterVectorization(I, VF) || 2081 Cost->isProfitableToScalarize(I, VF); 2082 } 2083 2084 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2085 if (shouldScalarizeInstruction(IV)) 2086 return true; 2087 auto isScalarInst = [&](User *U) -> bool { 2088 auto *I = cast<Instruction>(U); 2089 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2090 }; 2091 return llvm::any_of(IV->users(), isScalarInst); 2092 } 2093 2094 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2095 const InductionDescriptor &ID, const Instruction *EntryVal, 2096 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 2097 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2098 "Expected either an induction phi-node or a truncate of it!"); 2099 2100 // This induction variable is not the phi from the original loop but the 2101 // newly-created IV based on the proof that casted Phi is equal to the 2102 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2103 // re-uses the same InductionDescriptor that original IV uses but we don't 2104 // have to do any recording in this case - that is done when original IV is 2105 // processed. 2106 if (isa<TruncInst>(EntryVal)) 2107 return; 2108 2109 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2110 if (Casts.empty()) 2111 return; 2112 // Only the first Cast instruction in the Casts vector is of interest. 2113 // The rest of the Casts (if exist) have no uses outside the 2114 // induction update chain itself. 
2115 Instruction *CastInst = *Casts.begin(); 2116 if (Lane < UINT_MAX) 2117 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2118 else 2119 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2120 } 2121 2122 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 2123 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2124 "Primary induction variable must have an integer type"); 2125 2126 auto II = Legal->getInductionVars().find(IV); 2127 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2128 2129 auto ID = II->second; 2130 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2131 2132 // The value from the original loop to which we are mapping the new induction 2133 // variable. 2134 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2135 2136 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2137 2138 // Generate code for the induction step. Note that induction steps are 2139 // required to be loop-invariant 2140 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2141 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2142 "Induction step should be loop invariant"); 2143 if (PSE.getSE()->isSCEVable(IV->getType())) { 2144 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2145 return Exp.expandCodeFor(Step, Step->getType(), 2146 LoopVectorPreHeader->getTerminator()); 2147 } 2148 return cast<SCEVUnknown>(Step)->getValue(); 2149 }; 2150 2151 // The scalar value to broadcast. This is derived from the canonical 2152 // induction variable. If a truncation type is given, truncate the canonical 2153 // induction variable and step. Otherwise, derive these values from the 2154 // induction descriptor. 2155 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2156 Value *ScalarIV = Induction; 2157 if (IV != OldInduction) { 2158 ScalarIV = IV->getType()->isIntegerTy() 2159 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2160 : Builder.CreateCast(Instruction::SIToFP, Induction, 2161 IV->getType()); 2162 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2163 ScalarIV->setName("offset.idx"); 2164 } 2165 if (Trunc) { 2166 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2167 assert(Step->getType()->isIntegerTy() && 2168 "Truncation requires an integer step"); 2169 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2170 Step = Builder.CreateTrunc(Step, TruncType); 2171 } 2172 return ScalarIV; 2173 }; 2174 2175 // Create the vector values from the scalar IV, in the absence of creating a 2176 // vector IV. 2177 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2178 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2179 for (unsigned Part = 0; Part < UF; ++Part) { 2180 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2181 Value *EntryPart = 2182 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2183 ID.getInductionOpcode()); 2184 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2185 if (Trunc) 2186 addMetadata(EntryPart, Trunc); 2187 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2188 } 2189 }; 2190 2191 // Now do the actual transformations, and start with creating the step value. 
2192 Value *Step = CreateStepValue(ID.getStep()); 2193 if (VF.isZero() || VF.isScalar()) { 2194 Value *ScalarIV = CreateScalarIV(Step); 2195 CreateSplatIV(ScalarIV, Step); 2196 return; 2197 } 2198 2199 // Determine if we want a scalar version of the induction variable. This is 2200 // true if the induction variable itself is not widened, or if it has at 2201 // least one user in the loop that is not widened. 2202 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2203 if (!NeedsScalarIV) { 2204 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2205 return; 2206 } 2207 2208 // Try to create a new independent vector induction variable. If we can't 2209 // create the phi node, we will splat the scalar induction variable in each 2210 // loop iteration. 2211 if (!shouldScalarizeInstruction(EntryVal)) { 2212 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 2213 Value *ScalarIV = CreateScalarIV(Step); 2214 // Create scalar steps that can be used by instructions we will later 2215 // scalarize. Note that the addition of the scalar steps will not increase 2216 // the number of instructions in the loop in the common case prior to 2217 // InstCombine. We will be trading one vector extract for each scalar step. 2218 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2219 return; 2220 } 2221 2222 // All IV users are scalar instructions, so only emit a scalar IV, not a 2223 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2224 // predicate used by the masked loads/stores. 2225 Value *ScalarIV = CreateScalarIV(Step); 2226 if (!Cost->isScalarEpilogueAllowed()) 2227 CreateSplatIV(ScalarIV, Step); 2228 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2229 } 2230 2231 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2232 Instruction::BinaryOps BinOp) { 2233 // Create and check the types. 2234 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2235 int VLen = ValVTy->getNumElements(); 2236 2237 Type *STy = Val->getType()->getScalarType(); 2238 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2239 "Induction Step must be an integer or FP"); 2240 assert(Step->getType() == STy && "Step has wrong type"); 2241 2242 SmallVector<Constant *, 8> Indices; 2243 2244 if (STy->isIntegerTy()) { 2245 // Create a vector of consecutive numbers from zero to VF. 2246 for (int i = 0; i < VLen; ++i) 2247 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2248 2249 // Add the consecutive indices to the vector value. 2250 Constant *Cv = ConstantVector::get(Indices); 2251 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2252 Step = Builder.CreateVectorSplat(VLen, Step); 2253 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2254 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2255 // which can be found from the original scalar operations. 2256 Step = Builder.CreateMul(Cv, Step); 2257 return Builder.CreateAdd(Val, Step, "induction"); 2258 } 2259 2260 // Floating point induction. 2261 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2262 "Binary Opcode should be specified for FP induction"); 2263 // Create a vector of consecutive numbers from zero to VF. 2264 for (int i = 0; i < VLen; ++i) 2265 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2266 2267 // Add the consecutive indices to the vector value. 
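  // For example, assuming VF = 4, StartIdx = 0 and an fadd induction with
  // Step = 2.0: Cv below is <0.0, 1.0, 2.0, 3.0>, the splatted step is
  // <2.0, 2.0, 2.0, 2.0>, and the result is Val + <0.0, 2.0, 4.0, 6.0>.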
2268 Constant *Cv = ConstantVector::get(Indices); 2269 2270 Step = Builder.CreateVectorSplat(VLen, Step); 2271 2272 // Floating point operations had to be 'fast' to enable the induction. 2273 FastMathFlags Flags; 2274 Flags.setFast(); 2275 2276 Value *MulOp = Builder.CreateFMul(Cv, Step); 2277 if (isa<Instruction>(MulOp)) 2278 // Have to check, MulOp may be a constant 2279 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2280 2281 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2282 if (isa<Instruction>(BOp)) 2283 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2284 return BOp; 2285 } 2286 2287 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2288 Instruction *EntryVal, 2289 const InductionDescriptor &ID) { 2290 // We shouldn't have to build scalar steps if we aren't vectorizing. 2291 assert(VF.isVector() && "VF should be greater than one"); 2292 // Get the value type and ensure it and the step have the same integer type. 2293 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2294 assert(ScalarIVTy == Step->getType() && 2295 "Val and Step should have the same type"); 2296 2297 // We build scalar steps for both integer and floating-point induction 2298 // variables. Here, we determine the kind of arithmetic we will perform. 2299 Instruction::BinaryOps AddOp; 2300 Instruction::BinaryOps MulOp; 2301 if (ScalarIVTy->isIntegerTy()) { 2302 AddOp = Instruction::Add; 2303 MulOp = Instruction::Mul; 2304 } else { 2305 AddOp = ID.getInductionOpcode(); 2306 MulOp = Instruction::FMul; 2307 } 2308 2309 // Determine the number of scalars we need to generate for each unroll 2310 // iteration. If EntryVal is uniform, we only need to generate the first 2311 // lane. Otherwise, we generate all VF values. 2312 unsigned Lanes = 2313 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2314 ? 1 2315 : VF.getKnownMinValue(); 2316 assert((!VF.isScalable() || Lanes == 1) && 2317 "Should never scalarize a scalable vector"); 2318 // Compute the scalar steps and save the results in VectorLoopValueMap. 2319 for (unsigned Part = 0; Part < UF; ++Part) { 2320 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2321 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2322 ScalarIVTy->getScalarSizeInBits()); 2323 Value *StartIdx = 2324 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2325 if (ScalarIVTy->isFloatingPointTy()) 2326 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2327 StartIdx = addFastMathFlag(Builder.CreateBinOp( 2328 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); 2329 // The step returned by `createStepForVF` is a runtime-evaluated value 2330 // when VF is scalable. Otherwise, it should be folded into a Constant. 
2331 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2332 "Expected StartIdx to be folded to a constant when VF is not " 2333 "scalable"); 2334 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2335 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2336 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2337 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2338 } 2339 } 2340 } 2341 2342 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2343 assert(V != Induction && "The new induction variable should not be used."); 2344 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2345 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2346 2347 // If we have a stride that is replaced by one, do it here. Defer this for 2348 // the VPlan-native path until we start running Legal checks in that path. 2349 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2350 V = ConstantInt::get(V->getType(), 1); 2351 2352 // If we have a vector mapped to this value, return it. 2353 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2354 return VectorLoopValueMap.getVectorValue(V, Part); 2355 2356 // If the value has not been vectorized, check if it has been scalarized 2357 // instead. If it has been scalarized, and we actually need the value in 2358 // vector form, we will construct the vector values on demand. 2359 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2360 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2361 2362 // If we've scalarized a value, that value should be an instruction. 2363 auto *I = cast<Instruction>(V); 2364 2365 // If we aren't vectorizing, we can just copy the scalar map values over to 2366 // the vector map. 2367 if (VF.isScalar()) { 2368 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2369 return ScalarValue; 2370 } 2371 2372 // Get the last scalar instruction we generated for V and Part. If the value 2373 // is known to be uniform after vectorization, this corresponds to lane zero 2374 // of the Part unroll iteration. Otherwise, the last instruction is the one 2375 // we created for the last vector lane of the Part unroll iteration. 2376 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2377 ? 0 2378 : VF.getKnownMinValue() - 1; 2379 assert((!VF.isScalable() || LastLane == 0) && 2380 "Scalable vectorization can't lead to any scalarized values."); 2381 auto *LastInst = cast<Instruction>( 2382 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2383 2384 // Set the insert point after the last scalarized instruction. This ensures 2385 // the insertelement sequence will directly follow the scalar definitions. 2386 auto OldIP = Builder.saveIP(); 2387 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2388 Builder.SetInsertPoint(&*NewIP); 2389 2390 // However, if we are vectorizing, we need to construct the vector values. 2391 // If the value is known to be uniform after vectorization, we can just 2392 // broadcast the scalar value corresponding to lane zero for each unroll 2393 // iteration. Otherwise, we construct the vector values using insertelement 2394 // instructions. Since the resulting vectors are stored in 2395 // VectorLoopValueMap, we will only generate the insertelements once. 
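    // For illustration, in the non-uniform case with VF = 4 (value names here
    // are only a sketch), the packing below produces a chain of the form:
    //   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
    //   %v1 = insertelement <4 x i32> %v0,   i32 %s1, i32 1
    //   %v2 = insertelement <4 x i32> %v1,   i32 %s2, i32 2
    //   %v3 = insertelement <4 x i32> %v2,   i32 %s3, i32 3
    // where %s0..%s3 are the previously generated scalar values for lanes 0..3.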
2396 Value *VectorValue = nullptr; 2397 if (Cost->isUniformAfterVectorization(I, VF)) { 2398 VectorValue = getBroadcastInstrs(ScalarValue); 2399 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2400 } else { 2401 // Initialize packing with insertelements to start from undef. 2402 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2403 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2404 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2405 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2406 packScalarIntoVectorValue(V, {Part, Lane}); 2407 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2408 } 2409 Builder.restoreIP(OldIP); 2410 return VectorValue; 2411 } 2412 2413 // If this scalar is unknown, assume that it is a constant or that it is 2414 // loop invariant. Broadcast V and save the value for future uses. 2415 Value *B = getBroadcastInstrs(V); 2416 VectorLoopValueMap.setVectorValue(V, Part, B); 2417 return B; 2418 } 2419 2420 Value * 2421 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2422 const VPIteration &Instance) { 2423 // If the value is not an instruction contained in the loop, it should 2424 // already be scalar. 2425 if (OrigLoop->isLoopInvariant(V)) 2426 return V; 2427 2428 assert(Instance.Lane > 0 2429 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2430 : true && "Uniform values only have lane zero"); 2431 2432 // If the value from the original loop has not been vectorized, it is 2433 // represented by UF x VF scalar values in the new loop. Return the requested 2434 // scalar value. 2435 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2436 return VectorLoopValueMap.getScalarValue(V, Instance); 2437 2438 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2439 // for the given unroll part. If this entry is not a vector type (i.e., the 2440 // vectorization factor is one), there is no need to generate an 2441 // extractelement instruction. 2442 auto *U = getOrCreateVectorValue(V, Instance.Part); 2443 if (!U->getType()->isVectorTy()) { 2444 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2445 return U; 2446 } 2447 2448 // Otherwise, the value from the original loop has been vectorized and is 2449 // represented by UF vector values. Extract and return the requested scalar 2450 // value from the appropriate vector lane. 
2451 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2452 } 2453 2454 void InnerLoopVectorizer::packScalarIntoVectorValue( 2455 Value *V, const VPIteration &Instance) { 2456 assert(V != Induction && "The new induction variable should not be used."); 2457 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2458 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2459 2460 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2461 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2462 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2463 Builder.getInt32(Instance.Lane)); 2464 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2465 } 2466 2467 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2468 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2469 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2470 SmallVector<int, 8> ShuffleMask; 2471 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2472 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2473 2474 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2475 } 2476 2477 // Return whether we allow using masked interleave-groups (for dealing with 2478 // strided loads/stores that reside in predicated blocks, or for dealing 2479 // with gaps). 2480 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2481 // If an override option has been passed in for interleaved accesses, use it. 2482 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2483 return EnableMaskedInterleavedMemAccesses; 2484 2485 return TTI.enableMaskedInterleavedAccessVectorization(); 2486 } 2487 2488 // Try to vectorize the interleave group that \p Instr belongs to. 2489 // 2490 // E.g. Translate following interleaved load group (factor = 3): 2491 // for (i = 0; i < N; i+=3) { 2492 // R = Pic[i]; // Member of index 0 2493 // G = Pic[i+1]; // Member of index 1 2494 // B = Pic[i+2]; // Member of index 2 2495 // ... // do something to R, G, B 2496 // } 2497 // To: 2498 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2499 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2500 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2501 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2502 // 2503 // Or translate following interleaved store group (factor = 3): 2504 // for (i = 0; i < N; i+=3) { 2505 // ... do something to R, G, B 2506 // Pic[i] = R; // Member of index 0 2507 // Pic[i+1] = G; // Member of index 1 2508 // Pic[i+2] = B; // Member of index 2 2509 // } 2510 // To: 2511 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2512 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2513 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2514 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2515 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2516 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2517 const InterleaveGroup<Instruction> *Group, VPTransformState &State, 2518 VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) { 2519 Instruction *Instr = Group->getInsertPos(); 2520 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2521 2522 // Prepare for the vector type of the interleaved load/store. 
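  // For example, assuming i32 accesses with an interleave factor of 3 and
  // VF = 4, VecTy below is <12 x i32>, matching the wide load/store shown in
  // the example above.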
2523 Type *ScalarTy = getMemInstValueType(Instr); 2524 unsigned InterleaveFactor = Group->getFactor(); 2525 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2526 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2527 2528 // Prepare for the new pointers. 2529 SmallVector<Value *, 2> AddrParts; 2530 unsigned Index = Group->getIndex(Instr); 2531 2532 // TODO: extend the masked interleaved-group support to reversed access. 2533 assert((!BlockInMask || !Group->isReverse()) && 2534 "Reversed masked interleave-group not supported."); 2535 2536 // If the group is reverse, adjust the index to refer to the last vector lane 2537 // instead of the first. We adjust the index from the first vector lane, 2538 // rather than directly getting the pointer for lane VF - 1, because the 2539 // pointer operand of the interleaved access is supposed to be uniform. For 2540 // uniform instructions, we're only required to generate a value for the 2541 // first vector lane in each unroll iteration. 2542 assert(!VF.isScalable() && 2543 "scalable vector reverse operation is not implemented"); 2544 if (Group->isReverse()) 2545 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2546 2547 for (unsigned Part = 0; Part < UF; Part++) { 2548 Value *AddrPart = State.get(Addr, {Part, 0}); 2549 setDebugLocFromInst(Builder, AddrPart); 2550 2551 // Notice current instruction could be any index. Need to adjust the address 2552 // to the member of index 0. 2553 // 2554 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2555 // b = A[i]; // Member of index 0 2556 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2557 // 2558 // E.g. A[i+1] = a; // Member of index 1 2559 // A[i] = b; // Member of index 0 2560 // A[i+2] = c; // Member of index 2 (Current instruction) 2561 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2562 2563 bool InBounds = false; 2564 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2565 InBounds = gep->isInBounds(); 2566 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2567 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2568 2569 // Cast to the vector pointer type. 2570 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2571 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2572 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2573 } 2574 2575 setDebugLocFromInst(Builder, Instr); 2576 Value *UndefVec = UndefValue::get(VecTy); 2577 2578 Value *MaskForGaps = nullptr; 2579 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2580 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2581 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2582 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2583 } 2584 2585 // Vectorize the interleaved load group. 2586 if (isa<LoadInst>(Instr)) { 2587 // For each unroll part, create a wide load for the group. 
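    // For illustration, assuming an interleave factor of 3 and VF = 4: a block
    // mask <m0, m1, m2, m3> is replicated per member into
    // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3> and, if a gap mask is
    // also required, the two are combined with an 'and' before the masked load.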
2588 SmallVector<Value *, 2> NewLoads; 2589 for (unsigned Part = 0; Part < UF; Part++) { 2590 Instruction *NewLoad; 2591 if (BlockInMask || MaskForGaps) { 2592 assert(useMaskedInterleavedAccesses(*TTI) && 2593 "masked interleaved groups are not allowed."); 2594 Value *GroupMask = MaskForGaps; 2595 if (BlockInMask) { 2596 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2597 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2598 Value *ShuffledMask = Builder.CreateShuffleVector( 2599 BlockInMaskPart, 2600 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2601 "interleaved.mask"); 2602 GroupMask = MaskForGaps 2603 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2604 MaskForGaps) 2605 : ShuffledMask; 2606 } 2607 NewLoad = 2608 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2609 GroupMask, UndefVec, "wide.masked.vec"); 2610 } 2611 else 2612 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2613 Group->getAlign(), "wide.vec"); 2614 Group->addMetadata(NewLoad); 2615 NewLoads.push_back(NewLoad); 2616 } 2617 2618 // For each member in the group, shuffle out the appropriate data from the 2619 // wide loads. 2620 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2621 Instruction *Member = Group->getMember(I); 2622 2623 // Skip the gaps in the group. 2624 if (!Member) 2625 continue; 2626 2627 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2628 auto StrideMask = 2629 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2630 for (unsigned Part = 0; Part < UF; Part++) { 2631 Value *StridedVec = Builder.CreateShuffleVector( 2632 NewLoads[Part], StrideMask, "strided.vec"); 2633 2634 // If this member has different type, cast the result type. 2635 if (Member->getType() != ScalarTy) { 2636 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2637 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2638 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2639 } 2640 2641 if (Group->isReverse()) 2642 StridedVec = reverseVector(StridedVec); 2643 2644 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2645 } 2646 } 2647 return; 2648 } 2649 2650 // The sub vector type for current instruction. 2651 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2652 auto *SubVT = VectorType::get(ScalarTy, VF); 2653 2654 // Vectorize the interleaved store group. 2655 for (unsigned Part = 0; Part < UF; Part++) { 2656 // Collect the stored vector from each member. 2657 SmallVector<Value *, 4> StoredVecs; 2658 for (unsigned i = 0; i < InterleaveFactor; i++) { 2659 // Interleaved store group doesn't allow a gap, so each index has a member 2660 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2661 2662 Value *StoredVec = State.get(StoredValues[i], Part); 2663 2664 if (Group->isReverse()) 2665 StoredVec = reverseVector(StoredVec); 2666 2667 // If this member has different type, cast it to a unified type. 2668 2669 if (StoredVec->getType() != SubVT) 2670 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2671 2672 StoredVecs.push_back(StoredVec); 2673 } 2674 2675 // Concatenate all vectors into a wide vector. 2676 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2677 2678 // Interleave the elements in the wide vector. 
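    // E.g. (illustrative, VF = 4, factor = 3): createInterleaveMask(4, 3)
    // below yields <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, so the concatenated
    // vector R0R1R2R3-G0G1G2G3-B0B1B2B3 is reordered into
    // R0G0B0-R1G1B1-R2G2B2-R3G3B3, i.e. the in-memory layout of the original
    // interleaved store group shown in the example at the top of this function.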
2679 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2680 Value *IVec = Builder.CreateShuffleVector( 2681 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2682 "interleaved.vec"); 2683 2684 Instruction *NewStoreInstr; 2685 if (BlockInMask) { 2686 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2687 Value *ShuffledMask = Builder.CreateShuffleVector( 2688 BlockInMaskPart, 2689 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2690 "interleaved.mask"); 2691 NewStoreInstr = Builder.CreateMaskedStore( 2692 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2693 } 2694 else 2695 NewStoreInstr = 2696 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2697 2698 Group->addMetadata(NewStoreInstr); 2699 } 2700 } 2701 2702 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2703 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2704 VPValue *StoredValue, VPValue *BlockInMask) { 2705 // Attempt to issue a wide load. 2706 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2707 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2708 2709 assert((LI || SI) && "Invalid Load/Store instruction"); 2710 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2711 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2712 2713 LoopVectorizationCostModel::InstWidening Decision = 2714 Cost->getWideningDecision(Instr, VF); 2715 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2716 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2717 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2718 "CM decision is not to widen the memory instruction"); 2719 2720 Type *ScalarDataTy = getMemInstValueType(Instr); 2721 2722 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2723 const Align Alignment = getLoadStoreAlignment(Instr); 2724 2725 // Determine if the pointer operand of the access is either consecutive or 2726 // reverse consecutive. 2727 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2728 bool ConsecutiveStride = 2729 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2730 bool CreateGatherScatter = 2731 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2732 2733 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2734 // gather/scatter. Otherwise Decision should have been to Scalarize. 2735 assert((ConsecutiveStride || CreateGatherScatter) && 2736 "The instruction should be scalarized"); 2737 (void)ConsecutiveStride; 2738 2739 VectorParts BlockInMaskParts(UF); 2740 bool isMaskRequired = BlockInMask; 2741 if (isMaskRequired) 2742 for (unsigned Part = 0; Part < UF; ++Part) 2743 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2744 2745 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2746 // Calculate the pointer for the specific unroll-part. 2747 GetElementPtrInst *PartPtr = nullptr; 2748 2749 bool InBounds = false; 2750 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2751 InBounds = gep->isInBounds(); 2752 2753 if (Reverse) { 2754 assert(!VF.isScalable() && 2755 "Reversing vectors is not yet supported for scalable vectors."); 2756 2757 // If the address is consecutive but reversed, then the 2758 // wide store needs to start at the last vector element. 
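      // E.g. (illustrative, VF = 4): for part 0 the two GEPs below apply
      // offsets 0 and 1 - 4 = -3, so the wide access covers elements
      // Ptr[-3 .. 0]; for part 1 the offsets are -4 and -3, covering
      // Ptr[-7 .. -4]. The vector value (and mask) is then reversed so that
      // lane i still corresponds to scalar iteration i of that part.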
2759 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2760 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2761 PartPtr->setIsInBounds(InBounds); 2762 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2763 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2764 PartPtr->setIsInBounds(InBounds); 2765 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2766 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2767 } else { 2768 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2769 PartPtr = cast<GetElementPtrInst>( 2770 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2771 PartPtr->setIsInBounds(InBounds); 2772 } 2773 2774 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2775 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2776 }; 2777 2778 // Handle Stores: 2779 if (SI) { 2780 setDebugLocFromInst(Builder, SI); 2781 2782 for (unsigned Part = 0; Part < UF; ++Part) { 2783 Instruction *NewSI = nullptr; 2784 Value *StoredVal = State.get(StoredValue, Part); 2785 if (CreateGatherScatter) { 2786 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2787 Value *VectorGep = State.get(Addr, Part); 2788 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2789 MaskPart); 2790 } else { 2791 if (Reverse) { 2792 // If we store to reverse consecutive memory locations, then we need 2793 // to reverse the order of elements in the stored value. 2794 StoredVal = reverseVector(StoredVal); 2795 // We don't want to update the value in the map as it might be used in 2796 // another expression. So don't call resetVectorValue(StoredVal). 2797 } 2798 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2799 if (isMaskRequired) 2800 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2801 BlockInMaskParts[Part]); 2802 else 2803 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2804 } 2805 addMetadata(NewSI, SI); 2806 } 2807 return; 2808 } 2809 2810 // Handle loads. 2811 assert(LI && "Must have a load instruction"); 2812 setDebugLocFromInst(Builder, LI); 2813 for (unsigned Part = 0; Part < UF; ++Part) { 2814 Value *NewLI; 2815 if (CreateGatherScatter) { 2816 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2817 Value *VectorGep = State.get(Addr, Part); 2818 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2819 nullptr, "wide.masked.gather"); 2820 addMetadata(NewLI, LI); 2821 } else { 2822 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2823 if (isMaskRequired) 2824 NewLI = Builder.CreateMaskedLoad( 2825 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2826 "wide.masked.load"); 2827 else 2828 NewLI = 2829 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2830 2831 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2832 addMetadata(NewLI, LI); 2833 if (Reverse) 2834 NewLI = reverseVector(NewLI); 2835 } 2836 2837 State.set(Def, Instr, NewLI, Part); 2838 } 2839 } 2840 2841 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2842 const VPIteration &Instance, 2843 bool IfPredicateInstr, 2844 VPTransformState &State) { 2845 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2846 2847 setDebugLocFromInst(Builder, Instr); 2848 2849 // Does this instruction return a value ? 
2850 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2851 2852 Instruction *Cloned = Instr->clone(); 2853 if (!IsVoidRetTy) 2854 Cloned->setName(Instr->getName() + ".cloned"); 2855 2856 // Replace the operands of the cloned instructions with their scalar 2857 // equivalents in the new loop. 2858 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2859 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2860 auto InputInstance = Instance; 2861 if (!Operand || !OrigLoop->contains(Operand) || 2862 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2863 InputInstance.Lane = 0; 2864 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2865 Cloned->setOperand(op, NewOp); 2866 } 2867 addNewMetadata(Cloned, Instr); 2868 2869 // Place the cloned scalar in the new loop. 2870 Builder.Insert(Cloned); 2871 2872 // TODO: Set result for VPValue of VPReciplicateRecipe. This requires 2873 // representing scalar values in VPTransformState. Add the cloned scalar to 2874 // the scalar map entry. 2875 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2876 2877 // If we just cloned a new assumption, add it the assumption cache. 2878 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2879 if (II->getIntrinsicID() == Intrinsic::assume) 2880 AC->registerAssumption(II); 2881 2882 // End if-block. 2883 if (IfPredicateInstr) 2884 PredicatedInstructions.push_back(Cloned); 2885 } 2886 2887 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2888 Value *End, Value *Step, 2889 Instruction *DL) { 2890 BasicBlock *Header = L->getHeader(); 2891 BasicBlock *Latch = L->getLoopLatch(); 2892 // As we're just creating this loop, it's possible no latch exists 2893 // yet. If so, use the header as this will be a single block loop. 2894 if (!Latch) 2895 Latch = Header; 2896 2897 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2898 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2899 setDebugLocFromInst(Builder, OldInst); 2900 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2901 2902 Builder.SetInsertPoint(Latch->getTerminator()); 2903 setDebugLocFromInst(Builder, OldInst); 2904 2905 // Create i+1 and fill the PHINode. 2906 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2907 Induction->addIncoming(Start, L->getLoopPreheader()); 2908 Induction->addIncoming(Next, Latch); 2909 // Create the compare. 2910 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2911 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2912 2913 // Now we have two terminators. Remove the old one from the block. 2914 Latch->getTerminator()->eraseFromParent(); 2915 2916 return Induction; 2917 } 2918 2919 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2920 if (TripCount) 2921 return TripCount; 2922 2923 assert(L && "Create Trip Count for null loop."); 2924 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2925 // Find the loop boundaries. 2926 ScalarEvolution *SE = PSE.getSE(); 2927 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2928 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2929 "Invalid loop count"); 2930 2931 Type *IdxTy = Legal->getWidestInductionType(); 2932 assert(IdxTy && "No type for induction"); 2933 2934 // The exit count might have the type of i64 while the phi is i32. This can 2935 // happen if we have an induction variable that is sign extended before the 2936 // compare. 
The only way that we get a backedge taken count is that the 2937 // induction variable was signed and as such will not overflow. In such a case 2938 // truncation is legal. 2939 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2940 IdxTy->getPrimitiveSizeInBits()) 2941 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2942 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2943 2944 // Get the total trip count from the count by adding 1. 2945 const SCEV *ExitCount = SE->getAddExpr( 2946 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2947 2948 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2949 2950 // Expand the trip count and place the new instructions in the preheader. 2951 // Notice that the pre-header does not change, only the loop body. 2952 SCEVExpander Exp(*SE, DL, "induction"); 2953 2954 // Count holds the overall loop count (N). 2955 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2956 L->getLoopPreheader()->getTerminator()); 2957 2958 if (TripCount->getType()->isPointerTy()) 2959 TripCount = 2960 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2961 L->getLoopPreheader()->getTerminator()); 2962 2963 return TripCount; 2964 } 2965 2966 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2967 if (VectorTripCount) 2968 return VectorTripCount; 2969 2970 Value *TC = getOrCreateTripCount(L); 2971 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2972 2973 Type *Ty = TC->getType(); 2974 // This is where we can make the step a runtime constant. 2975 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 2976 2977 // If the tail is to be folded by masking, round the number of iterations N 2978 // up to a multiple of Step instead of rounding down. This is done by first 2979 // adding Step-1 and then rounding down. Note that it's ok if this addition 2980 // overflows: the vector induction variable will eventually wrap to zero given 2981 // that it starts at zero and its Step is a power of two; the loop will then 2982 // exit, with the last early-exit vector comparison also producing all-true. 2983 if (Cost->foldTailByMasking()) { 2984 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2985 "VF*UF must be a power of 2 when folding tail by masking"); 2986 assert(!VF.isScalable() && 2987 "Tail folding not yet supported for scalable vectors"); 2988 TC = Builder.CreateAdd( 2989 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 2990 } 2991 2992 // Now we need to generate the expression for the part of the loop that the 2993 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2994 // iterations are not required for correctness, or N - Step, otherwise. Step 2995 // is equal to the vectorization factor (number of SIMD elements) times the 2996 // unroll factor (number of SIMD instructions). 2997 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2998 2999 // If there is a non-reversed interleaved group that may speculatively access 3000 // memory out-of-bounds, we need to ensure that there will be at least one 3001 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 3002 // the trip count, we set the remainder to be equal to the step. If the step 3003 // does not evenly divide the trip count, no adjustment is necessary since 3004 // there will already be scalar iterations. Note that the minimum iterations 3005 // check ensures that N >= Step. 
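  // E.g. (illustrative): with N = 16 and Step = VF * UF = 8 the remainder R
  // below would be 0, so the select bumps it to 8; the vector loop then covers
  // 8 iterations (n.vec = 8) and the scalar epilogue the remaining 8. With
  // N = 20 the remainder is already 4, no adjustment is made, and n.vec = 16.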
3006 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3007 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3008 R = Builder.CreateSelect(IsZero, Step, R); 3009 } 3010 3011 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3012 3013 return VectorTripCount; 3014 } 3015 3016 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3017 const DataLayout &DL) { 3018 // Verify that V is a vector type with same number of elements as DstVTy. 3019 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3020 unsigned VF = DstFVTy->getNumElements(); 3021 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3022 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3023 Type *SrcElemTy = SrcVecTy->getElementType(); 3024 Type *DstElemTy = DstFVTy->getElementType(); 3025 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3026 "Vector elements must have same size"); 3027 3028 // Do a direct cast if element types are castable. 3029 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3030 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3031 } 3032 // V cannot be directly casted to desired vector type. 3033 // May happen when V is a floating point vector but DstVTy is a vector of 3034 // pointers or vice-versa. Handle this using a two-step bitcast using an 3035 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3036 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3037 "Only one type should be a pointer type"); 3038 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3039 "Only one type should be a floating point type"); 3040 Type *IntTy = 3041 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3042 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3043 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3044 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3045 } 3046 3047 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3048 BasicBlock *Bypass) { 3049 Value *Count = getOrCreateTripCount(L); 3050 // Reuse existing vector loop preheader for TC checks. 3051 // Note that new preheader block is generated for vector loop. 3052 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3053 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3054 3055 // Generate code to check if the loop's trip count is less than VF * UF, or 3056 // equal to it in case a scalar epilogue is required; this implies that the 3057 // vector trip count is zero. This check also covers the case where adding one 3058 // to the backedge-taken count overflowed leading to an incorrect trip count 3059 // of zero. In this case we will also jump to the scalar loop. 3060 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3061 : ICmpInst::ICMP_ULT; 3062 3063 // If tail is to be folded, vector loop takes care of all iterations. 3064 Value *CheckMinIters = Builder.getFalse(); 3065 if (!Cost->foldTailByMasking()) { 3066 Value *Step = 3067 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3068 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3069 } 3070 // Create new preheader for vector loop. 
3071 LoopVectorPreHeader = 3072 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3073 "vector.ph"); 3074 3075 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3076 DT->getNode(Bypass)->getIDom()) && 3077 "TC check is expected to dominate Bypass"); 3078 3079 // Update dominator for Bypass & LoopExit. 3080 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3081 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3082 3083 ReplaceInstWithInst( 3084 TCCheckBlock->getTerminator(), 3085 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3086 LoopBypassBlocks.push_back(TCCheckBlock); 3087 } 3088 3089 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3090 // Reuse existing vector loop preheader for SCEV checks. 3091 // Note that new preheader block is generated for vector loop. 3092 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3093 3094 // Generate the code to check that the SCEV assumptions that we made. 3095 // We want the new basic block to start at the first instruction in a 3096 // sequence of instructions that form a check. 3097 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3098 "scev.check"); 3099 Value *SCEVCheck = Exp.expandCodeForPredicate( 3100 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3101 3102 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3103 if (C->isZero()) 3104 return; 3105 3106 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3107 (OptForSizeBasedOnProfile && 3108 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3109 "Cannot SCEV check stride or overflow when optimizing for size"); 3110 3111 SCEVCheckBlock->setName("vector.scevcheck"); 3112 // Create new preheader for vector loop. 3113 LoopVectorPreHeader = 3114 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3115 nullptr, "vector.ph"); 3116 3117 // Update dominator only if this is first RT check. 3118 if (LoopBypassBlocks.empty()) { 3119 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3120 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3121 } 3122 3123 ReplaceInstWithInst( 3124 SCEVCheckBlock->getTerminator(), 3125 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3126 LoopBypassBlocks.push_back(SCEVCheckBlock); 3127 AddedSafetyChecks = true; 3128 } 3129 3130 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3131 // VPlan-native path does not do any analysis for runtime checks currently. 3132 if (EnableVPlanNativePath) 3133 return; 3134 3135 // Reuse existing vector loop preheader for runtime memory checks. 3136 // Note that new preheader block is generated for vector loop. 3137 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3138 3139 // Generate the code that checks in runtime if arrays overlap. We put the 3140 // checks into a separate block to make the more common case of few elements 3141 // faster. 
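  // Conceptually (an illustrative sketch only, not the exact IR produced by
  // addRuntimeChecks below), for two pointer groups A and B the emitted block
  // tests whether the accessed ranges intersect, e.g.
  //   %conflict = and (icmp ult A.start, B.end), (icmp ult B.start, A.end)
  // and branches to the scalar loop (Bypass) when %conflict is true, to the
  // new vector preheader otherwise.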
3142 auto *LAI = Legal->getLAI(); 3143 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3144 if (!RtPtrChecking.Need) 3145 return; 3146 3147 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3148 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3149 "Cannot emit memory checks when optimizing for size, unless forced " 3150 "to vectorize."); 3151 ORE->emit([&]() { 3152 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3153 L->getStartLoc(), L->getHeader()) 3154 << "Code-size may be reduced by not forcing " 3155 "vectorization, or by source-code modifications " 3156 "eliminating the need for runtime checks " 3157 "(e.g., adding 'restrict')."; 3158 }); 3159 } 3160 3161 MemCheckBlock->setName("vector.memcheck"); 3162 // Create new preheader for vector loop. 3163 LoopVectorPreHeader = 3164 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3165 "vector.ph"); 3166 3167 auto *CondBranch = cast<BranchInst>( 3168 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3169 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3170 LoopBypassBlocks.push_back(MemCheckBlock); 3171 AddedSafetyChecks = true; 3172 3173 // Update dominator only if this is first RT check. 3174 if (LoopBypassBlocks.empty()) { 3175 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3176 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3177 } 3178 3179 Instruction *FirstCheckInst; 3180 Instruction *MemRuntimeCheck; 3181 std::tie(FirstCheckInst, MemRuntimeCheck) = 3182 addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, 3183 RtPtrChecking.getChecks(), RtPtrChecking.getSE()); 3184 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3185 "claimed checks are required"); 3186 CondBranch->setCondition(MemRuntimeCheck); 3187 3188 // We currently don't use LoopVersioning for the actual loop cloning but we 3189 // still use it to add the noalias metadata. 3190 LVer = std::make_unique<LoopVersioning>( 3191 *Legal->getLAI(), 3192 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3193 DT, PSE.getSE()); 3194 LVer->prepareNoAliasMetadata(); 3195 } 3196 3197 Value *InnerLoopVectorizer::emitTransformedIndex( 3198 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3199 const InductionDescriptor &ID) const { 3200 3201 SCEVExpander Exp(*SE, DL, "induction"); 3202 auto Step = ID.getStep(); 3203 auto StartValue = ID.getStartValue(); 3204 assert(Index->getType() == Step->getType() && 3205 "Index type does not match StepValue type"); 3206 3207 // Note: the IR at this point is broken. We cannot use SE to create any new 3208 // SCEV and then expand it, hoping that SCEV's simplification will give us 3209 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3210 // lead to various SCEV crashes. So all we can do is to use builder and rely 3211 // on InstCombine for future simplifications. Here we handle some trivial 3212 // cases only. 
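  // E.g. (illustrative): for an integer induction with start value S and step
  // C, an index I is transformed into S + I * C below; the helper lambdas that
  // follow merely fold the trivial I * 1 and S + 0 forms by hand instead of
  // asking SCEV to simplify them on the currently-invalid IR.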
3213 auto CreateAdd = [&B](Value *X, Value *Y) { 3214 assert(X->getType() == Y->getType() && "Types don't match!"); 3215 if (auto *CX = dyn_cast<ConstantInt>(X)) 3216 if (CX->isZero()) 3217 return Y; 3218 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3219 if (CY->isZero()) 3220 return X; 3221 return B.CreateAdd(X, Y); 3222 }; 3223 3224 auto CreateMul = [&B](Value *X, Value *Y) { 3225 assert(X->getType() == Y->getType() && "Types don't match!"); 3226 if (auto *CX = dyn_cast<ConstantInt>(X)) 3227 if (CX->isOne()) 3228 return Y; 3229 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3230 if (CY->isOne()) 3231 return X; 3232 return B.CreateMul(X, Y); 3233 }; 3234 3235 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3236 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3237 // the DomTree is not kept up-to-date for additional blocks generated in the 3238 // vector loop. By using the header as insertion point, we guarantee that the 3239 // expanded instructions dominate all their uses. 3240 auto GetInsertPoint = [this, &B]() { 3241 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3242 if (InsertBB != LoopVectorBody && 3243 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3244 return LoopVectorBody->getTerminator(); 3245 return &*B.GetInsertPoint(); 3246 }; 3247 switch (ID.getKind()) { 3248 case InductionDescriptor::IK_IntInduction: { 3249 assert(Index->getType() == StartValue->getType() && 3250 "Index type does not match StartValue type"); 3251 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3252 return B.CreateSub(StartValue, Index); 3253 auto *Offset = CreateMul( 3254 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3255 return CreateAdd(StartValue, Offset); 3256 } 3257 case InductionDescriptor::IK_PtrInduction: { 3258 assert(isa<SCEVConstant>(Step) && 3259 "Expected constant step for pointer induction"); 3260 return B.CreateGEP( 3261 StartValue->getType()->getPointerElementType(), StartValue, 3262 CreateMul(Index, 3263 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3264 } 3265 case InductionDescriptor::IK_FpInduction: { 3266 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3267 auto InductionBinOp = ID.getInductionBinOp(); 3268 assert(InductionBinOp && 3269 (InductionBinOp->getOpcode() == Instruction::FAdd || 3270 InductionBinOp->getOpcode() == Instruction::FSub) && 3271 "Original bin op should be defined for FP induction"); 3272 3273 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3274 3275 // Floating point operations had to be 'fast' to enable the induction. 3276 FastMathFlags Flags; 3277 Flags.setFast(); 3278 3279 Value *MulExp = B.CreateFMul(StepValue, Index); 3280 if (isa<Instruction>(MulExp)) 3281 // We have to check, the MulExp may be a constant. 
3282 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3283 3284 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3285 "induction"); 3286 if (isa<Instruction>(BOp)) 3287 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3288 3289 return BOp; 3290 } 3291 case InductionDescriptor::IK_NoInduction: 3292 return nullptr; 3293 } 3294 llvm_unreachable("invalid enum"); 3295 } 3296 3297 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3298 LoopScalarBody = OrigLoop->getHeader(); 3299 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3300 LoopExitBlock = OrigLoop->getExitBlock(); 3301 assert(LoopExitBlock && "Must have an exit block"); 3302 assert(LoopVectorPreHeader && "Invalid loop structure"); 3303 3304 LoopMiddleBlock = 3305 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3306 LI, nullptr, Twine(Prefix) + "middle.block"); 3307 LoopScalarPreHeader = 3308 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3309 nullptr, Twine(Prefix) + "scalar.ph"); 3310 // We intentionally don't let SplitBlock to update LoopInfo since 3311 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3312 // LoopVectorBody is explicitly added to the correct place few lines later. 3313 LoopVectorBody = 3314 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3315 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3316 3317 // Update dominator for loop exit. 3318 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3319 3320 // Create and register the new vector loop. 3321 Loop *Lp = LI->AllocateLoop(); 3322 Loop *ParentLoop = OrigLoop->getParentLoop(); 3323 3324 // Insert the new loop into the loop nest and register the new basic blocks 3325 // before calling any utilities such as SCEV that require valid LoopInfo. 3326 if (ParentLoop) { 3327 ParentLoop->addChildLoop(Lp); 3328 } else { 3329 LI->addTopLevelLoop(Lp); 3330 } 3331 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3332 return Lp; 3333 } 3334 3335 void InnerLoopVectorizer::createInductionResumeValues( 3336 Loop *L, Value *VectorTripCount, 3337 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3338 assert(VectorTripCount && L && "Expected valid arguments"); 3339 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3340 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3341 "Inconsistent information about additional bypass."); 3342 // We are going to resume the execution of the scalar loop. 3343 // Go over all of the induction variables that we found and fix the 3344 // PHIs that are left in the scalar version of the loop. 3345 // The starting values of PHI nodes depend on the counter of the last 3346 // iteration in the vectorized loop. 3347 // If we come from a bypass edge then we need to start from the original 3348 // start value. 3349 for (auto &InductionEntry : Legal->getInductionVars()) { 3350 PHINode *OrigPhi = InductionEntry.first; 3351 InductionDescriptor II = InductionEntry.second; 3352 3353 // Create phi nodes to merge from the backedge-taken check block. 3354 PHINode *BCResumeVal = 3355 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3356 LoopScalarPreHeader->getTerminator()); 3357 // Copy original phi DL over to the new one. 3358 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3359 Value *&EndValue = IVEndValues[OrigPhi]; 3360 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3361 if (OrigPhi == OldInduction) { 3362 // We know what the end value is. 
3363 EndValue = VectorTripCount; 3364 } else { 3365 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3366 Type *StepType = II.getStep()->getType(); 3367 Instruction::CastOps CastOp = 3368 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3369 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3370 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3371 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3372 EndValue->setName("ind.end"); 3373 3374 // Compute the end value for the additional bypass (if applicable). 3375 if (AdditionalBypass.first) { 3376 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3377 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3378 StepType, true); 3379 CRD = 3380 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3381 EndValueFromAdditionalBypass = 3382 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3383 EndValueFromAdditionalBypass->setName("ind.end"); 3384 } 3385 } 3386 // The new PHI merges the original incoming value, in case of a bypass, 3387 // or the value at the end of the vectorized loop. 3388 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3389 3390 // Fix the scalar body counter (PHI node). 3391 // The old induction's phi node in the scalar body needs the truncated 3392 // value. 3393 for (BasicBlock *BB : LoopBypassBlocks) 3394 BCResumeVal->addIncoming(II.getStartValue(), BB); 3395 3396 if (AdditionalBypass.first) 3397 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3398 EndValueFromAdditionalBypass); 3399 3400 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3401 } 3402 } 3403 3404 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3405 MDNode *OrigLoopID) { 3406 assert(L && "Expected valid loop."); 3407 3408 // The trip counts should be cached by now. 3409 Value *Count = getOrCreateTripCount(L); 3410 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3411 3412 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3413 3414 // Add a check in the middle block to see if we have completed 3415 // all of the iterations in the first vector loop. 3416 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3417 // If tail is to be folded, we know we don't need to run the remainder. 3418 Value *CmpN = Builder.getTrue(); 3419 if (!Cost->foldTailByMasking()) { 3420 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3421 VectorTripCount, "cmp.n", 3422 LoopMiddleBlock->getTerminator()); 3423 3424 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3425 // of the corresponding compare because they may have ended up with 3426 // different line numbers and we want to avoid awkward line stepping while 3427 // debugging. Eg. if the compare has got a line number inside the loop. 3428 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3429 } 3430 3431 BranchInst *BrInst = 3432 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3433 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3434 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3435 3436 // Get ready to start creating new instructions into the vectorized body. 
3437 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3438 "Inconsistent vector loop preheader"); 3439 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3440 3441 Optional<MDNode *> VectorizedLoopID = 3442 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3443 LLVMLoopVectorizeFollowupVectorized}); 3444 if (VectorizedLoopID.hasValue()) { 3445 L->setLoopID(VectorizedLoopID.getValue()); 3446 3447 // Do not setAlreadyVectorized if loop attributes have been defined 3448 // explicitly. 3449 return LoopVectorPreHeader; 3450 } 3451 3452 // Keep all loop hints from the original loop on the vector loop (we'll 3453 // replace the vectorizer-specific hints below). 3454 if (MDNode *LID = OrigLoop->getLoopID()) 3455 L->setLoopID(LID); 3456 3457 LoopVectorizeHints Hints(L, true, *ORE); 3458 Hints.setAlreadyVectorized(); 3459 3460 #ifdef EXPENSIVE_CHECKS 3461 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3462 LI->verify(*DT); 3463 #endif 3464 3465 return LoopVectorPreHeader; 3466 } 3467 3468 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3469 /* 3470 In this function we generate a new loop. The new loop will contain 3471 the vectorized instructions while the old loop will continue to run the 3472 scalar remainder. 3473 3474 [ ] <-- loop iteration number check. 3475 / | 3476 / v 3477 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3478 | / | 3479 | / v 3480 || [ ] <-- vector pre header. 3481 |/ | 3482 | v 3483 | [ ] \ 3484 | [ ]_| <-- vector loop. 3485 | | 3486 | v 3487 | -[ ] <--- middle-block. 3488 | / | 3489 | / v 3490 -|- >[ ] <--- new preheader. 3491 | | 3492 | v 3493 | [ ] \ 3494 | [ ]_| <-- old scalar loop to handle remainder. 3495 \ | 3496 \ v 3497 >[ ] <-- exit block. 3498 ... 3499 */ 3500 3501 // Get the metadata of the original loop before it gets modified. 3502 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3503 3504 // Create an empty vector loop, and prepare basic blocks for the runtime 3505 // checks. 3506 Loop *Lp = createVectorLoopSkeleton(""); 3507 3508 // Now, compare the new count to zero. If it is zero skip the vector loop and 3509 // jump to the scalar loop. This check also covers the case where the 3510 // backedge-taken count is uint##_max: adding one to it will overflow leading 3511 // to an incorrect trip count of zero. In this (rare) case we will also jump 3512 // to the scalar loop. 3513 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3514 3515 // Generate the code to check any assumptions that we've made for SCEV 3516 // expressions. 3517 emitSCEVChecks(Lp, LoopScalarPreHeader); 3518 3519 // Generate the code that checks in runtime if arrays overlap. We put the 3520 // checks into a separate block to make the more common case of few elements 3521 // faster. 3522 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3523 3524 // Some loops have a single integer induction variable, while other loops 3525 // don't. One example is c++ iterators that often have multiple pointer 3526 // induction variables. In the code below we also support a case where we 3527 // don't have a single induction variable. 3528 // 3529 // We try to obtain an induction variable from the original loop as hard 3530 // as possible. However if we don't find one that: 3531 // - is an integer 3532 // - counts from zero, stepping by one 3533 // - is the size of the widest induction variable type 3534 // then we create a new one. 
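  // In shorthand IR, the canonical induction variable created below looks
  // like this (illustrative, assuming a 64-bit induction type, VF = 4 and
  // UF = 2):
  //
  //   vector.body:
  //     %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //     ...
  //     %index.next = add i64 %index, 8
  //     %cmp = icmp eq i64 %index.next, %n.vec
  //     br i1 %cmp, label %middle.block, label %vector.body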
3535 OldInduction = Legal->getPrimaryInduction(); 3536 Type *IdxTy = Legal->getWidestInductionType(); 3537 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3538 // The loop step is equal to the vectorization factor (num of SIMD elements) 3539 // times the unroll factor (num of SIMD instructions). 3540 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3541 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3542 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3543 Induction = 3544 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3545 getDebugLocFromInstOrOperands(OldInduction)); 3546 3547 // Emit phis for the new starting index of the scalar loop. 3548 createInductionResumeValues(Lp, CountRoundDown); 3549 3550 return completeLoopSkeleton(Lp, OrigLoopID); 3551 } 3552 3553 // Fix up external users of the induction variable. At this point, we are 3554 // in LCSSA form, with all external PHIs that use the IV having one input value, 3555 // coming from the remainder loop. We need those PHIs to also have a correct 3556 // value for the IV when arriving directly from the middle block. 3557 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3558 const InductionDescriptor &II, 3559 Value *CountRoundDown, Value *EndValue, 3560 BasicBlock *MiddleBlock) { 3561 // There are two kinds of external IV usages - those that use the value 3562 // computed in the last iteration (the PHI) and those that use the penultimate 3563 // value (the value that feeds into the phi from the loop latch). 3564 // We allow both, but they, obviously, have different values. 3565 3566 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3567 3568 DenseMap<Value *, Value *> MissingVals; 3569 3570 // An external user of the last iteration's value should see the value that 3571 // the remainder loop uses to initialize its own IV. 3572 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3573 for (User *U : PostInc->users()) { 3574 Instruction *UI = cast<Instruction>(U); 3575 if (!OrigLoop->contains(UI)) { 3576 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3577 MissingVals[UI] = EndValue; 3578 } 3579 } 3580 3581 // An external user of the penultimate value need to see EndValue - Step. 3582 // The simplest way to get this is to recompute it from the constituent SCEVs, 3583 // that is Start + (Step * (CRD - 1)). 3584 for (User *U : OrigPhi->users()) { 3585 auto *UI = cast<Instruction>(U); 3586 if (!OrigLoop->contains(UI)) { 3587 const DataLayout &DL = 3588 OrigLoop->getHeader()->getModule()->getDataLayout(); 3589 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3590 3591 IRBuilder<> B(MiddleBlock->getTerminator()); 3592 Value *CountMinusOne = B.CreateSub( 3593 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3594 Value *CMO = 3595 !II.getStep()->getType()->isIntegerTy() 3596 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3597 II.getStep()->getType()) 3598 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3599 CMO->setName("cast.cmo"); 3600 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3601 Escape->setName("ind.escape"); 3602 MissingVals[UI] = Escape; 3603 } 3604 } 3605 3606 for (auto &I : MissingVals) { 3607 PHINode *PHI = cast<PHINode>(I.first); 3608 // One corner case we have to handle is two IVs "chasing" each-other, 3609 // that is %IV2 = phi [...], [ %IV1, %latch ] 3610 // In this case, if IV1 has an external use, we need to avoid adding both 3611 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3612 // don't already have an incoming value for the middle block. 3613 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3614 PHI->addIncoming(I.second, MiddleBlock); 3615 } 3616 } 3617 3618 namespace { 3619 3620 struct CSEDenseMapInfo { 3621 static bool canHandle(const Instruction *I) { 3622 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3623 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3624 } 3625 3626 static inline Instruction *getEmptyKey() { 3627 return DenseMapInfo<Instruction *>::getEmptyKey(); 3628 } 3629 3630 static inline Instruction *getTombstoneKey() { 3631 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3632 } 3633 3634 static unsigned getHashValue(const Instruction *I) { 3635 assert(canHandle(I) && "Unknown instruction!"); 3636 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3637 I->value_op_end())); 3638 } 3639 3640 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3641 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3642 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3643 return LHS == RHS; 3644 return LHS->isIdenticalTo(RHS); 3645 } 3646 }; 3647 3648 } // end anonymous namespace 3649 3650 ///Perform cse of induction variable instructions. 3651 static void cse(BasicBlock *BB) { 3652 // Perform simple cse. 3653 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3654 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3655 Instruction *In = &*I++; 3656 3657 if (!CSEDenseMapInfo::canHandle(In)) 3658 continue; 3659 3660 // Check if we can replace this instruction with any of the 3661 // visited instructions. 3662 if (Instruction *V = CSEMap.lookup(In)) { 3663 In->replaceAllUsesWith(V); 3664 In->eraseFromParent(); 3665 continue; 3666 } 3667 3668 CSEMap[In] = In; 3669 } 3670 } 3671 3672 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3673 ElementCount VF, 3674 bool &NeedToScalarize) { 3675 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3676 Function *F = CI->getCalledFunction(); 3677 Type *ScalarRetTy = CI->getType(); 3678 SmallVector<Type *, 4> Tys, ScalarTys; 3679 for (auto &ArgOp : CI->arg_operands()) 3680 ScalarTys.push_back(ArgOp->getType()); 3681 3682 // Estimate cost of scalarized vector call. The source operands are assumed 3683 // to be vectors, so we need to extract individual elements from there, 3684 // execute VF scalar calls, and then gather the result into the vector return 3685 // value. 3686 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, 3687 TTI::TCK_RecipThroughput); 3688 if (VF.isScalar()) 3689 return ScalarCallCost; 3690 3691 // Compute corresponding vector type for return value and arguments. 
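  // E.g. (illustrative, VF = 4): for a call "float foo(float, i32)" the
  // vectorized prototype considered here is "<4 x float> (<4 x float>,
  // <4 x i32>)", and the scalarized estimate computed below is
  // ScalarCallCost * 4 + ScalarizationCost.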
3692 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3693 for (Type *ScalarTy : ScalarTys) 3694 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3695 3696 // Compute costs of unpacking argument values for the scalar calls and 3697 // packing the return values to a vector. 3698 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3699 3700 unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3701 3702 // If we can't emit a vector call for this function, then the currently found 3703 // cost is the cost we need to return. 3704 NeedToScalarize = true; 3705 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3706 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3707 3708 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3709 return Cost; 3710 3711 // If the corresponding vector cost is cheaper, return its cost. 3712 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys, 3713 TTI::TCK_RecipThroughput); 3714 if (VectorCallCost < Cost) { 3715 NeedToScalarize = false; 3716 return VectorCallCost; 3717 } 3718 return Cost; 3719 } 3720 3721 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3722 ElementCount VF) { 3723 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3724 assert(ID && "Expected intrinsic call!"); 3725 3726 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3727 return TTI.getIntrinsicInstrCost(CostAttrs, 3728 TargetTransformInfo::TCK_RecipThroughput); 3729 } 3730 3731 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3732 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3733 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3734 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3735 } 3736 3737 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3738 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3739 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3740 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3741 } 3742 3743 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3744 // For every instruction `I` in MinBWs, truncate the operands, create a 3745 // truncated version of `I` and reextend its result. InstCombine runs 3746 // later and will remove any ext/trunc pairs. 3747 SmallPtrSet<Value *, 4> Erased; 3748 for (const auto &KV : Cost->getMinimalBitwidths()) { 3749 // If the value wasn't vectorized, we must maintain the original scalar 3750 // type. The absence of the value from VectorLoopValueMap indicates that it 3751 // wasn't vectorized. 3752 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3753 continue; 3754 for (unsigned Part = 0; Part < UF; ++Part) { 3755 Value *I = getOrCreateVectorValue(KV.first, Part); 3756 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3757 continue; 3758 Type *OriginalTy = I->getType(); 3759 Type *ScalarTruncatedTy = 3760 IntegerType::get(OriginalTy->getContext(), KV.second); 3761 auto *TruncatedTy = FixedVectorType::get( 3762 ScalarTruncatedTy, 3763 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3764 if (TruncatedTy == OriginalTy) 3765 continue; 3766 3767 IRBuilder<> B(cast<Instruction>(I)); 3768 auto ShrinkOperand = [&](Value *V) -> Value * { 3769 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3770 if (ZI->getSrcTy() == TruncatedTy) 3771 return ZI->getOperand(0); 3772 return B.CreateZExtOrTrunc(V, TruncatedTy); 3773 }; 3774 3775 // The actual instruction modification depends on the instruction type, 3776 // unfortunately. 
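      // E.g. (illustrative): if MinBWs records that an i32 add only needs 8
      // bits and VF = 4, the code below rewrites
      //   %add = add <4 x i32> %a, %b
      // into
      //   %a.tr   = trunc <4 x i32> %a to <4 x i8>
      //   %b.tr   = trunc <4 x i32> %b to <4 x i8>
      //   %add.tr = add <4 x i8> %a.tr, %b.tr
      //   %res    = zext <4 x i8> %add.tr to <4 x i32>
      // and relies on InstCombine to remove the redundant ext/trunc pairs.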
3777 Value *NewI = nullptr; 3778 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3779 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3780 ShrinkOperand(BO->getOperand(1))); 3781 3782 // Any wrapping introduced by shrinking this operation shouldn't be 3783 // considered undefined behavior. So, we can't unconditionally copy 3784 // arithmetic wrapping flags to NewI. 3785 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3786 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3787 NewI = 3788 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3789 ShrinkOperand(CI->getOperand(1))); 3790 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3791 NewI = B.CreateSelect(SI->getCondition(), 3792 ShrinkOperand(SI->getTrueValue()), 3793 ShrinkOperand(SI->getFalseValue())); 3794 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3795 switch (CI->getOpcode()) { 3796 default: 3797 llvm_unreachable("Unhandled cast!"); 3798 case Instruction::Trunc: 3799 NewI = ShrinkOperand(CI->getOperand(0)); 3800 break; 3801 case Instruction::SExt: 3802 NewI = B.CreateSExtOrTrunc( 3803 CI->getOperand(0), 3804 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3805 break; 3806 case Instruction::ZExt: 3807 NewI = B.CreateZExtOrTrunc( 3808 CI->getOperand(0), 3809 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3810 break; 3811 } 3812 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3813 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3814 ->getNumElements(); 3815 auto *O0 = B.CreateZExtOrTrunc( 3816 SI->getOperand(0), 3817 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3818 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3819 ->getNumElements(); 3820 auto *O1 = B.CreateZExtOrTrunc( 3821 SI->getOperand(1), 3822 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3823 3824 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3825 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3826 // Don't do anything with the operands, just extend the result. 3827 continue; 3828 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3829 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3830 ->getNumElements(); 3831 auto *O0 = B.CreateZExtOrTrunc( 3832 IE->getOperand(0), 3833 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3834 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3835 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3836 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3837 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3838 ->getNumElements(); 3839 auto *O0 = B.CreateZExtOrTrunc( 3840 EE->getOperand(0), 3841 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3842 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3843 } else { 3844 // If we don't know what to do, be conservative and don't do anything. 3845 continue; 3846 } 3847 3848 // Lastly, extend the result. 3849 NewI->takeName(cast<Instruction>(I)); 3850 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3851 I->replaceAllUsesWith(Res); 3852 cast<Instruction>(I)->eraseFromParent(); 3853 Erased.insert(I); 3854 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3855 } 3856 } 3857 3858 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3859 for (const auto &KV : Cost->getMinimalBitwidths()) { 3860 // If the value wasn't vectorized, we must maintain the original scalar 3861 // type. 
The absence of the value from VectorLoopValueMap indicates that it
3862     // wasn't vectorized.
3863     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3864       continue;
3865     for (unsigned Part = 0; Part < UF; ++Part) {
3866       Value *I = getOrCreateVectorValue(KV.first, Part);
3867       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3868       if (Inst && Inst->use_empty()) {
3869         Value *NewI = Inst->getOperand(0);
3870         Inst->eraseFromParent();
3871         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3872       }
3873     }
3874   }
3875 }
3876 
3877 void InnerLoopVectorizer::fixVectorizedLoop() {
3878   // Insert truncates and extends for any truncated instructions as hints to
3879   // InstCombine.
3880   if (VF.isVector())
3881     truncateToMinimalBitwidths();
3882 
3883   // Fix widened non-induction PHIs by setting up the PHI operands.
3884   if (OrigPHIsToFix.size()) {
3885     assert(EnableVPlanNativePath &&
3886            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3887     fixNonInductionPHIs();
3888   }
3889 
3890   // At this point every instruction in the original loop is widened to a
3891   // vector form. Now we need to fix the recurrences in the loop. These PHI
3892   // nodes are currently empty because we did not want to introduce cycles.
3893   // This is the second stage of vectorizing recurrences.
3894   fixCrossIterationPHIs();
3895 
3896   // Forget the original basic block.
3897   PSE.getSE()->forgetLoop(OrigLoop);
3898 
3899   // Fix-up external users of the induction variables.
3900   for (auto &Entry : Legal->getInductionVars())
3901     fixupIVUsers(Entry.first, Entry.second,
3902                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3903                  IVEndValues[Entry.first], LoopMiddleBlock);
3904 
3905   fixLCSSAPHIs();
3906   for (Instruction *PI : PredicatedInstructions)
3907     sinkScalarOperands(&*PI);
3908 
3909   // Remove redundant induction instructions.
3910   cse(LoopVectorBody);
3911 
3912   // Set/update profile weights for the vector and remainder loops as original
3913   // loop iterations are now distributed among them. Note that original loop
3914   // represented by LoopScalarBody becomes remainder loop after vectorization.
3915   //
3916   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3917   // end up getting a slightly roughened result but that should be OK since
3918   // profile is not inherently precise anyway. Note also possible bypass of
3919   // vector code caused by legality checks is ignored, assigning all the weight
3920   // to the vector loop, optimistically.
3921   //
3922   // For scalable vectorization we can't know at compile time how many iterations
3923   // of the loop are handled in one vector iteration, so instead assume a pessimistic
3924   // vscale of '1'.
3925   setProfileInfoAfterUnrolling(
3926       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3927       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3928 }
3929 
3930 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3931   // In order to support recurrences we need to be able to vectorize Phi nodes.
3932   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3933   // stage #2: We now need to fix the recurrences by adding incoming edges to
3934   // the currently empty PHI nodes. At this point every instruction in the
3935   // original loop is widened to a vector form so we can use them to construct
3936   // the incoming edges.
3937   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3938     // Handle first-order recurrences and reductions that need to be fixed.
3939 if (Legal->isFirstOrderRecurrence(&Phi)) 3940 fixFirstOrderRecurrence(&Phi); 3941 else if (Legal->isReductionVariable(&Phi)) 3942 fixReduction(&Phi); 3943 } 3944 } 3945 3946 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3947 // This is the second phase of vectorizing first-order recurrences. An 3948 // overview of the transformation is described below. Suppose we have the 3949 // following loop. 3950 // 3951 // for (int i = 0; i < n; ++i) 3952 // b[i] = a[i] - a[i - 1]; 3953 // 3954 // There is a first-order recurrence on "a". For this loop, the shorthand 3955 // scalar IR looks like: 3956 // 3957 // scalar.ph: 3958 // s_init = a[-1] 3959 // br scalar.body 3960 // 3961 // scalar.body: 3962 // i = phi [0, scalar.ph], [i+1, scalar.body] 3963 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3964 // s2 = a[i] 3965 // b[i] = s2 - s1 3966 // br cond, scalar.body, ... 3967 // 3968 // In this example, s1 is a recurrence because it's value depends on the 3969 // previous iteration. In the first phase of vectorization, we created a 3970 // temporary value for s1. We now complete the vectorization and produce the 3971 // shorthand vector IR shown below (for VF = 4, UF = 1). 3972 // 3973 // vector.ph: 3974 // v_init = vector(..., ..., ..., a[-1]) 3975 // br vector.body 3976 // 3977 // vector.body 3978 // i = phi [0, vector.ph], [i+4, vector.body] 3979 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3980 // v2 = a[i, i+1, i+2, i+3]; 3981 // v3 = vector(v1(3), v2(0, 1, 2)) 3982 // b[i, i+1, i+2, i+3] = v2 - v3 3983 // br cond, vector.body, middle.block 3984 // 3985 // middle.block: 3986 // x = v2(3) 3987 // br scalar.ph 3988 // 3989 // scalar.ph: 3990 // s_init = phi [x, middle.block], [a[-1], otherwise] 3991 // br scalar.body 3992 // 3993 // After execution completes the vector loop, we extract the next value of 3994 // the recurrence (x) to use as the initial value in the scalar loop. 3995 3996 // Get the original loop preheader and single loop latch. 3997 auto *Preheader = OrigLoop->getLoopPreheader(); 3998 auto *Latch = OrigLoop->getLoopLatch(); 3999 4000 // Get the initial and previous values of the scalar recurrence. 4001 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4002 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4003 4004 // Create a vector from the initial value. 4005 auto *VectorInit = ScalarInit; 4006 if (VF.isVector()) { 4007 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4008 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4009 VectorInit = Builder.CreateInsertElement( 4010 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4011 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4012 } 4013 4014 // We constructed a temporary phi node in the first phase of vectorization. 4015 // This phi node will eventually be deleted. 4016 Builder.SetInsertPoint( 4017 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4018 4019 // Create a phi node for the new recurrence. The current value will either be 4020 // the initial value inserted into a vector or loop-varying vector value. 4021 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4022 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4023 4024 // Get the vectorized previous value of the last part UF - 1. It appears last 4025 // among all unrolled iterations, due to the order of their construction. 
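  // E.g. (illustrative, UF = 2): the unrolled previous value has two parts,
  // %prev.0 and %prev.1, emitted in that order, so part UF - 1 (%prev.1) is
  // the last of them in program order and is a safe place after which to
  // insert the shuffles created below.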
4026 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4027 4028 // Find and set the insertion point after the previous value if it is an 4029 // instruction. 4030 BasicBlock::iterator InsertPt; 4031 // Note that the previous value may have been constant-folded so it is not 4032 // guaranteed to be an instruction in the vector loop. 4033 // FIXME: Loop invariant values do not form recurrences. We should deal with 4034 // them earlier. 4035 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4036 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4037 else { 4038 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4039 if (isa<PHINode>(PreviousLastPart)) 4040 // If the previous value is a phi node, we should insert after all the phi 4041 // nodes in the block containing the PHI to avoid breaking basic block 4042 // verification. Note that the basic block may be different to 4043 // LoopVectorBody, in case we predicate the loop. 4044 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4045 else 4046 InsertPt = ++PreviousInst->getIterator(); 4047 } 4048 Builder.SetInsertPoint(&*InsertPt); 4049 4050 // We will construct a vector for the recurrence by combining the values for 4051 // the current and previous iterations. This is the required shuffle mask. 4052 assert(!VF.isScalable()); 4053 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4054 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4055 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4056 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4057 4058 // The vector from which to take the initial value for the current iteration 4059 // (actual or unrolled). Initially, this is the vector phi node. 4060 Value *Incoming = VecPhi; 4061 4062 // Shuffle the current and previous vector and update the vector parts. 4063 for (unsigned Part = 0; Part < UF; ++Part) { 4064 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4065 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4066 auto *Shuffle = 4067 VF.isVector() 4068 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4069 : Incoming; 4070 PhiPart->replaceAllUsesWith(Shuffle); 4071 cast<Instruction>(PhiPart)->eraseFromParent(); 4072 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4073 Incoming = PreviousPart; 4074 } 4075 4076 // Fix the latch value of the new recurrence in the vector loop. 4077 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4078 4079 // Extract the last vector element in the middle block. This will be the 4080 // initial value for the recurrence when jumping to the scalar loop. 4081 auto *ExtractForScalar = Incoming; 4082 if (VF.isVector()) { 4083 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4084 ExtractForScalar = Builder.CreateExtractElement( 4085 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4086 "vector.recur.extract"); 4087 } 4088 // Extract the second last element in the middle block if the 4089 // Phi is used outside the loop. We need to extract the phi itself 4090 // and not the last element (the phi update in the current iteration). This 4091 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4092 // when the scalar loop is not run at all. 
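  // For example (a sketch with VF = 4, UF = 1): if the last computed vector
  // of 'Previous' is <p0, p1, p2, p3>, then lane 3 (p3) seeds the scalar
  // loop's s_init above, while lane 2 (p2) is the value the recurrence phi
  // held in the final vector iteration, which is what an outside user of the
  // phi needs when the scalar loop does not run.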
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector())
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
        "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}

void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
      RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);
  bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, Or and Xor, one
  // for multiplication, -1 for And.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
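    // For example (a sketch with VF = 4): a umin reduction starting at %s can
    // splat %s into every lane, <%s, %s, %s, %s>, because min(%s, %s) == %s.
    // Other recurrence kinds (handled in the else branch below) use a neutral
    // element instead, e.g. <%s, 0, 0, 0> for an integer add starting at %s.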
4160 if (VF.isScalar() || IsInLoopReductionPhi) { 4161 VectorStart = Identity = ReductionStartValue; 4162 } else { 4163 VectorStart = Identity = 4164 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 4165 } 4166 } else { 4167 // Handle other reduction kinds: 4168 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 4169 RK, MinMaxKind, VecTy->getScalarType()); 4170 if (VF.isScalar() || IsInLoopReductionPhi) { 4171 Identity = Iden; 4172 // This vector is the Identity vector where the first element is the 4173 // incoming scalar reduction. 4174 VectorStart = ReductionStartValue; 4175 } else { 4176 Identity = ConstantVector::getSplat(VF, Iden); 4177 4178 // This vector is the Identity vector where the first element is the 4179 // incoming scalar reduction. 4180 VectorStart = 4181 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 4182 } 4183 } 4184 4185 // Wrap flags are in general invalid after vectorization, clear them. 4186 clearReductionWrapFlags(RdxDesc); 4187 4188 // Fix the vector-loop phi. 4189 4190 // Reductions do not have to start at zero. They can start with 4191 // any loop invariant values. 4192 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4193 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4194 4195 for (unsigned Part = 0; Part < UF; ++Part) { 4196 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4197 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4198 // Make sure to add the reduction start value only to the 4199 // first unroll part. 4200 Value *StartVal = (Part == 0) ? VectorStart : Identity; 4201 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 4202 cast<PHINode>(VecRdxPhi) 4203 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4204 } 4205 4206 // Before each round, move the insertion point right between 4207 // the PHIs and the values we are going to write. 4208 // This allows us to write both PHINodes and the extractelement 4209 // instructions. 4210 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4211 4212 setDebugLocFromInst(Builder, LoopExitInst); 4213 4214 // If tail is folded by masking, the vector value to leave the loop should be 4215 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4216 // instead of the former. For an inloop reduction the reduction will already 4217 // be predicated, and does not need to be handled here. 4218 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4219 for (unsigned Part = 0; Part < UF; ++Part) { 4220 Value *VecLoopExitInst = 4221 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4222 Value *Sel = nullptr; 4223 for (User *U : VecLoopExitInst->users()) { 4224 if (isa<SelectInst>(U)) { 4225 assert(!Sel && "Reduction exit feeding two selects"); 4226 Sel = U; 4227 } else 4228 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4229 } 4230 assert(Sel && "Reduction exit feeds no select"); 4231 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4232 4233 // If the target can create a predicated operator for the reduction at no 4234 // extra cost in the loop (for example a predicated vadd), it can be 4235 // cheaper for the select to remain in the loop than be sunk out of it, 4236 // and so use the select value for the phi instead of the old 4237 // LoopExitValue. 
4238 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4239 if (PreferPredicatedReductionSelect || 4240 TTI->preferPredicatedReductionSelect( 4241 RdxDesc.getRecurrenceBinOp(), Phi->getType(), 4242 TargetTransformInfo::ReductionFlags())) { 4243 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4244 VecRdxPhi->setIncomingValueForBlock( 4245 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4246 } 4247 } 4248 } 4249 4250 // If the vector reduction can be performed in a smaller type, we truncate 4251 // then extend the loop exit value to enable InstCombine to evaluate the 4252 // entire expression in the smaller type. 4253 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4254 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4255 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4256 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4257 Builder.SetInsertPoint( 4258 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4259 VectorParts RdxParts(UF); 4260 for (unsigned Part = 0; Part < UF; ++Part) { 4261 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4262 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4263 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4264 : Builder.CreateZExt(Trunc, VecTy); 4265 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4266 UI != RdxParts[Part]->user_end();) 4267 if (*UI != Trunc) { 4268 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4269 RdxParts[Part] = Extnd; 4270 } else { 4271 ++UI; 4272 } 4273 } 4274 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4275 for (unsigned Part = 0; Part < UF; ++Part) { 4276 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4277 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4278 } 4279 } 4280 4281 // Reduce all of the unrolled parts into a single vector. 4282 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4283 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 4284 4285 // The middle block terminator has already been assigned a DebugLoc here (the 4286 // OrigLoop's single latch terminator). We want the whole middle block to 4287 // appear to execute on this line because: (a) it is all compiler generated, 4288 // (b) these instructions are always executed after evaluating the latch 4289 // conditional branch, and (c) other passes may add new predecessors which 4290 // terminate on this line. This is the easiest way to ensure we don't 4291 // accidentally cause an extra step back into the loop while debugging. 4292 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4293 for (unsigned Part = 1; Part < UF; ++Part) { 4294 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4295 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 4296 // Floating point operations had to be 'fast' to enable the reduction. 4297 ReducedPartRdx = addFastMathFlag( 4298 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 4299 ReducedPartRdx, "bin.rdx"), 4300 RdxDesc.getFastMathFlags()); 4301 else 4302 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 4303 RdxPart); 4304 } 4305 4306 // Create the reduction after the loop. Note that inloop reductions create the 4307 // target reduction in the loop using a Reduction recipe. 
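  // Shorthand of what the out-of-loop case emits for an integer add reduction
  // with VF = 4 and UF = 2 (a sketch; names are only illustrative):
  //
  //   middle.block:
  //     %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  //     %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)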
4308 if (VF.isVector() && !IsInLoopReductionPhi) { 4309 bool NoNaN = Legal->hasFunNoNaNAttr(); 4310 ReducedPartRdx = 4311 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 4312 // If the reduction can be performed in a smaller type, we need to extend 4313 // the reduction to the wider type before we branch to the original loop. 4314 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4315 ReducedPartRdx = 4316 RdxDesc.isSigned() 4317 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4318 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4319 } 4320 4321 // Create a phi node that merges control-flow from the backedge-taken check 4322 // block and the middle block. 4323 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4324 LoopScalarPreHeader->getTerminator()); 4325 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4326 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4327 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4328 4329 // Now, we need to fix the users of the reduction variable 4330 // inside and outside of the scalar remainder loop. 4331 // We know that the loop is in LCSSA form. We need to update the 4332 // PHI nodes in the exit blocks. 4333 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4334 // All PHINodes need to have a single entry edge, or two if 4335 // we already fixed them. 4336 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 4337 4338 // We found a reduction value exit-PHI. Update it with the 4339 // incoming bypass edge. 4340 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 4341 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4342 } // end of the LCSSA phi scan. 4343 4344 // Fix the scalar loop reduction variable with the incoming reduction sum 4345 // from the vector body and from the backedge value. 4346 int IncomingEdgeBlockIdx = 4347 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4348 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4349 // Pick the other block. 4350 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 4351 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4352 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4353 } 4354 4355 void InnerLoopVectorizer::clearReductionWrapFlags( 4356 RecurrenceDescriptor &RdxDesc) { 4357 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 4358 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 4359 RK != RecurrenceDescriptor::RK_IntegerMult) 4360 return; 4361 4362 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4363 assert(LoopExitInstr && "null loop exit instruction"); 4364 SmallVector<Instruction *, 8> Worklist; 4365 SmallPtrSet<Instruction *, 8> Visited; 4366 Worklist.push_back(LoopExitInstr); 4367 Visited.insert(LoopExitInstr); 4368 4369 while (!Worklist.empty()) { 4370 Instruction *Cur = Worklist.pop_back_val(); 4371 if (isa<OverflowingBinaryOperator>(Cur)) 4372 for (unsigned Part = 0; Part < UF; ++Part) { 4373 Value *V = getOrCreateVectorValue(Cur, Part); 4374 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4375 } 4376 4377 for (User *U : Cur->users()) { 4378 Instruction *UI = cast<Instruction>(U); 4379 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4380 Visited.insert(UI).second) 4381 Worklist.push_back(UI); 4382 } 4383 } 4384 } 4385 4386 void InnerLoopVectorizer::fixLCSSAPHIs() { 4387 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4388 if (LCSSAPhi.getNumIncomingValues() == 1) { 4389 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4390 // Non-instruction incoming values will have only one value. 4391 unsigned LastLane = 0; 4392 if (isa<Instruction>(IncomingValue)) 4393 LastLane = Cost->isUniformAfterVectorization( 4394 cast<Instruction>(IncomingValue), VF) 4395 ? 0 4396 : VF.getKnownMinValue() - 1; 4397 assert((!VF.isScalable() || LastLane == 0) && 4398 "scalable vectors dont support non-uniform scalars yet"); 4399 // Can be a loop invariant incoming value or the last scalar value to be 4400 // extracted from the vectorized loop. 4401 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4402 Value *lastIncomingValue = 4403 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4404 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4405 } 4406 } 4407 } 4408 4409 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4410 // The basic block and loop containing the predicated instruction. 4411 auto *PredBB = PredInst->getParent(); 4412 auto *VectorLoop = LI->getLoopFor(PredBB); 4413 4414 // Initialize a worklist with the operands of the predicated instruction. 4415 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4416 4417 // Holds instructions that we need to analyze again. An instruction may be 4418 // reanalyzed if we don't yet know if we can sink it or not. 4419 SmallVector<Instruction *, 8> InstsToReanalyze; 4420 4421 // Returns true if a given use occurs in the predicated block. Phi nodes use 4422 // their operands in their corresponding predecessor blocks. 4423 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4424 auto *I = cast<Instruction>(U.getUser()); 4425 BasicBlock *BB = I->getParent(); 4426 if (auto *Phi = dyn_cast<PHINode>(I)) 4427 BB = Phi->getIncomingBlock( 4428 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4429 return BB == PredBB; 4430 }; 4431 4432 // Iteratively sink the scalarized operands of the predicated instruction 4433 // into the block we created for it. When an instruction is sunk, it's 4434 // operands are then added to the worklist. 
The algorithm ends after one pass 4435 // through the worklist doesn't sink a single instruction. 4436 bool Changed; 4437 do { 4438 // Add the instructions that need to be reanalyzed to the worklist, and 4439 // reset the changed indicator. 4440 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4441 InstsToReanalyze.clear(); 4442 Changed = false; 4443 4444 while (!Worklist.empty()) { 4445 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4446 4447 // We can't sink an instruction if it is a phi node, is already in the 4448 // predicated block, is not in the loop, or may have side effects. 4449 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4450 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4451 continue; 4452 4453 // It's legal to sink the instruction if all its uses occur in the 4454 // predicated block. Otherwise, there's nothing to do yet, and we may 4455 // need to reanalyze the instruction. 4456 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4457 InstsToReanalyze.push_back(I); 4458 continue; 4459 } 4460 4461 // Move the instruction to the beginning of the predicated block, and add 4462 // it's operands to the worklist. 4463 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4464 Worklist.insert(I->op_begin(), I->op_end()); 4465 4466 // The sinking may have enabled other instructions to be sunk, so we will 4467 // need to iterate. 4468 Changed = true; 4469 } 4470 } while (Changed); 4471 } 4472 4473 void InnerLoopVectorizer::fixNonInductionPHIs() { 4474 for (PHINode *OrigPhi : OrigPHIsToFix) { 4475 PHINode *NewPhi = 4476 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4477 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4478 4479 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4480 predecessors(OrigPhi->getParent())); 4481 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4482 predecessors(NewPhi->getParent())); 4483 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4484 "Scalar and Vector BB should have the same number of predecessors"); 4485 4486 // The insertion point in Builder may be invalidated by the time we get 4487 // here. Force the Builder insertion point to something valid so that we do 4488 // not run into issues during insertion point restore in 4489 // getOrCreateVectorValue calls below. 4490 Builder.SetInsertPoint(NewPhi); 4491 4492 // The predecessor order is preserved and we can rely on mapping between 4493 // scalar and vector block predecessors. 4494 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4495 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4496 4497 // When looking up the new scalar/vector values to fix up, use incoming 4498 // values from original phi. 4499 Value *ScIncV = 4500 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4501 4502 // Scalar incoming value may need a broadcast 4503 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4504 NewPhi->addIncoming(NewIncV, NewPredBB); 4505 } 4506 } 4507 } 4508 4509 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4510 VPUser &Operands, unsigned UF, 4511 ElementCount VF, bool IsPtrLoopInvariant, 4512 SmallBitVector &IsIndexLoopInvariant, 4513 VPTransformState &State) { 4514 // Construct a vector GEP by widening the operands of the scalar GEP as 4515 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4516 // results in a vector of pointers when at least one operand of the GEP 4517 // is vector-typed. 
Thus, to keep the representation compact, we only use 4518 // vector-typed operands for loop-varying values. 4519 4520 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4521 // If we are vectorizing, but the GEP has only loop-invariant operands, 4522 // the GEP we build (by only using vector-typed operands for 4523 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4524 // produce a vector of pointers, we need to either arbitrarily pick an 4525 // operand to broadcast, or broadcast a clone of the original GEP. 4526 // Here, we broadcast a clone of the original. 4527 // 4528 // TODO: If at some point we decide to scalarize instructions having 4529 // loop-invariant operands, this special case will no longer be 4530 // required. We would add the scalarization decision to 4531 // collectLoopScalars() and teach getVectorValue() to broadcast 4532 // the lane-zero scalar value. 4533 auto *Clone = Builder.Insert(GEP->clone()); 4534 for (unsigned Part = 0; Part < UF; ++Part) { 4535 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4536 State.set(VPDef, GEP, EntryPart, Part); 4537 addMetadata(EntryPart, GEP); 4538 } 4539 } else { 4540 // If the GEP has at least one loop-varying operand, we are sure to 4541 // produce a vector of pointers. But if we are only unrolling, we want 4542 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4543 // produce with the code below will be scalar (if VF == 1) or vector 4544 // (otherwise). Note that for the unroll-only case, we still maintain 4545 // values in the vector mapping with initVector, as we do for other 4546 // instructions. 4547 for (unsigned Part = 0; Part < UF; ++Part) { 4548 // The pointer operand of the new GEP. If it's loop-invariant, we 4549 // won't broadcast it. 4550 auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0}) 4551 : State.get(Operands.getOperand(0), Part); 4552 4553 // Collect all the indices for the new GEP. If any index is 4554 // loop-invariant, we won't broadcast it. 4555 SmallVector<Value *, 4> Indices; 4556 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4557 VPValue *Operand = Operands.getOperand(I); 4558 if (IsIndexLoopInvariant[I - 1]) 4559 Indices.push_back(State.get(Operand, {0, 0})); 4560 else 4561 Indices.push_back(State.get(Operand, Part)); 4562 } 4563 4564 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4565 // but it should be a vector, otherwise. 4566 auto *NewGEP = 4567 GEP->isInBounds() 4568 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4569 Indices) 4570 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4571 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4572 "NewGEP is not a pointer vector"); 4573 State.set(VPDef, GEP, NewGEP, Part); 4574 addMetadata(NewGEP, GEP); 4575 } 4576 } 4577 } 4578 4579 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4580 ElementCount VF) { 4581 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4582 PHINode *P = cast<PHINode>(PN); 4583 if (EnableVPlanNativePath) { 4584 // Currently we enter here in the VPlan-native path for non-induction 4585 // PHIs where all control flow is uniform. We simply widen these PHIs. 4586 // Create a vector phi with no operands - the vector phi operands will be 4587 // set at the end of vector code generation. 4588 Type *VecTy = 4589 (VF.isScalar()) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4590 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4591 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4592 OrigPHIsToFix.push_back(P); 4593 4594 return; 4595 } 4596 4597 assert(PN->getParent() == OrigLoop->getHeader() && 4598 "Non-header phis should have been handled elsewhere"); 4599 4600 // In order to support recurrences we need to be able to vectorize Phi nodes. 4601 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4602 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4603 // this value when we vectorize all of the instructions that use the PHI. 4604 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4605 for (unsigned Part = 0; Part < UF; ++Part) { 4606 // This is phase one of vectorizing PHIs. 4607 bool ScalarPHI = 4608 (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4609 Type *VecTy = 4610 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF); 4611 Value *EntryPart = PHINode::Create( 4612 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4613 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4614 } 4615 return; 4616 } 4617 4618 setDebugLocFromInst(Builder, P); 4619 4620 // This PHINode must be an induction variable. 4621 // Make sure that we know about it. 4622 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4623 4624 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4625 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4626 4627 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4628 // which can be found from the original scalar operations. 4629 switch (II.getKind()) { 4630 case InductionDescriptor::IK_NoInduction: 4631 llvm_unreachable("Unknown induction"); 4632 case InductionDescriptor::IK_IntInduction: 4633 case InductionDescriptor::IK_FpInduction: 4634 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4635 case InductionDescriptor::IK_PtrInduction: { 4636 // Handle the pointer induction variable case. 4637 assert(P->getType()->isPointerTy() && "Unexpected type."); 4638 4639 if (Cost->isScalarAfterVectorization(P, VF)) { 4640 // This is the normalized GEP that starts counting at zero. 4641 Value *PtrInd = 4642 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4643 // Determine the number of scalars we need to generate for each unroll 4644 // iteration. If the instruction is uniform, we only need to generate the 4645 // first lane. Otherwise, we generate all VF values. 4646 unsigned Lanes = 4647 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4648 for (unsigned Part = 0; Part < UF; ++Part) { 4649 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4650 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4651 Lane + Part * VF.getKnownMinValue()); 4652 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4653 Value *SclrGep = 4654 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4655 SclrGep->setName("next.gep"); 4656 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4657 } 4658 } 4659 return; 4660 } 4661 assert(isa<SCEVConstant>(II.getStep()) && 4662 "Induction step not a SCEV constant!"); 4663 Type *PhiType = II.getStep()->getType(); 4664 4665 // Build a pointer phi 4666 Value *ScalarStartValue = II.getStartValue(); 4667 Type *ScStValueType = ScalarStartValue->getType(); 4668 PHINode *NewPointerPhi = 4669 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4670 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4671 4672 // A pointer induction, performed by using a gep 4673 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4674 Instruction *InductionLoc = LoopLatch->getTerminator(); 4675 const SCEV *ScalarStep = II.getStep(); 4676 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4677 Value *ScalarStepValue = 4678 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4679 Value *InductionGEP = GetElementPtrInst::Create( 4680 ScStValueType->getPointerElementType(), NewPointerPhi, 4681 Builder.CreateMul( 4682 ScalarStepValue, 4683 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4684 "ptr.ind", InductionLoc); 4685 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4686 4687 // Create UF many actual address geps that use the pointer 4688 // phi as base and a vectorized version of the step value 4689 // (<step*0, ..., step*N>) as offset. 4690 for (unsigned Part = 0; Part < UF; ++Part) { 4691 SmallVector<Constant *, 8> Indices; 4692 // Create a vector of consecutive numbers from zero to VF. 4693 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4694 Indices.push_back( 4695 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4696 Constant *StartOffset = ConstantVector::get(Indices); 4697 4698 Value *GEP = Builder.CreateGEP( 4699 ScStValueType->getPointerElementType(), NewPointerPhi, 4700 Builder.CreateMul( 4701 StartOffset, 4702 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4703 "vector.gep")); 4704 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4705 } 4706 } 4707 } 4708 } 4709 4710 /// A helper function for checking whether an integer division-related 4711 /// instruction may divide by zero (in which case it must be predicated if 4712 /// executed conditionally in the scalar code). 4713 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4714 /// Non-zero divisors that are non compile-time constants will not be 4715 /// converted into multiplication, so we will still end up scalarizing 4716 /// the division, but can do so w/o predication. 
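/// For example, in
///   if (c[i]) x[i] = a[i] / b[i];
/// a loop-varying divisor b[i] may be zero exactly on those iterations where
/// the guard is false, so the scalarized division must remain predicated,
/// whereas a non-zero constant divisor can be executed unconditionally.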
4717 static bool mayDivideByZero(Instruction &I) { 4718 assert((I.getOpcode() == Instruction::UDiv || 4719 I.getOpcode() == Instruction::SDiv || 4720 I.getOpcode() == Instruction::URem || 4721 I.getOpcode() == Instruction::SRem) && 4722 "Unexpected instruction"); 4723 Value *Divisor = I.getOperand(1); 4724 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4725 return !CInt || CInt->isZero(); 4726 } 4727 4728 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4729 VPUser &User, 4730 VPTransformState &State) { 4731 switch (I.getOpcode()) { 4732 case Instruction::Call: 4733 case Instruction::Br: 4734 case Instruction::PHI: 4735 case Instruction::GetElementPtr: 4736 case Instruction::Select: 4737 llvm_unreachable("This instruction is handled by a different recipe."); 4738 case Instruction::UDiv: 4739 case Instruction::SDiv: 4740 case Instruction::SRem: 4741 case Instruction::URem: 4742 case Instruction::Add: 4743 case Instruction::FAdd: 4744 case Instruction::Sub: 4745 case Instruction::FSub: 4746 case Instruction::FNeg: 4747 case Instruction::Mul: 4748 case Instruction::FMul: 4749 case Instruction::FDiv: 4750 case Instruction::FRem: 4751 case Instruction::Shl: 4752 case Instruction::LShr: 4753 case Instruction::AShr: 4754 case Instruction::And: 4755 case Instruction::Or: 4756 case Instruction::Xor: { 4757 // Just widen unops and binops. 4758 setDebugLocFromInst(Builder, &I); 4759 4760 for (unsigned Part = 0; Part < UF; ++Part) { 4761 SmallVector<Value *, 2> Ops; 4762 for (VPValue *VPOp : User.operands()) 4763 Ops.push_back(State.get(VPOp, Part)); 4764 4765 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4766 4767 if (auto *VecOp = dyn_cast<Instruction>(V)) 4768 VecOp->copyIRFlags(&I); 4769 4770 // Use this vector value for all users of the original instruction. 4771 State.set(Def, &I, V, Part); 4772 addMetadata(V, &I); 4773 } 4774 4775 break; 4776 } 4777 case Instruction::ICmp: 4778 case Instruction::FCmp: { 4779 // Widen compares. Generate vector compares. 4780 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4781 auto *Cmp = cast<CmpInst>(&I); 4782 setDebugLocFromInst(Builder, Cmp); 4783 for (unsigned Part = 0; Part < UF; ++Part) { 4784 Value *A = State.get(User.getOperand(0), Part); 4785 Value *B = State.get(User.getOperand(1), Part); 4786 Value *C = nullptr; 4787 if (FCmp) { 4788 // Propagate fast math flags. 4789 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4790 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4791 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4792 } else { 4793 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4794 } 4795 State.set(Def, &I, C, Part); 4796 addMetadata(C, &I); 4797 } 4798 4799 break; 4800 } 4801 4802 case Instruction::ZExt: 4803 case Instruction::SExt: 4804 case Instruction::FPToUI: 4805 case Instruction::FPToSI: 4806 case Instruction::FPExt: 4807 case Instruction::PtrToInt: 4808 case Instruction::IntToPtr: 4809 case Instruction::SIToFP: 4810 case Instruction::UIToFP: 4811 case Instruction::Trunc: 4812 case Instruction::FPTrunc: 4813 case Instruction::BitCast: { 4814 auto *CI = cast<CastInst>(&I); 4815 setDebugLocFromInst(Builder, CI); 4816 4817 /// Vectorize casts. 4818 Type *DestTy = 4819 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4820 4821 for (unsigned Part = 0; Part < UF; ++Part) { 4822 Value *A = State.get(User.getOperand(0), Part); 4823 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4824 State.set(Def, &I, Cast, Part); 4825 addMetadata(Cast, &I); 4826 } 4827 break; 4828 } 4829 default: 4830 // This instruction is not vectorized by simple widening. 4831 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4832 llvm_unreachable("Unhandled instruction!"); 4833 } // end of switch. 4834 } 4835 4836 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4837 VPUser &ArgOperands, 4838 VPTransformState &State) { 4839 assert(!isa<DbgInfoIntrinsic>(I) && 4840 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4841 setDebugLocFromInst(Builder, &I); 4842 4843 Module *M = I.getParent()->getParent()->getParent(); 4844 auto *CI = cast<CallInst>(&I); 4845 4846 SmallVector<Type *, 4> Tys; 4847 for (Value *ArgOperand : CI->arg_operands()) 4848 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4849 4850 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4851 4852 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4853 // version of the instruction. 4854 // Is it beneficial to perform intrinsic call compared to lib call? 4855 bool NeedToScalarize = false; 4856 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4857 bool UseVectorIntrinsic = 4858 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4859 assert((UseVectorIntrinsic || !NeedToScalarize) && 4860 "Instruction should be scalarized elsewhere."); 4861 4862 for (unsigned Part = 0; Part < UF; ++Part) { 4863 SmallVector<Value *, 4> Args; 4864 for (auto &I : enumerate(ArgOperands.operands())) { 4865 // Some intrinsics have a scalar argument - don't replace it with a 4866 // vector. 4867 Value *Arg; 4868 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4869 Arg = State.get(I.value(), Part); 4870 else 4871 Arg = State.get(I.value(), {0, 0}); 4872 Args.push_back(Arg); 4873 } 4874 4875 Function *VectorF; 4876 if (UseVectorIntrinsic) { 4877 // Use vector version of the intrinsic. 4878 Type *TysForDecl[] = {CI->getType()}; 4879 if (VF.isVector()) { 4880 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4881 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4882 } 4883 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4884 assert(VectorF && "Can't retrieve vector intrinsic."); 4885 } else { 4886 // Use vector version of the function call. 4887 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4888 #ifndef NDEBUG 4889 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4890 "Can't create vector function."); 4891 #endif 4892 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4893 } 4894 SmallVector<OperandBundleDef, 1> OpBundles; 4895 CI->getOperandBundlesAsDefs(OpBundles); 4896 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4897 4898 if (isa<FPMathOperator>(V)) 4899 V->copyFastMathFlags(CI); 4900 4901 State.set(Def, &I, V, Part); 4902 addMetadata(V, &I); 4903 } 4904 } 4905 4906 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4907 VPUser &Operands, 4908 bool InvariantCond, 4909 VPTransformState &State) { 4910 setDebugLocFromInst(Builder, &I); 4911 4912 // The condition can be loop invariant but still defined inside the 4913 // loop. 
This means that we can't just use the original 'cond' value. 4914 // We have to take the 'vectorized' value and pick the first lane. 4915 // Instcombine will make this a no-op. 4916 auto *InvarCond = 4917 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4918 4919 for (unsigned Part = 0; Part < UF; ++Part) { 4920 Value *Cond = 4921 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4922 Value *Op0 = State.get(Operands.getOperand(1), Part); 4923 Value *Op1 = State.get(Operands.getOperand(2), Part); 4924 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4925 State.set(VPDef, &I, Sel, Part); 4926 addMetadata(Sel, &I); 4927 } 4928 } 4929 4930 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4931 // We should not collect Scalars more than once per VF. Right now, this 4932 // function is called from collectUniformsAndScalars(), which already does 4933 // this check. Collecting Scalars for VF=1 does not make any sense. 4934 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4935 "This function should not be visited twice for the same VF"); 4936 4937 SmallSetVector<Instruction *, 8> Worklist; 4938 4939 // These sets are used to seed the analysis with pointers used by memory 4940 // accesses that will remain scalar. 4941 SmallSetVector<Instruction *, 8> ScalarPtrs; 4942 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4943 auto *Latch = TheLoop->getLoopLatch(); 4944 4945 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4946 // The pointer operands of loads and stores will be scalar as long as the 4947 // memory access is not a gather or scatter operation. The value operand of a 4948 // store will remain scalar if the store is scalarized. 4949 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4950 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4951 assert(WideningDecision != CM_Unknown && 4952 "Widening decision should be ready at this moment"); 4953 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4954 if (Ptr == Store->getValueOperand()) 4955 return WideningDecision == CM_Scalarize; 4956 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4957 "Ptr is neither a value or pointer operand"); 4958 return WideningDecision != CM_GatherScatter; 4959 }; 4960 4961 // A helper that returns true if the given value is a bitcast or 4962 // getelementptr instruction contained in the loop. 4963 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4964 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4965 isa<GetElementPtrInst>(V)) && 4966 !TheLoop->isLoopInvariant(V); 4967 }; 4968 4969 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 4970 if (!isa<PHINode>(Ptr) || 4971 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 4972 return false; 4973 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 4974 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 4975 return false; 4976 return isScalarUse(MemAccess, Ptr); 4977 }; 4978 4979 // A helper that evaluates a memory access's use of a pointer. If the 4980 // pointer is actually the pointer induction of a loop, it is being 4981 // inserted into Worklist. If the use will be a scalar use, and the 4982 // pointer is only used by memory accesses, we place the pointer in 4983 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
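  // For example (a sketch): for a store 'p[i] = x' whose address is a pointer
  // induction phi, the phi and its update are inserted into Worklist directly;
  // a loop-varying GEP with a scalar use that feeds only loads and stores ends
  // up in ScalarPtrs, and in PossibleNonScalarPtrs otherwise.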
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
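  // E.g., if a getelementptr already known to be scalar takes its pointer
  // operand from a loop-varying bitcast, and that bitcast is used only by
  // scalar memory accesses or by instructions already in the worklist, the
  // bitcast is added to the worklist as well.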
5059 unsigned Idx = 0; 5060 while (Idx != Worklist.size()) { 5061 Instruction *Dst = Worklist[Idx++]; 5062 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5063 continue; 5064 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5065 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5066 auto *J = cast<Instruction>(U); 5067 return !TheLoop->contains(J) || Worklist.count(J) || 5068 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5069 isScalarUse(J, Src)); 5070 })) { 5071 Worklist.insert(Src); 5072 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5073 } 5074 } 5075 5076 // An induction variable will remain scalar if all users of the induction 5077 // variable and induction variable update remain scalar. 5078 for (auto &Induction : Legal->getInductionVars()) { 5079 auto *Ind = Induction.first; 5080 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5081 5082 // If tail-folding is applied, the primary induction variable will be used 5083 // to feed a vector compare. 5084 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5085 continue; 5086 5087 // Determine if all users of the induction variable are scalar after 5088 // vectorization. 5089 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5090 auto *I = cast<Instruction>(U); 5091 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5092 }); 5093 if (!ScalarInd) 5094 continue; 5095 5096 // Determine if all users of the induction variable update instruction are 5097 // scalar after vectorization. 5098 auto ScalarIndUpdate = 5099 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5100 auto *I = cast<Instruction>(U); 5101 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5102 }); 5103 if (!ScalarIndUpdate) 5104 continue; 5105 5106 // The induction variable and its update instruction will remain scalar. 5107 Worklist.insert(Ind); 5108 Worklist.insert(IndUpdate); 5109 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5110 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5111 << "\n"); 5112 } 5113 5114 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5115 } 5116 5117 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5118 ElementCount VF) { 5119 if (!blockNeedsPredication(I->getParent())) 5120 return false; 5121 switch(I->getOpcode()) { 5122 default: 5123 break; 5124 case Instruction::Load: 5125 case Instruction::Store: { 5126 if (!Legal->isMaskRequired(I)) 5127 return false; 5128 auto *Ptr = getLoadStorePointerOperand(I); 5129 auto *Ty = getMemInstValueType(I); 5130 // We have already decided how to vectorize this instruction, get that 5131 // result. 5132 if (VF.isVector()) { 5133 InstWidening WideningDecision = getWideningDecision(I, VF); 5134 assert(WideningDecision != CM_Unknown && 5135 "Widening decision should be ready at this moment"); 5136 return WideningDecision == CM_Scalarize; 5137 } 5138 const Align Alignment = getLoadStoreAlignment(I); 5139 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5140 isLegalMaskedGather(Ty, Alignment)) 5141 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5142 isLegalMaskedScatter(Ty, Alignment)); 5143 } 5144 case Instruction::UDiv: 5145 case Instruction::SDiv: 5146 case Instruction::SRem: 5147 case Instruction::URem: 5148 return mayDivideByZero(*I); 5149 } 5150 return false; 5151 } 5152 5153 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5154 Instruction *I, ElementCount VF) { 5155 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5156 assert(getWideningDecision(I, VF) == CM_Unknown && 5157 "Decision should not be set yet."); 5158 auto *Group = getInterleavedAccessGroup(I); 5159 assert(Group && "Must have a group."); 5160 5161 // If the instruction's allocated size doesn't equal it's type size, it 5162 // requires padding and will be scalarized. 5163 auto &DL = I->getModule()->getDataLayout(); 5164 auto *ScalarTy = getMemInstValueType(I); 5165 if (hasIrregularType(ScalarTy, DL, VF)) 5166 return false; 5167 5168 // Check if masking is required. 5169 // A Group may need masking for one of two reasons: it resides in a block that 5170 // needs predication, or it was decided to use masking to deal with gaps. 5171 bool PredicatedAccessRequiresMasking = 5172 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5173 bool AccessWithGapsRequiresMasking = 5174 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5175 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5176 return true; 5177 5178 // If masked interleaving is required, we expect that the user/target had 5179 // enabled it, because otherwise it either wouldn't have been created or 5180 // it should have been invalidated by the CostModel. 5181 assert(useMaskedInterleavedAccesses(TTI) && 5182 "Masked interleave-groups for predicated accesses are not enabled."); 5183 5184 auto *Ty = getMemInstValueType(I); 5185 const Align Alignment = getLoadStoreAlignment(I); 5186 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5187 : TTI.isLegalMaskedStore(Ty, Alignment); 5188 } 5189 5190 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5191 Instruction *I, ElementCount VF) { 5192 // Get and ensure we have a valid memory instruction. 5193 LoadInst *LI = dyn_cast<LoadInst>(I); 5194 StoreInst *SI = dyn_cast<StoreInst>(I); 5195 assert((LI || SI) && "Invalid memory instruction"); 5196 5197 auto *Ptr = getLoadStorePointerOperand(I); 5198 5199 // In order to be widened, the pointer should be consecutive, first of all. 5200 if (!Legal->isConsecutivePtr(Ptr)) 5201 return false; 5202 5203 // If the instruction is a store located in a predicated block, it will be 5204 // scalarized. 5205 if (isScalarWithPredication(I)) 5206 return false; 5207 5208 // If the instruction's allocated size doesn't equal it's type size, it 5209 // requires padding and will be scalarized. 5210 auto &DL = I->getModule()->getDataLayout(); 5211 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5212 if (hasIrregularType(ScalarTy, DL, VF)) 5213 return false; 5214 5215 return true; 5216 } 5217 5218 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5219 // We should not collect Uniforms more than once per VF. Right now, 5220 // this function is called from collectUniformsAndScalars(), which 5221 // already does this check. Collecting Uniforms for VF=1 does not make any 5222 // sense. 
5223 5224 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5225 "This function should not be visited twice for the same VF"); 5226 5227 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5228 // not analyze again. Uniforms.count(VF) will return 1. 5229 Uniforms[VF].clear(); 5230 5231 // We now know that the loop is vectorizable! 5232 // Collect instructions inside the loop that will remain uniform after 5233 // vectorization. 5234 5235 // Global values, params and instructions outside of current loop are out of 5236 // scope. 5237 auto isOutOfScope = [&](Value *V) -> bool { 5238 Instruction *I = dyn_cast<Instruction>(V); 5239 return (!I || !TheLoop->contains(I)); 5240 }; 5241 5242 SetVector<Instruction *> Worklist; 5243 BasicBlock *Latch = TheLoop->getLoopLatch(); 5244 5245 // Instructions that are scalar with predication must not be considered 5246 // uniform after vectorization, because that would create an erroneous 5247 // replicating region where only a single instance out of VF should be formed. 5248 // TODO: optimize such seldom cases if found important, see PR40816. 5249 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5250 if (isOutOfScope(I)) { 5251 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5252 << *I << "\n"); 5253 return; 5254 } 5255 if (isScalarWithPredication(I, VF)) { 5256 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5257 << *I << "\n"); 5258 return; 5259 } 5260 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5261 Worklist.insert(I); 5262 }; 5263 5264 // Start with the conditional branch. If the branch condition is an 5265 // instruction contained in the loop that is only used by the branch, it is 5266 // uniform. 5267 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5268 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5269 addToWorklistIfAllowed(Cmp); 5270 5271 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5272 InstWidening WideningDecision = getWideningDecision(I, VF); 5273 assert(WideningDecision != CM_Unknown && 5274 "Widening decision should be ready at this moment"); 5275 5276 // A uniform memory op is itself uniform. We exclude uniform stores 5277 // here as they demand the last lane, not the first one. 5278 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5279 assert(WideningDecision == CM_Scalarize); 5280 return true; 5281 } 5282 5283 return (WideningDecision == CM_Widen || 5284 WideningDecision == CM_Widen_Reverse || 5285 WideningDecision == CM_Interleave); 5286 }; 5287 5288 5289 // Returns true if Ptr is the pointer operand of a memory access instruction 5290 // I, and I is known to not require scalarization. 5291 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5292 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5293 }; 5294 5295 // Holds a list of values which are known to have at least one uniform use. 5296 // Note that there may be other uses which aren't uniform. A "uniform use" 5297 // here is something which only demands lane 0 of the unrolled iterations; 5298 // it does not imply that all lanes produce the same value (e.g. this is not 5299 // the usual meaning of uniform) 5300 SmallPtrSet<Value *, 8> HasUniformUse; 5301 5302 // Scan the loop for instructions which are either a) known to have only 5303 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 
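  // For example (a sketch): a load from a loop-invariant address, 'v = *p',
  // demands only lane 0, and the address computation of a consecutive store
  // 'a[i] = v' has a use that demands only lane 0, even though the store
  // itself is widened.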
5304 for (auto *BB : TheLoop->blocks()) 5305 for (auto &I : *BB) { 5306 // If there's no pointer operand, there's nothing to do. 5307 auto *Ptr = getLoadStorePointerOperand(&I); 5308 if (!Ptr) 5309 continue; 5310 5311 // A uniform memory op is itself uniform. We exclude uniform stores 5312 // here as they demand the last lane, not the first one. 5313 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5314 addToWorklistIfAllowed(&I); 5315 5316 if (isUniformDecision(&I, VF)) { 5317 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5318 HasUniformUse.insert(Ptr); 5319 } 5320 } 5321 5322 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5323 // demanding) users. Since loops are assumed to be in LCSSA form, this 5324 // disallows uses outside the loop as well. 5325 for (auto *V : HasUniformUse) { 5326 if (isOutOfScope(V)) 5327 continue; 5328 auto *I = cast<Instruction>(V); 5329 auto UsersAreMemAccesses = 5330 llvm::all_of(I->users(), [&](User *U) -> bool { 5331 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5332 }); 5333 if (UsersAreMemAccesses) 5334 addToWorklistIfAllowed(I); 5335 } 5336 5337 // Expand Worklist in topological order: whenever a new instruction 5338 // is added , its users should be already inside Worklist. It ensures 5339 // a uniform instruction will only be used by uniform instructions. 5340 unsigned idx = 0; 5341 while (idx != Worklist.size()) { 5342 Instruction *I = Worklist[idx++]; 5343 5344 for (auto OV : I->operand_values()) { 5345 // isOutOfScope operands cannot be uniform instructions. 5346 if (isOutOfScope(OV)) 5347 continue; 5348 // First order recurrence Phi's should typically be considered 5349 // non-uniform. 5350 auto *OP = dyn_cast<PHINode>(OV); 5351 if (OP && Legal->isFirstOrderRecurrence(OP)) 5352 continue; 5353 // If all the users of the operand are uniform, then add the 5354 // operand into the uniform worklist. 5355 auto *OI = cast<Instruction>(OV); 5356 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5357 auto *J = cast<Instruction>(U); 5358 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5359 })) 5360 addToWorklistIfAllowed(OI); 5361 } 5362 } 5363 5364 // For an instruction to be added into Worklist above, all its users inside 5365 // the loop should also be in Worklist. However, this condition cannot be 5366 // true for phi nodes that form a cyclic dependence. We must process phi 5367 // nodes separately. An induction variable will remain uniform if all users 5368 // of the induction variable and induction variable update remain uniform. 5369 // The code below handles both pointer and non-pointer induction variables. 5370 for (auto &Induction : Legal->getInductionVars()) { 5371 auto *Ind = Induction.first; 5372 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5373 5374 // Determine if all users of the induction variable are uniform after 5375 // vectorization. 5376 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5377 auto *I = cast<Instruction>(U); 5378 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5379 isVectorizedMemAccessUse(I, Ind); 5380 }); 5381 if (!UniformInd) 5382 continue; 5383 5384 // Determine if all users of the induction variable update instruction are 5385 // uniform after vectorization. 
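    // E.g., in 'for (i = 0; i < n; ++i) a[i] = x;' the only in-loop users of
    // i are its own update and the address computation of the consecutive
    // store, so both i and the update i + 1 remain uniform: only lane 0 of
    // each unrolled part is needed.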
5386 auto UniformIndUpdate =
5387 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5388 auto *I = cast<Instruction>(U);
5389 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5390 isVectorizedMemAccessUse(I, IndUpdate);
5391 });
5392 if (!UniformIndUpdate)
5393 continue;
5394
5395 // The induction variable and its update instruction will remain uniform.
5396 addToWorklistIfAllowed(Ind);
5397 addToWorklistIfAllowed(IndUpdate);
5398 }
5399
5400 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5401 }
5402
5403 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5404 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5405
5406 if (Legal->getRuntimePointerChecking()->Need) {
5407 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5408 "runtime pointer checks needed. Enable vectorization of this "
5409 "loop with '#pragma clang loop vectorize(enable)' when "
5410 "compiling with -Os/-Oz",
5411 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5412 return true;
5413 }
5414
5415 if (!PSE.getUnionPredicate().getPredicates().empty()) {
5416 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5417 "runtime SCEV checks needed. Enable vectorization of this "
5418 "loop with '#pragma clang loop vectorize(enable)' when "
5419 "compiling with -Os/-Oz",
5420 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5421 return true;
5422 }
5423
5424 // FIXME: Avoid specializing for stride==1 instead of bailing out.
5425 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5426 reportVectorizationFailure("Runtime stride check for small trip count",
5427 "runtime stride == 1 checks needed. Enable vectorization of "
5428 "this loop without such check by compiling with -Os/-Oz",
5429 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5430 return true;
5431 }
5432
5433 return false;
5434 }
5435
5436 Optional<ElementCount>
5437 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5438 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5439 // TODO: It may be useful to do since it's still likely to be dynamically
5440 // uniform if the target can skip.
5441 reportVectorizationFailure(
5442 "Not inserting runtime ptr check for divergent target",
5443 "runtime pointer checks needed. Not enabled for divergent target",
5444 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5445 return None;
5446 }
5447
5448 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5449 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5450 if (TC == 1) {
5451 reportVectorizationFailure("Single iteration (non) loop",
5452 "loop trip count is one, irrelevant for vectorization",
5453 "SingleIterationLoop", ORE, TheLoop);
5454 return None;
5455 }
5456
5457 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5458
5459 switch (ScalarEpilogueStatus) {
5460 case CM_ScalarEpilogueAllowed:
5461 return MaxVF;
5462 case CM_ScalarEpilogueNotAllowedUsePredicate:
5463 LLVM_FALLTHROUGH;
5464 case CM_ScalarEpilogueNotNeededUsePredicate:
5465 LLVM_DEBUG(
5466 dbgs() << "LV: vector predicate hint/switch found.\n"
5467 << "LV: Not allowing scalar epilogue, creating predicated "
5468 << "vector loop.\n");
5469 break;
5470 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5471 // fallthrough as a special case of OptForSize
5472 case CM_ScalarEpilogueNotAllowedOptSize:
5473 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5474 LLVM_DEBUG(
5475 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5476 else
5477 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5478 << "count.\n");
5479
5480 // Bail if runtime checks are required, which are not good when optimising
5481 // for size.
5482 if (runtimeChecksRequired())
5483 return None;
5484 break;
5485 }
5486
5487 // Now try the tail folding.
5488
5489 // Invalidate interleave groups that require an epilogue if we can't mask
5490 // the interleave-group.
5491 if (!useMaskedInterleavedAccesses(TTI)) {
5492 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5493 "No decisions should have been taken at this point");
5494 // Note: There is no need to invalidate any cost modeling decisions here, as
5495 // none were taken so far.
5496 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5497 }
5498
5499 assert(!MaxVF.isScalable() &&
5500 "Scalable vectors do not yet support tail folding");
5501 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5502 "MaxVF must be a power of 2");
5503 unsigned MaxVFtimesIC =
5504 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5505 if (TC > 0 && TC % MaxVFtimesIC == 0) {
5506 // Accept MaxVF if we do not have a tail.
5507 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5508 return MaxVF;
5509 }
5510
5511 // If we don't know the precise trip count, or if the trip count that we
5512 // found modulo the vectorization factor is not zero, try to fold the tail
5513 // by masking.
5514 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5515 if (Legal->prepareToFoldTailByMasking()) {
5516 FoldTailByMasking = true;
5517 return MaxVF;
5518 }
5519
5520 // If there was a tail-folding hint/switch, but we can't fold the tail by
5521 // masking, fall back to a vectorization with a scalar epilogue.
5522 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5523 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5524 "scalar epilogue instead.\n"); 5525 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5526 return MaxVF; 5527 } 5528 5529 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5530 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5531 return None; 5532 } 5533 5534 if (TC == 0) { 5535 reportVectorizationFailure( 5536 "Unable to calculate the loop count due to complex control flow", 5537 "unable to calculate the loop count due to complex control flow", 5538 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5539 return None; 5540 } 5541 5542 reportVectorizationFailure( 5543 "Cannot optimize for size and vectorize at the same time.", 5544 "cannot optimize for size and vectorize at the same time. " 5545 "Enable vectorization of this loop with '#pragma clang loop " 5546 "vectorize(enable)' when compiling with -Os/-Oz", 5547 "NoTailLoopWithOptForSize", ORE, TheLoop); 5548 return None; 5549 } 5550 5551 ElementCount 5552 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5553 ElementCount UserVF) { 5554 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5555 unsigned SmallestType, WidestType; 5556 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5557 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5558 5559 // Get the maximum safe dependence distance in bits computed by LAA. 5560 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5561 // the memory accesses that is most restrictive (involved in the smallest 5562 // dependence distance). 5563 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5564 5565 if (UserVF.isNonZero()) { 5566 // For now, don't verify legality of scalable vectors. 5567 // This will be addressed properly in https://reviews.llvm.org/D91718. 5568 if (UserVF.isScalable()) 5569 return UserVF; 5570 5571 // If legally unsafe, clamp the user vectorization factor to a safe value. 5572 unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5573 if (UserVF.getFixedValue() <= MaxSafeVF) 5574 return UserVF; 5575 5576 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5577 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5578 << ".\n"); 5579 ORE->emit([&]() { 5580 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5581 TheLoop->getStartLoc(), 5582 TheLoop->getHeader()) 5583 << "User-specified vectorization factor " 5584 << ore::NV("UserVectorizationFactor", UserVF) 5585 << " is unsafe, clamping to maximum safe vectorization factor " 5586 << ore::NV("VectorizationFactor", MaxSafeVF); 5587 }); 5588 return ElementCount::getFixed(MaxSafeVF); 5589 } 5590 5591 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5592 5593 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5594 // Note that both WidestRegister and WidestType may not be a powers of 2. 
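// For example (illustrative numbers only): a 256-bit safe register width with
// a widest scalar type of 64 bits gives 256 / 64 = 4 lanes; if a dependence
// distance instead limited the safe width to 192 bits, 192 / 64 = 3 would be
// rounded down to PowerOf2Floor(3) = 2 lanes.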
5595 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5596 5597 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5598 << " / " << WidestType << " bits.\n"); 5599 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5600 << WidestRegister << " bits.\n"); 5601 5602 assert(MaxVectorSize <= WidestRegister && 5603 "Did not expect to pack so many elements" 5604 " into one vector!"); 5605 if (MaxVectorSize == 0) { 5606 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5607 MaxVectorSize = 1; 5608 return ElementCount::getFixed(MaxVectorSize); 5609 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5610 isPowerOf2_32(ConstTripCount)) { 5611 // We need to clamp the VF to be the ConstTripCount. There is no point in 5612 // choosing a higher viable VF as done in the loop below. 5613 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5614 << ConstTripCount << "\n"); 5615 MaxVectorSize = ConstTripCount; 5616 return ElementCount::getFixed(MaxVectorSize); 5617 } 5618 5619 unsigned MaxVF = MaxVectorSize; 5620 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5621 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5622 // Collect all viable vectorization factors larger than the default MaxVF 5623 // (i.e. MaxVectorSize). 5624 SmallVector<ElementCount, 8> VFs; 5625 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5626 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5627 VFs.push_back(ElementCount::getFixed(VS)); 5628 5629 // For each VF calculate its register usage. 5630 auto RUs = calculateRegisterUsage(VFs); 5631 5632 // Select the largest VF which doesn't require more registers than existing 5633 // ones. 5634 for (int i = RUs.size() - 1; i >= 0; --i) { 5635 bool Selected = true; 5636 for (auto& pair : RUs[i].MaxLocalUsers) { 5637 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5638 if (pair.second > TargetNumRegisters) 5639 Selected = false; 5640 } 5641 if (Selected) { 5642 MaxVF = VFs[i].getKnownMinValue(); 5643 break; 5644 } 5645 } 5646 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5647 if (MaxVF < MinVF) { 5648 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5649 << ") with target's minimum: " << MinVF << '\n'); 5650 MaxVF = MinVF; 5651 } 5652 } 5653 } 5654 return ElementCount::getFixed(MaxVF); 5655 } 5656 5657 VectorizationFactor 5658 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5659 // FIXME: This can be fixed for scalable vectors later, because at this stage 5660 // the LoopVectorizer will only consider vectorizing a loop with scalable 5661 // vectors when the loop has a hint to enable vectorization for a given VF. 5662 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5663 5664 float Cost = expectedCost(ElementCount::getFixed(1)).first; 5665 const float ScalarCost = Cost; 5666 unsigned Width = 1; 5667 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5668 5669 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5670 if (ForceVectorization && MaxVF.isVector()) { 5671 // Ignore scalar width, because the user explicitly wants vectorization. 5672 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5673 // evaluation. 
5674 Cost = std::numeric_limits<float>::max(); 5675 } 5676 5677 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5678 // Notice that the vector loop needs to be executed less times, so 5679 // we need to divide the cost of the vector loops by the width of 5680 // the vector elements. 5681 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5682 float VectorCost = C.first / (float)i; 5683 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5684 << " costs: " << (int)VectorCost << ".\n"); 5685 if (!C.second && !ForceVectorization) { 5686 LLVM_DEBUG( 5687 dbgs() << "LV: Not considering vector loop of width " << i 5688 << " because it will not generate any vector instructions.\n"); 5689 continue; 5690 } 5691 5692 // If profitable add it to ProfitableVF list. 5693 if (VectorCost < ScalarCost) { 5694 ProfitableVFs.push_back(VectorizationFactor( 5695 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5696 } 5697 5698 if (VectorCost < Cost) { 5699 Cost = VectorCost; 5700 Width = i; 5701 } 5702 } 5703 5704 if (!EnableCondStoresVectorization && NumPredStores) { 5705 reportVectorizationFailure("There are conditional stores.", 5706 "store that is conditionally executed prevents vectorization", 5707 "ConditionalStore", ORE, TheLoop); 5708 Width = 1; 5709 Cost = ScalarCost; 5710 } 5711 5712 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5713 << "LV: Vectorization seems to be not beneficial, " 5714 << "but was forced by a user.\n"); 5715 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5716 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5717 (unsigned)(Width * Cost)}; 5718 return Factor; 5719 } 5720 5721 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5722 const Loop &L, ElementCount VF) const { 5723 // Cross iteration phis such as reductions need special handling and are 5724 // currently unsupported. 5725 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5726 return Legal->isFirstOrderRecurrence(&Phi) || 5727 Legal->isReductionVariable(&Phi); 5728 })) 5729 return false; 5730 5731 // Phis with uses outside of the loop require special handling and are 5732 // currently unsupported. 5733 for (auto &Entry : Legal->getInductionVars()) { 5734 // Look for uses of the value of the induction at the last iteration. 5735 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5736 for (User *U : PostInc->users()) 5737 if (!L.contains(cast<Instruction>(U))) 5738 return false; 5739 // Look for uses of penultimate value of the induction. 5740 for (User *U : Entry.first->users()) 5741 if (!L.contains(cast<Instruction>(U))) 5742 return false; 5743 } 5744 5745 // Induction variables that are widened require special handling that is 5746 // currently not supported. 5747 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5748 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5749 this->isProfitableToScalarize(Entry.first, VF)); 5750 })) 5751 return false; 5752 5753 return true; 5754 } 5755 5756 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5757 const ElementCount VF) const { 5758 // FIXME: We need a much better cost-model to take different parameters such 5759 // as register pressure, code size increase and cost of extra branches into 5760 // account. For now we apply a very crude heuristic and only consider loops 5761 // with vectorization factors larger than a certain value. 
5762 // We also consider epilogue vectorization unprofitable for targets that don't 5763 // consider interleaving beneficial (eg. MVE). 5764 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5765 return false; 5766 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5767 return true; 5768 return false; 5769 } 5770 5771 VectorizationFactor 5772 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5773 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5774 VectorizationFactor Result = VectorizationFactor::Disabled(); 5775 if (!EnableEpilogueVectorization) { 5776 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5777 return Result; 5778 } 5779 5780 if (!isScalarEpilogueAllowed()) { 5781 LLVM_DEBUG( 5782 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5783 "allowed.\n";); 5784 return Result; 5785 } 5786 5787 // FIXME: This can be fixed for scalable vectors later, because at this stage 5788 // the LoopVectorizer will only consider vectorizing a loop with scalable 5789 // vectors when the loop has a hint to enable vectorization for a given VF. 5790 if (MainLoopVF.isScalable()) { 5791 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5792 "yet supported.\n"); 5793 return Result; 5794 } 5795 5796 // Not really a cost consideration, but check for unsupported cases here to 5797 // simplify the logic. 5798 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5799 LLVM_DEBUG( 5800 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5801 "not a supported candidate.\n";); 5802 return Result; 5803 } 5804 5805 if (EpilogueVectorizationForceVF > 1) { 5806 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5807 if (LVP.hasPlanWithVFs( 5808 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5809 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5810 else { 5811 LLVM_DEBUG( 5812 dbgs() 5813 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5814 return Result; 5815 } 5816 } 5817 5818 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5819 TheLoop->getHeader()->getParent()->hasMinSize()) { 5820 LLVM_DEBUG( 5821 dbgs() 5822 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5823 return Result; 5824 } 5825 5826 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5827 return Result; 5828 5829 for (auto &NextVF : ProfitableVFs) 5830 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5831 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5832 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5833 Result = NextVF; 5834 5835 if (Result != VectorizationFactor::Disabled()) 5836 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5837 << Result.Width.getFixedValue() << "\n";); 5838 return Result; 5839 } 5840 5841 std::pair<unsigned, unsigned> 5842 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5843 unsigned MinWidth = -1U; 5844 unsigned MaxWidth = 8; 5845 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5846 5847 // For each block. 5848 for (BasicBlock *BB : TheLoop->blocks()) { 5849 // For each instruction in the loop. 5850 for (Instruction &I : BB->instructionsWithoutDebug()) { 5851 Type *T = I.getType(); 5852 5853 // Skip ignored values. 5854 if (ValuesToIgnore.count(&I)) 5855 continue; 5856 5857 // Only examine Loads, Stores and PHINodes. 
5858 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5859 continue; 5860 5861 // Examine PHI nodes that are reduction variables. Update the type to 5862 // account for the recurrence type. 5863 if (auto *PN = dyn_cast<PHINode>(&I)) { 5864 if (!Legal->isReductionVariable(PN)) 5865 continue; 5866 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5867 T = RdxDesc.getRecurrenceType(); 5868 } 5869 5870 // Examine the stored values. 5871 if (auto *ST = dyn_cast<StoreInst>(&I)) 5872 T = ST->getValueOperand()->getType(); 5873 5874 // Ignore loaded pointer types and stored pointer types that are not 5875 // vectorizable. 5876 // 5877 // FIXME: The check here attempts to predict whether a load or store will 5878 // be vectorized. We only know this for certain after a VF has 5879 // been selected. Here, we assume that if an access can be 5880 // vectorized, it will be. We should also look at extending this 5881 // optimization to non-pointer types. 5882 // 5883 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5884 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5885 continue; 5886 5887 MinWidth = std::min(MinWidth, 5888 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5889 MaxWidth = std::max(MaxWidth, 5890 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5891 } 5892 } 5893 5894 return {MinWidth, MaxWidth}; 5895 } 5896 5897 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5898 unsigned LoopCost) { 5899 // -- The interleave heuristics -- 5900 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5901 // There are many micro-architectural considerations that we can't predict 5902 // at this level. For example, frontend pressure (on decode or fetch) due to 5903 // code size, or the number and capabilities of the execution ports. 5904 // 5905 // We use the following heuristics to select the interleave count: 5906 // 1. If the code has reductions, then we interleave to break the cross 5907 // iteration dependency. 5908 // 2. If the loop is really small, then we interleave to reduce the loop 5909 // overhead. 5910 // 3. We don't interleave if we think that we will spill registers to memory 5911 // due to the increased register pressure. 5912 5913 if (!isScalarEpilogueAllowed()) 5914 return 1; 5915 5916 // We used the distance for the interleave count. 5917 if (Legal->getMaxSafeDepDistBytes() != -1U) 5918 return 1; 5919 5920 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5921 const bool HasReductions = !Legal->getReductionVars().empty(); 5922 // Do not interleave loops with a relatively small known or estimated trip 5923 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5924 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5925 // because with the above conditions interleaving can expose ILP and break 5926 // cross iteration dependences for reductions. 5927 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5928 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5929 return 1; 5930 5931 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5932 // We divide by these constants so assume that we have at least one 5933 // instruction that uses at least one register. 5934 for (auto& pair : R.MaxLocalUsers) { 5935 pair.second = std::max(pair.second, 1U); 5936 } 5937 5938 // We calculate the interleave count using the following formula. 
5939 // Subtract the number of loop invariants from the number of available 5940 // registers. These registers are used by all of the interleaved instances. 5941 // Next, divide the remaining registers by the number of registers that is 5942 // required by the loop, in order to estimate how many parallel instances 5943 // fit without causing spills. All of this is rounded down if necessary to be 5944 // a power of two. We want power of two interleave count to simplify any 5945 // addressing operations or alignment considerations. 5946 // We also want power of two interleave counts to ensure that the induction 5947 // variable of the vector loop wraps to zero, when tail is folded by masking; 5948 // this currently happens when OptForSize, in which case IC is set to 1 above. 5949 unsigned IC = UINT_MAX; 5950 5951 for (auto& pair : R.MaxLocalUsers) { 5952 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5953 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5954 << " registers of " 5955 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5956 if (VF.isScalar()) { 5957 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5958 TargetNumRegisters = ForceTargetNumScalarRegs; 5959 } else { 5960 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5961 TargetNumRegisters = ForceTargetNumVectorRegs; 5962 } 5963 unsigned MaxLocalUsers = pair.second; 5964 unsigned LoopInvariantRegs = 0; 5965 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5966 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5967 5968 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5969 // Don't count the induction variable as interleaved. 5970 if (EnableIndVarRegisterHeur) { 5971 TmpIC = 5972 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5973 std::max(1U, (MaxLocalUsers - 1))); 5974 } 5975 5976 IC = std::min(IC, TmpIC); 5977 } 5978 5979 // Clamp the interleave ranges to reasonable counts. 5980 unsigned MaxInterleaveCount = 5981 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5982 5983 // Check if the user has overridden the max. 5984 if (VF.isScalar()) { 5985 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5986 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5987 } else { 5988 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5989 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5990 } 5991 5992 // If trip count is known or estimated compile time constant, limit the 5993 // interleave count to be less than the trip count divided by VF, provided it 5994 // is at least 1. 5995 // 5996 // For scalable vectors we can't know if interleaving is beneficial. It may 5997 // not be beneficial for small loops if none of the lanes in the second vector 5998 // iterations is enabled. However, for larger loops, there is likely to be a 5999 // similar benefit as for fixed-width vectors. For now, we choose to leave 6000 // the InterleaveCount as if vscale is '1', although if some information about 6001 // the vector is known (e.g. min vector size), we can make a better decision. 6002 if (BestKnownTC) { 6003 MaxInterleaveCount = 6004 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6005 // Make sure MaxInterleaveCount is greater than 0. 
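// As a worked example (purely illustrative numbers): with 32 registers in a
// class, 2 loop-invariant values and a maximum local usage of 6 registers, the
// basic formula above (before the induction-variable adjustment) gives
// PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4. If the best known trip
// count were 6 with VF = 4, the trip-count clamp above would reduce
// MaxInterleaveCount to 6 / 4 = 1, and the std::max below keeps it at least 1.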
6006 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6007 } 6008 6009 assert(MaxInterleaveCount > 0 && 6010 "Maximum interleave count must be greater than 0"); 6011 6012 // Clamp the calculated IC to be between the 1 and the max interleave count 6013 // that the target and trip count allows. 6014 if (IC > MaxInterleaveCount) 6015 IC = MaxInterleaveCount; 6016 else 6017 // Make sure IC is greater than 0. 6018 IC = std::max(1u, IC); 6019 6020 assert(IC > 0 && "Interleave count must be greater than 0."); 6021 6022 // If we did not calculate the cost for VF (because the user selected the VF) 6023 // then we calculate the cost of VF here. 6024 if (LoopCost == 0) 6025 LoopCost = expectedCost(VF).first; 6026 6027 assert(LoopCost && "Non-zero loop cost expected"); 6028 6029 // Interleave if we vectorized this loop and there is a reduction that could 6030 // benefit from interleaving. 6031 if (VF.isVector() && HasReductions) { 6032 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6033 return IC; 6034 } 6035 6036 // Note that if we've already vectorized the loop we will have done the 6037 // runtime check and so interleaving won't require further checks. 6038 bool InterleavingRequiresRuntimePointerCheck = 6039 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6040 6041 // We want to interleave small loops in order to reduce the loop overhead and 6042 // potentially expose ILP opportunities. 6043 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6044 << "LV: IC is " << IC << '\n' 6045 << "LV: VF is " << VF << '\n'); 6046 const bool AggressivelyInterleaveReductions = 6047 TTI.enableAggressiveInterleaving(HasReductions); 6048 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6049 // We assume that the cost overhead is 1 and we use the cost model 6050 // to estimate the cost of the loop and interleave until the cost of the 6051 // loop overhead is about 5% of the cost of the loop. 6052 unsigned SmallIC = 6053 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6054 6055 // Interleave until store/load ports (estimated by max interleave count) are 6056 // saturated. 6057 unsigned NumStores = Legal->getNumStores(); 6058 unsigned NumLoads = Legal->getNumLoads(); 6059 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6060 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6061 6062 // If we have a scalar reduction (vector reductions are already dealt with 6063 // by this point), we can increase the critical path length if the loop 6064 // we're interleaving is inside another loop. Limit, by default to 2, so the 6065 // critical path only gets increased by one reduction operation. 6066 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6067 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6068 SmallIC = std::min(SmallIC, F); 6069 StoresIC = std::min(StoresIC, F); 6070 LoadsIC = std::min(LoadsIC, F); 6071 } 6072 6073 if (EnableLoadStoreRuntimeInterleave && 6074 std::max(StoresIC, LoadsIC) > SmallIC) { 6075 LLVM_DEBUG( 6076 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6077 return std::max(StoresIC, LoadsIC); 6078 } 6079 6080 // If there are scalar reductions and TTI has enabled aggressive 6081 // interleaving for reductions, we will interleave to expose ILP. 
6082 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6083 AggressivelyInterleaveReductions) {
6084 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6085 // Interleave no less than SmallIC but not as aggressive as the normal IC
6086 // to satisfy the rare situation when resources are too limited.
6087 return std::max(IC / 2, SmallIC);
6088 } else {
6089 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6090 return SmallIC;
6091 }
6092 }
6093
6094 // Interleave if this is a large loop (small loops are already dealt with by
6095 // this point) that could benefit from interleaving.
6096 if (AggressivelyInterleaveReductions) {
6097 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6098 return IC;
6099 }
6100
6101 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6102 return 1;
6103 }
6104
6105 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6106 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6107 // This function calculates the register usage by measuring the highest number
6108 // of values that are alive at a single location. Obviously, this is a very
6109 // rough estimation. We scan the loop in topological order and
6110 // assign a number to each instruction. We use RPO to ensure that defs are
6111 // met before their users. We assume that each instruction that has in-loop
6112 // users starts an interval. We record every time that an in-loop value is
6113 // used, so we have a list of the first and last occurrences of each
6114 // instruction. Next, we transpose this data structure into a multi map that
6115 // holds the list of intervals that *end* at a specific location. This multi
6116 // map allows us to perform a linear search. We scan the instructions linearly
6117 // and record each time that a new interval starts, by placing it in a set.
6118 // If we find this value in the multi-map then we remove it from the set.
6119 // The max register usage is the maximum size of the set.
6120 // We also search for instructions that are defined outside the loop, but are
6121 // used inside the loop. We need this number separately from the max-interval
6122 // usage number because when we unroll, loop-invariant values do not take
6123 // more registers.
6124 LoopBlocksDFS DFS(TheLoop);
6125 DFS.perform(LI);
6126
6127 RegisterUsage RU;
6128
6129 // Each 'key' in the map opens a new interval. The values
6130 // of the map are the index of the 'last seen' usage of the
6131 // instruction that is the key.
6132 using IntervalMap = DenseMap<Instruction *, unsigned>;
6133
6134 // Maps instruction to its index.
6135 SmallVector<Instruction *, 64> IdxToInstr;
6136 // Marks the end of each interval.
6137 IntervalMap EndPoint;
6138 // Saves the list of instructions that are used in the loop.
6139 SmallPtrSet<Instruction *, 8> Ends;
6140 // Saves the list of values that are used in the loop but are
6141 // defined outside the loop, such as arguments and constants.
6142 SmallPtrSet<Value *, 8> LoopInvariants;
6143
6144 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6145 for (Instruction &I : BB->instructionsWithoutDebug()) {
6146 IdxToInstr.push_back(&I);
6147
6148 // Save the end location of each USE.
6149 for (Value *U : I.operands()) {
6150 auto *Instr = dyn_cast<Instruction>(U);
6151
6152 // Ignore non-instruction values such as arguments, constants, etc.
6153 if (!Instr)
6154 continue;
6155
6156 // If this instruction is outside the loop then record it and continue.
6157 if (!TheLoop->contains(Instr)) { 6158 LoopInvariants.insert(Instr); 6159 continue; 6160 } 6161 6162 // Overwrite previous end points. 6163 EndPoint[Instr] = IdxToInstr.size(); 6164 Ends.insert(Instr); 6165 } 6166 } 6167 } 6168 6169 // Saves the list of intervals that end with the index in 'key'. 6170 using InstrList = SmallVector<Instruction *, 2>; 6171 DenseMap<unsigned, InstrList> TransposeEnds; 6172 6173 // Transpose the EndPoints to a list of values that end at each index. 6174 for (auto &Interval : EndPoint) 6175 TransposeEnds[Interval.second].push_back(Interval.first); 6176 6177 SmallPtrSet<Instruction *, 8> OpenIntervals; 6178 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6179 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6180 6181 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6182 6183 // A lambda that gets the register usage for the given type and VF. 6184 const auto &TTICapture = TTI; 6185 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6186 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6187 return 0U; 6188 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6189 }; 6190 6191 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6192 Instruction *I = IdxToInstr[i]; 6193 6194 // Remove all of the instructions that end at this location. 6195 InstrList &List = TransposeEnds[i]; 6196 for (Instruction *ToRemove : List) 6197 OpenIntervals.erase(ToRemove); 6198 6199 // Ignore instructions that are never used within the loop. 6200 if (!Ends.count(I)) 6201 continue; 6202 6203 // Skip ignored values. 6204 if (ValuesToIgnore.count(I)) 6205 continue; 6206 6207 // For each VF find the maximum usage of registers. 6208 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6209 // Count the number of live intervals. 6210 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6211 6212 if (VFs[j].isScalar()) { 6213 for (auto Inst : OpenIntervals) { 6214 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6215 if (RegUsage.find(ClassID) == RegUsage.end()) 6216 RegUsage[ClassID] = 1; 6217 else 6218 RegUsage[ClassID] += 1; 6219 } 6220 } else { 6221 collectUniformsAndScalars(VFs[j]); 6222 for (auto Inst : OpenIntervals) { 6223 // Skip ignored values for VF > 1. 6224 if (VecValuesToIgnore.count(Inst)) 6225 continue; 6226 if (isScalarAfterVectorization(Inst, VFs[j])) { 6227 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6228 if (RegUsage.find(ClassID) == RegUsage.end()) 6229 RegUsage[ClassID] = 1; 6230 else 6231 RegUsage[ClassID] += 1; 6232 } else { 6233 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6234 if (RegUsage.find(ClassID) == RegUsage.end()) 6235 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6236 else 6237 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6238 } 6239 } 6240 } 6241 6242 for (auto& pair : RegUsage) { 6243 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6244 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6245 else 6246 MaxUsages[j][pair.first] = pair.second; 6247 } 6248 } 6249 6250 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6251 << OpenIntervals.size() << '\n'); 6252 6253 // Add the current instruction to the list of open intervals. 
6254 OpenIntervals.insert(I);
6255 }
6256
6257 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6258 SmallMapVector<unsigned, unsigned, 4> Invariant;
6259
6260 for (auto Inst : LoopInvariants) {
6261 unsigned Usage =
6262 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6263 unsigned ClassID =
6264 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6265 if (Invariant.find(ClassID) == Invariant.end())
6266 Invariant[ClassID] = Usage;
6267 else
6268 Invariant[ClassID] += Usage;
6269 }
6270
6271 LLVM_DEBUG({
6272 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6273 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6274 << " item\n";
6275 for (const auto &pair : MaxUsages[i]) {
6276 dbgs() << "LV(REG): RegisterClass: "
6277 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6278 << " registers\n";
6279 }
6280 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6281 << " item\n";
6282 for (const auto &pair : Invariant) {
6283 dbgs() << "LV(REG): RegisterClass: "
6284 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6285 << " registers\n";
6286 }
6287 });
6288
6289 RU.LoopInvariantRegs = Invariant;
6290 RU.MaxLocalUsers = MaxUsages[i];
6291 RUs[i] = RU;
6292 }
6293
6294 return RUs;
6295 }
6296
6297 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
6298 // TODO: Cost model for emulated masked load/store is completely
6299 // broken. This hack guides the cost model to use an artificially
6300 // high enough value to practically disable vectorization with such
6301 // operations, except where the previously deployed legality hack allowed
6302 // using very low cost values. This is to avoid regressions coming simply
6303 // from moving the "masked load/store" check from legality to cost model.
6304 // Masked Load/Gather emulation was previously never allowed.
6305 // Limited number of Masked Store/Scatter emulation was allowed.
6306 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6307 return isa<LoadInst>(I) ||
6308 (isa<StoreInst>(I) &&
6309 NumPredStores > NumberOfStoresToPredicate);
6310 }
6311
6312 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6313 // If we aren't vectorizing the loop, or if we've already collected the
6314 // instructions to scalarize, there's nothing to do. Collection may already
6315 // have occurred if we have a user-selected VF and are now computing the
6316 // expected cost for interleaving.
6317 if (VF.isScalar() || VF.isZero() ||
6318 InstsToScalarize.find(VF) != InstsToScalarize.end())
6319 return;
6320
6321 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6322 // not profitable to scalarize any instructions, the presence of VF in the
6323 // map will indicate that we've analyzed it already.
6324 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6325
6326 // Find all the instructions that are scalar with predication in the loop and
6327 // determine if it would be better to not if-convert the blocks they are in.
6328 // If so, we also record the instructions to scalarize.
6329 for (BasicBlock *BB : TheLoop->blocks()) {
6330 if (!blockNeedsPredication(BB))
6331 continue;
6332 for (Instruction &I : *BB)
6333 if (isScalarWithPredication(&I)) {
6334 ScalarCostsTy ScalarCosts;
6335 // Do not apply discount logic if hacked cost is needed
6336 // for emulated masked memrefs.
6337 if (!useEmulatedMaskMemRefHack(&I) && 6338 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6339 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6340 // Remember that BB will remain after vectorization. 6341 PredicatedBBsAfterVectorization.insert(BB); 6342 } 6343 } 6344 } 6345 6346 int LoopVectorizationCostModel::computePredInstDiscount( 6347 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 6348 ElementCount VF) { 6349 assert(!isUniformAfterVectorization(PredInst, VF) && 6350 "Instruction marked uniform-after-vectorization will be predicated"); 6351 6352 // Initialize the discount to zero, meaning that the scalar version and the 6353 // vector version cost the same. 6354 int Discount = 0; 6355 6356 // Holds instructions to analyze. The instructions we visit are mapped in 6357 // ScalarCosts. Those instructions are the ones that would be scalarized if 6358 // we find that the scalar version costs less. 6359 SmallVector<Instruction *, 8> Worklist; 6360 6361 // Returns true if the given instruction can be scalarized. 6362 auto canBeScalarized = [&](Instruction *I) -> bool { 6363 // We only attempt to scalarize instructions forming a single-use chain 6364 // from the original predicated block that would otherwise be vectorized. 6365 // Although not strictly necessary, we give up on instructions we know will 6366 // already be scalar to avoid traversing chains that are unlikely to be 6367 // beneficial. 6368 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6369 isScalarAfterVectorization(I, VF)) 6370 return false; 6371 6372 // If the instruction is scalar with predication, it will be analyzed 6373 // separately. We ignore it within the context of PredInst. 6374 if (isScalarWithPredication(I)) 6375 return false; 6376 6377 // If any of the instruction's operands are uniform after vectorization, 6378 // the instruction cannot be scalarized. This prevents, for example, a 6379 // masked load from being scalarized. 6380 // 6381 // We assume we will only emit a value for lane zero of an instruction 6382 // marked uniform after vectorization, rather than VF identical values. 6383 // Thus, if we scalarize an instruction that uses a uniform, we would 6384 // create uses of values corresponding to the lanes we aren't emitting code 6385 // for. This behavior can be changed by allowing getScalarValue to clone 6386 // the lane zero values for uniforms rather than asserting. 6387 for (Use &U : I->operands()) 6388 if (auto *J = dyn_cast<Instruction>(U.get())) 6389 if (isUniformAfterVectorization(J, VF)) 6390 return false; 6391 6392 // Otherwise, we can scalarize the instruction. 6393 return true; 6394 }; 6395 6396 // Compute the expected cost discount from scalarizing the entire expression 6397 // feeding the predicated instruction. We currently only consider expressions 6398 // that are single-use instruction chains. 6399 Worklist.push_back(PredInst); 6400 while (!Worklist.empty()) { 6401 Instruction *I = Worklist.pop_back_val(); 6402 6403 // If we've already analyzed the instruction, there's nothing to do. 6404 if (ScalarCosts.find(I) != ScalarCosts.end()) 6405 continue; 6406 6407 // Compute the cost of the vector instruction. Note that this cost already 6408 // includes the scalarization overhead of the predicated instruction. 6409 unsigned VectorCost = getInstructionCost(I, VF).first; 6410 6411 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6412 // the instruction as if it wasn't if-converted and instead remained in the 6413 // predicated block. We will scale this cost by block probability after 6414 // computing the scalarization overhead. 6415 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6416 unsigned ScalarCost = 6417 VF.getKnownMinValue() * 6418 getInstructionCost(I, ElementCount::getFixed(1)).first; 6419 6420 // Compute the scalarization overhead of needed insertelement instructions 6421 // and phi nodes. 6422 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6423 ScalarCost += TTI.getScalarizationOverhead( 6424 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6425 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6426 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6427 ScalarCost += 6428 VF.getKnownMinValue() * 6429 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6430 } 6431 6432 // Compute the scalarization overhead of needed extractelement 6433 // instructions. For each of the instruction's operands, if the operand can 6434 // be scalarized, add it to the worklist; otherwise, account for the 6435 // overhead. 6436 for (Use &U : I->operands()) 6437 if (auto *J = dyn_cast<Instruction>(U.get())) { 6438 assert(VectorType::isValidElementType(J->getType()) && 6439 "Instruction has non-scalar type"); 6440 if (canBeScalarized(J)) 6441 Worklist.push_back(J); 6442 else if (needsExtract(J, VF)) { 6443 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6444 ScalarCost += TTI.getScalarizationOverhead( 6445 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6446 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6447 } 6448 } 6449 6450 // Scale the total scalar cost by block probability. 6451 ScalarCost /= getReciprocalPredBlockProb(); 6452 6453 // Compute the discount. A non-negative discount means the vector version 6454 // of the instruction costs more, and scalarizing would be beneficial. 6455 Discount += VectorCost - ScalarCost; 6456 ScalarCosts[I] = ScalarCost; 6457 } 6458 6459 return Discount; 6460 } 6461 6462 LoopVectorizationCostModel::VectorizationCostTy 6463 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6464 VectorizationCostTy Cost; 6465 6466 // For each block. 6467 for (BasicBlock *BB : TheLoop->blocks()) { 6468 VectorizationCostTy BlockCost; 6469 6470 // For each instruction in the old loop. 6471 for (Instruction &I : BB->instructionsWithoutDebug()) { 6472 // Skip ignored values. 6473 if (ValuesToIgnore.count(&I) || 6474 (VF.isVector() && VecValuesToIgnore.count(&I))) 6475 continue; 6476 6477 VectorizationCostTy C = getInstructionCost(&I, VF); 6478 6479 // Check if we should override the cost. 6480 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6481 C.first = ForceTargetInstructionCost; 6482 6483 BlockCost.first += C.first; 6484 BlockCost.second |= C.second; 6485 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6486 << " for VF " << VF << " For instruction: " << I 6487 << '\n'); 6488 } 6489 6490 // If we are vectorizing a predicated block, it will have been 6491 // if-converted. This means that the block's instructions (aside from 6492 // stores and instructions that may divide by zero) will now be 6493 // unconditionally executed. For the scalar case, we may not always execute 6494 // the predicated block, if it is an if-else block. Thus, scale the block's 6495 // cost by the probability of executing it. 
blockNeedsPredication from 6496 // Legal is used so as to not include all blocks in tail folded loops. 6497 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6498 BlockCost.first /= getReciprocalPredBlockProb(); 6499 6500 Cost.first += BlockCost.first; 6501 Cost.second |= BlockCost.second; 6502 } 6503 6504 return Cost; 6505 } 6506 6507 /// Gets Address Access SCEV after verifying that the access pattern 6508 /// is loop invariant except the induction variable dependence. 6509 /// 6510 /// This SCEV can be sent to the Target in order to estimate the address 6511 /// calculation cost. 6512 static const SCEV *getAddressAccessSCEV( 6513 Value *Ptr, 6514 LoopVectorizationLegality *Legal, 6515 PredicatedScalarEvolution &PSE, 6516 const Loop *TheLoop) { 6517 6518 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6519 if (!Gep) 6520 return nullptr; 6521 6522 // We are looking for a gep with all loop invariant indices except for one 6523 // which should be an induction variable. 6524 auto SE = PSE.getSE(); 6525 unsigned NumOperands = Gep->getNumOperands(); 6526 for (unsigned i = 1; i < NumOperands; ++i) { 6527 Value *Opd = Gep->getOperand(i); 6528 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6529 !Legal->isInductionVariable(Opd)) 6530 return nullptr; 6531 } 6532 6533 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6534 return PSE.getSCEV(Ptr); 6535 } 6536 6537 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6538 return Legal->hasStride(I->getOperand(0)) || 6539 Legal->hasStride(I->getOperand(1)); 6540 } 6541 6542 unsigned 6543 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6544 ElementCount VF) { 6545 assert(VF.isVector() && 6546 "Scalarization cost of instruction implies vectorization."); 6547 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6548 Type *ValTy = getMemInstValueType(I); 6549 auto SE = PSE.getSE(); 6550 6551 unsigned AS = getLoadStoreAddressSpace(I); 6552 Value *Ptr = getLoadStorePointerOperand(I); 6553 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6554 6555 // Figure out whether the access is strided and get the stride value 6556 // if it's known in compile time 6557 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6558 6559 // Get the cost of the scalar memory instruction and address computation. 6560 unsigned Cost = 6561 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6562 6563 // Don't pass *I here, since it is scalar but will actually be part of a 6564 // vectorized loop where the user of it is a vectorized instruction. 6565 const Align Alignment = getLoadStoreAlignment(I); 6566 Cost += VF.getKnownMinValue() * 6567 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6568 AS, TTI::TCK_RecipThroughput); 6569 6570 // Get the overhead of the extractelement and insertelement instructions 6571 // we might create due to scalarization. 6572 Cost += getScalarizationOverhead(I, VF); 6573 6574 // If we have a predicated store, it may not be executed for each vector 6575 // lane. Scale the cost by the probability of executing the predicated 6576 // block. 6577 if (isPredicatedInst(I)) { 6578 Cost /= getReciprocalPredBlockProb(); 6579 6580 if (useEmulatedMaskMemRefHack(I)) 6581 // Artificially setting to a high enough value to practically disable 6582 // vectorization with such operations. 
6583 Cost = 3000000; 6584 } 6585 6586 return Cost; 6587 } 6588 6589 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6590 ElementCount VF) { 6591 Type *ValTy = getMemInstValueType(I); 6592 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6593 Value *Ptr = getLoadStorePointerOperand(I); 6594 unsigned AS = getLoadStoreAddressSpace(I); 6595 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6596 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6597 6598 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6599 "Stride should be 1 or -1 for consecutive memory access"); 6600 const Align Alignment = getLoadStoreAlignment(I); 6601 unsigned Cost = 0; 6602 if (Legal->isMaskRequired(I)) 6603 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6604 CostKind); 6605 else 6606 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6607 CostKind, I); 6608 6609 bool Reverse = ConsecutiveStride < 0; 6610 if (Reverse) 6611 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6612 return Cost; 6613 } 6614 6615 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6616 ElementCount VF) { 6617 assert(Legal->isUniformMemOp(*I)); 6618 6619 Type *ValTy = getMemInstValueType(I); 6620 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6621 const Align Alignment = getLoadStoreAlignment(I); 6622 unsigned AS = getLoadStoreAddressSpace(I); 6623 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6624 if (isa<LoadInst>(I)) { 6625 return TTI.getAddressComputationCost(ValTy) + 6626 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6627 CostKind) + 6628 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6629 } 6630 StoreInst *SI = cast<StoreInst>(I); 6631 6632 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6633 return TTI.getAddressComputationCost(ValTy) + 6634 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6635 CostKind) + 6636 (isLoopInvariantStoreValue 6637 ? 0 6638 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6639 VF.getKnownMinValue() - 1)); 6640 } 6641 6642 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6643 ElementCount VF) { 6644 Type *ValTy = getMemInstValueType(I); 6645 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6646 const Align Alignment = getLoadStoreAlignment(I); 6647 const Value *Ptr = getLoadStorePointerOperand(I); 6648 6649 return TTI.getAddressComputationCost(VectorTy) + 6650 TTI.getGatherScatterOpCost( 6651 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6652 TargetTransformInfo::TCK_RecipThroughput, I); 6653 } 6654 6655 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6656 ElementCount VF) { 6657 Type *ValTy = getMemInstValueType(I); 6658 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6659 unsigned AS = getLoadStoreAddressSpace(I); 6660 6661 auto Group = getInterleavedAccessGroup(I); 6662 assert(Group && "Fail to get an interleaved access group."); 6663 6664 unsigned InterleaveFactor = Group->getFactor(); 6665 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6666 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6667 6668 // Holds the indices of existing members in an interleaved load group. 6669 // An interleaved store group doesn't need this as it doesn't allow gaps. 
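// For example (illustrative): a factor-3 load group that accesses A[3*i] and
// A[3*i + 2] but never A[3*i + 1] has members at indices 0 and 2; index 1 is a
// gap, and no member is extracted for it from the wide load.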
6670 SmallVector<unsigned, 4> Indices; 6671 if (isa<LoadInst>(I)) { 6672 for (unsigned i = 0; i < InterleaveFactor; i++) 6673 if (Group->getMember(i)) 6674 Indices.push_back(i); 6675 } 6676 6677 // Calculate the cost of the whole interleaved group. 6678 bool UseMaskForGaps = 6679 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6680 unsigned Cost = TTI.getInterleavedMemoryOpCost( 6681 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6682 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6683 6684 if (Group->isReverse()) { 6685 // TODO: Add support for reversed masked interleaved access. 6686 assert(!Legal->isMaskRequired(I) && 6687 "Reverse masked interleaved access not supported."); 6688 Cost += Group->getNumMembers() * 6689 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6690 } 6691 return Cost; 6692 } 6693 6694 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6695 ElementCount VF) { 6696 // Calculate scalar cost only. Vectorization cost should be ready at this 6697 // moment. 6698 if (VF.isScalar()) { 6699 Type *ValTy = getMemInstValueType(I); 6700 const Align Alignment = getLoadStoreAlignment(I); 6701 unsigned AS = getLoadStoreAddressSpace(I); 6702 6703 return TTI.getAddressComputationCost(ValTy) + 6704 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6705 TTI::TCK_RecipThroughput, I); 6706 } 6707 return getWideningCost(I, VF); 6708 } 6709 6710 LoopVectorizationCostModel::VectorizationCostTy 6711 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6712 ElementCount VF) { 6713 // If we know that this instruction will remain uniform, check the cost of 6714 // the scalar version. 6715 if (isUniformAfterVectorization(I, VF)) 6716 VF = ElementCount::getFixed(1); 6717 6718 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6719 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6720 6721 // Forced scalars do not have any scalarization overhead. 6722 auto ForcedScalar = ForcedScalars.find(VF); 6723 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6724 auto InstSet = ForcedScalar->second; 6725 if (InstSet.count(I)) 6726 return VectorizationCostTy( 6727 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6728 VF.getKnownMinValue()), 6729 false); 6730 } 6731 6732 Type *VectorTy; 6733 unsigned C = getInstructionCost(I, VF, VectorTy); 6734 6735 bool TypeNotScalarized = 6736 VF.isVector() && VectorTy->isVectorTy() && 6737 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6738 return VectorizationCostTy(C, TypeNotScalarized); 6739 } 6740 6741 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6742 ElementCount VF) { 6743 6744 assert(!VF.isScalable() && 6745 "cannot compute scalarization overhead for scalable vectorization"); 6746 if (VF.isScalar()) 6747 return 0; 6748 6749 unsigned Cost = 0; 6750 Type *RetTy = ToVectorTy(I->getType(), VF); 6751 if (!RetTy->isVoidTy() && 6752 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6753 Cost += TTI.getScalarizationOverhead( 6754 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 6755 true, false); 6756 6757 // Some targets keep addresses scalar. 6758 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6759 return Cost; 6760 6761 // Some targets support efficient element stores. 6762 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6763 return Cost; 6764 6765 // Collect operands to consider. 
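// (For call instructions only the call arguments are considered; for any other
// instruction all of its operands are.)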
6766 CallInst *CI = dyn_cast<CallInst>(I); 6767 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6768 6769 // Skip operands that do not require extraction/scalarization and do not incur 6770 // any overhead. 6771 return Cost + TTI.getOperandsScalarizationOverhead( 6772 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 6773 } 6774 6775 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6776 if (VF.isScalar()) 6777 return; 6778 NumPredStores = 0; 6779 for (BasicBlock *BB : TheLoop->blocks()) { 6780 // For each instruction in the old loop. 6781 for (Instruction &I : *BB) { 6782 Value *Ptr = getLoadStorePointerOperand(&I); 6783 if (!Ptr) 6784 continue; 6785 6786 // TODO: We should generate better code and update the cost model for 6787 // predicated uniform stores. Today they are treated as any other 6788 // predicated store (see added test cases in 6789 // invariant-store-vectorization.ll). 6790 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6791 NumPredStores++; 6792 6793 if (Legal->isUniformMemOp(I)) { 6794 // TODO: Avoid replicating loads and stores instead of 6795 // relying on instcombine to remove them. 6796 // Load: Scalar load + broadcast 6797 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6798 unsigned Cost = getUniformMemOpCost(&I, VF); 6799 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6800 continue; 6801 } 6802 6803 // We assume that widening is the best solution when possible. 6804 if (memoryInstructionCanBeWidened(&I, VF)) { 6805 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6806 int ConsecutiveStride = 6807 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6808 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6809 "Expected consecutive stride."); 6810 InstWidening Decision = 6811 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6812 setWideningDecision(&I, VF, Decision, Cost); 6813 continue; 6814 } 6815 6816 // Choose between Interleaving, Gather/Scatter or Scalarization. 6817 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6818 unsigned NumAccesses = 1; 6819 if (isAccessInterleaved(&I)) { 6820 auto Group = getInterleavedAccessGroup(&I); 6821 assert(Group && "Fail to get an interleaved access group."); 6822 6823 // Make one decision for the whole group. 6824 if (getWideningDecision(&I, VF) != CM_Unknown) 6825 continue; 6826 6827 NumAccesses = Group->getNumMembers(); 6828 if (interleavedAccessCanBeWidened(&I, VF)) 6829 InterleaveCost = getInterleaveGroupCost(&I, VF); 6830 } 6831 6832 unsigned GatherScatterCost = 6833 isLegalGatherOrScatter(&I) 6834 ? getGatherScatterCost(&I, VF) * NumAccesses 6835 : std::numeric_limits<unsigned>::max(); 6836 6837 unsigned ScalarizationCost = 6838 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6839 6840 // Choose better solution for the current VF, 6841 // write down this decision and use it during vectorization. 6842 unsigned Cost; 6843 InstWidening Decision; 6844 if (InterleaveCost <= GatherScatterCost && 6845 InterleaveCost < ScalarizationCost) { 6846 Decision = CM_Interleave; 6847 Cost = InterleaveCost; 6848 } else if (GatherScatterCost < ScalarizationCost) { 6849 Decision = CM_GatherScatter; 6850 Cost = GatherScatterCost; 6851 } else { 6852 Decision = CM_Scalarize; 6853 Cost = ScalarizationCost; 6854 } 6855 // If the instructions belongs to an interleave group, the whole group 6856 // receives the same decision. 
The whole group receives the cost, but 6857 // the cost will actually be assigned to one instruction. 6858 if (auto Group = getInterleavedAccessGroup(&I)) 6859 setWideningDecision(Group, VF, Decision, Cost); 6860 else 6861 setWideningDecision(&I, VF, Decision, Cost); 6862 } 6863 } 6864 6865 // Make sure that any load of address and any other address computation 6866 // remains scalar unless there is gather/scatter support. This avoids 6867 // inevitable extracts into address registers, and also has the benefit of 6868 // activating LSR more, since that pass can't optimize vectorized 6869 // addresses. 6870 if (TTI.prefersVectorizedAddressing()) 6871 return; 6872 6873 // Start with all scalar pointer uses. 6874 SmallPtrSet<Instruction *, 8> AddrDefs; 6875 for (BasicBlock *BB : TheLoop->blocks()) 6876 for (Instruction &I : *BB) { 6877 Instruction *PtrDef = 6878 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6879 if (PtrDef && TheLoop->contains(PtrDef) && 6880 getWideningDecision(&I, VF) != CM_GatherScatter) 6881 AddrDefs.insert(PtrDef); 6882 } 6883 6884 // Add all instructions used to generate the addresses. 6885 SmallVector<Instruction *, 4> Worklist; 6886 for (auto *I : AddrDefs) 6887 Worklist.push_back(I); 6888 while (!Worklist.empty()) { 6889 Instruction *I = Worklist.pop_back_val(); 6890 for (auto &Op : I->operands()) 6891 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6892 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6893 AddrDefs.insert(InstOp).second) 6894 Worklist.push_back(InstOp); 6895 } 6896 6897 for (auto *I : AddrDefs) { 6898 if (isa<LoadInst>(I)) { 6899 // Setting the desired widening decision should ideally be handled in 6900 // by cost functions, but since this involves the task of finding out 6901 // if the loaded register is involved in an address computation, it is 6902 // instead changed here when we know this is the case. 6903 InstWidening Decision = getWideningDecision(I, VF); 6904 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6905 // Scalarize a widened load of address. 6906 setWideningDecision( 6907 I, VF, CM_Scalarize, 6908 (VF.getKnownMinValue() * 6909 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6910 else if (auto Group = getInterleavedAccessGroup(I)) { 6911 // Scalarize an interleave group of address loads. 6912 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6913 if (Instruction *Member = Group->getMember(I)) 6914 setWideningDecision( 6915 Member, VF, CM_Scalarize, 6916 (VF.getKnownMinValue() * 6917 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6918 } 6919 } 6920 } else 6921 // Make sure I gets scalarized and a cost estimate without 6922 // scalarization overhead. 6923 ForcedScalars[VF].insert(I); 6924 } 6925 } 6926 6927 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6928 ElementCount VF, 6929 Type *&VectorTy) { 6930 Type *RetTy = I->getType(); 6931 if (canTruncateToMinimalBitwidth(I, VF)) 6932 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6933 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6934 auto SE = PSE.getSE(); 6935 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6936 6937 // TODO: We need to estimate the cost of intrinsic calls. 6938 switch (I->getOpcode()) { 6939 case Instruction::GetElementPtr: 6940 // We mark this instruction as zero-cost because the cost of GEPs in 6941 // vectorized code depends on whether the corresponding memory instruction 6942 // is scalarized or not. 
Therefore, we handle GEPs with the memory 6943 // instruction cost. 6944 return 0; 6945 case Instruction::Br: { 6946 // In cases of scalarized and predicated instructions, there will be VF 6947 // predicated blocks in the vectorized loop. Each branch around these 6948 // blocks requires also an extract of its vector compare i1 element. 6949 bool ScalarPredicatedBB = false; 6950 BranchInst *BI = cast<BranchInst>(I); 6951 if (VF.isVector() && BI->isConditional() && 6952 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 6953 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 6954 ScalarPredicatedBB = true; 6955 6956 if (ScalarPredicatedBB) { 6957 // Return cost for branches around scalarized and predicated blocks. 6958 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6959 auto *Vec_i1Ty = 6960 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6961 return (TTI.getScalarizationOverhead( 6962 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6963 false, true) + 6964 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 6965 VF.getKnownMinValue())); 6966 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6967 // The back-edge branch will remain, as will all scalar branches. 6968 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6969 else 6970 // This branch will be eliminated by if-conversion. 6971 return 0; 6972 // Note: We currently assume zero cost for an unconditional branch inside 6973 // a predicated block since it will become a fall-through, although we 6974 // may decide in the future to call TTI for all branches. 6975 } 6976 case Instruction::PHI: { 6977 auto *Phi = cast<PHINode>(I); 6978 6979 // First-order recurrences are replaced by vector shuffles inside the loop. 6980 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6981 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6982 return TTI.getShuffleCost( 6983 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6984 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6985 6986 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6987 // converted into select instructions. We require N - 1 selects per phi 6988 // node, where N is the number of incoming values. 6989 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6990 return (Phi->getNumIncomingValues() - 1) * 6991 TTI.getCmpSelInstrCost( 6992 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6993 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6994 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6995 6996 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6997 } 6998 case Instruction::UDiv: 6999 case Instruction::SDiv: 7000 case Instruction::URem: 7001 case Instruction::SRem: 7002 // If we have a predicated instruction, it may not be executed for each 7003 // vector lane. Get the scalarization cost and scale this amount by the 7004 // probability of executing the predicated block. If the instruction is not 7005 // predicated, we fall through to the next case. 7006 if (VF.isVector() && isScalarWithPredication(I)) { 7007 unsigned Cost = 0; 7008 7009 // These instructions have a non-void type, so account for the phi nodes 7010 // that we will create. This cost is likely to be zero. The phi node 7011 // cost, if any, should be scaled by the block probability because it 7012 // models a copy at the end of each predicated block. 
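// Illustrative tally (actual numbers come from TTI): for VF = 4 this is
//   4 * cost(phi) + 4 * cost(div/rem) + insert/extract overhead,
// with the sum divided by the reciprocal predicated-block probability below.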
7013 Cost += VF.getKnownMinValue() * 7014 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7015 7016 // The cost of the non-predicated instruction. 7017 Cost += VF.getKnownMinValue() * 7018 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7019 7020 // The cost of insertelement and extractelement instructions needed for 7021 // scalarization. 7022 Cost += getScalarizationOverhead(I, VF); 7023 7024 // Scale the cost by the probability of executing the predicated blocks. 7025 // This assumes the predicated block for each vector lane is equally 7026 // likely. 7027 return Cost / getReciprocalPredBlockProb(); 7028 } 7029 LLVM_FALLTHROUGH; 7030 case Instruction::Add: 7031 case Instruction::FAdd: 7032 case Instruction::Sub: 7033 case Instruction::FSub: 7034 case Instruction::Mul: 7035 case Instruction::FMul: 7036 case Instruction::FDiv: 7037 case Instruction::FRem: 7038 case Instruction::Shl: 7039 case Instruction::LShr: 7040 case Instruction::AShr: 7041 case Instruction::And: 7042 case Instruction::Or: 7043 case Instruction::Xor: { 7044 // Since we will replace the stride by 1 the multiplication should go away. 7045 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7046 return 0; 7047 // Certain instructions can be cheaper to vectorize if they have a constant 7048 // second vector operand. One example of this are shifts on x86. 7049 Value *Op2 = I->getOperand(1); 7050 TargetTransformInfo::OperandValueProperties Op2VP; 7051 TargetTransformInfo::OperandValueKind Op2VK = 7052 TTI.getOperandInfo(Op2, Op2VP); 7053 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7054 Op2VK = TargetTransformInfo::OK_UniformValue; 7055 7056 SmallVector<const Value *, 4> Operands(I->operand_values()); 7057 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7058 return N * TTI.getArithmeticInstrCost( 7059 I->getOpcode(), VectorTy, CostKind, 7060 TargetTransformInfo::OK_AnyValue, 7061 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7062 } 7063 case Instruction::FNeg: { 7064 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7065 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7066 return N * TTI.getArithmeticInstrCost( 7067 I->getOpcode(), VectorTy, CostKind, 7068 TargetTransformInfo::OK_AnyValue, 7069 TargetTransformInfo::OK_AnyValue, 7070 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7071 I->getOperand(0), I); 7072 } 7073 case Instruction::Select: { 7074 SelectInst *SI = cast<SelectInst>(I); 7075 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7076 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7077 Type *CondTy = SI->getCondition()->getType(); 7078 if (!ScalarCond) { 7079 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7080 CondTy = VectorType::get(CondTy, VF); 7081 } 7082 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7083 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7084 } 7085 case Instruction::ICmp: 7086 case Instruction::FCmp: { 7087 Type *ValTy = I->getOperand(0)->getType(); 7088 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7089 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7090 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7091 VectorTy = ToVectorTy(ValTy, VF); 7092 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7093 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7094 } 7095 case Instruction::Store: 7096 case Instruction::Load: { 7097 ElementCount Width = VF; 7098 if (Width.isVector()) { 7099 InstWidening Decision = getWideningDecision(I, Width); 7100 assert(Decision != CM_Unknown && 7101 "CM decision should be taken at this point"); 7102 if (Decision == CM_Scalarize) 7103 Width = ElementCount::getFixed(1); 7104 } 7105 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7106 return getMemoryInstructionCost(I, VF); 7107 } 7108 case Instruction::ZExt: 7109 case Instruction::SExt: 7110 case Instruction::FPToUI: 7111 case Instruction::FPToSI: 7112 case Instruction::FPExt: 7113 case Instruction::PtrToInt: 7114 case Instruction::IntToPtr: 7115 case Instruction::SIToFP: 7116 case Instruction::UIToFP: 7117 case Instruction::Trunc: 7118 case Instruction::FPTrunc: 7119 case Instruction::BitCast: { 7120 // Computes the CastContextHint from a Load/Store instruction. 7121 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7122 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7123 "Expected a load or a store!"); 7124 7125 if (VF.isScalar() || !TheLoop->contains(I)) 7126 return TTI::CastContextHint::Normal; 7127 7128 switch (getWideningDecision(I, VF)) { 7129 case LoopVectorizationCostModel::CM_GatherScatter: 7130 return TTI::CastContextHint::GatherScatter; 7131 case LoopVectorizationCostModel::CM_Interleave: 7132 return TTI::CastContextHint::Interleave; 7133 case LoopVectorizationCostModel::CM_Scalarize: 7134 case LoopVectorizationCostModel::CM_Widen: 7135 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7136 : TTI::CastContextHint::Normal; 7137 case LoopVectorizationCostModel::CM_Widen_Reverse: 7138 return TTI::CastContextHint::Reversed; 7139 case LoopVectorizationCostModel::CM_Unknown: 7140 llvm_unreachable("Instr did not go through cost modelling?"); 7141 } 7142 7143 llvm_unreachable("Unhandled case!"); 7144 }; 7145 7146 unsigned Opcode = I->getOpcode(); 7147 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7148 // For Trunc, the context is the only user, which must be a StoreInst. 
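// e.g. (illustrative IR) a truncating store such as
//   %t = trunc i32 %x to i16
//   store i16 %t, i16* %p
// takes its cast-context hint from the store's widening decision.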
7149 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7150 if (I->hasOneUse()) 7151 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7152 CCH = ComputeCCH(Store); 7153 } 7154 // For Z/Sext, the context is the operand, which must be a LoadInst. 7155 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7156 Opcode == Instruction::FPExt) { 7157 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7158 CCH = ComputeCCH(Load); 7159 } 7160 7161 // We optimize the truncation of induction variables having constant 7162 // integer steps. The cost of these truncations is the same as the scalar 7163 // operation. 7164 if (isOptimizableIVTruncate(I, VF)) { 7165 auto *Trunc = cast<TruncInst>(I); 7166 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7167 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7168 } 7169 7170 Type *SrcScalarTy = I->getOperand(0)->getType(); 7171 Type *SrcVecTy = 7172 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7173 if (canTruncateToMinimalBitwidth(I, VF)) { 7174 // This cast is going to be shrunk. This may remove the cast or it might 7175 // turn it into slightly different cast. For example, if MinBW == 16, 7176 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7177 // 7178 // Calculate the modified src and dest types. 7179 Type *MinVecTy = VectorTy; 7180 if (Opcode == Instruction::Trunc) { 7181 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7182 VectorTy = 7183 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7184 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7185 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7186 VectorTy = 7187 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7188 } 7189 } 7190 7191 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7192 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7193 return N * 7194 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7195 } 7196 case Instruction::Call: { 7197 bool NeedToScalarize; 7198 CallInst *CI = cast<CallInst>(I); 7199 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7200 if (getVectorIntrinsicIDForCall(CI, TLI)) 7201 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 7202 return CallCost; 7203 } 7204 case Instruction::ExtractValue: { 7205 InstructionCost ExtractCost = 7206 TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7207 assert(ExtractCost.isValid() && "Invalid cost for ExtractValue"); 7208 return *(ExtractCost.getValue()); 7209 } 7210 default: 7211 // The cost of executing VF copies of the scalar instruction. This opcode 7212 // is unknown. Assume that it is the same as 'mul'. 7213 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7214 Instruction::Mul, VectorTy, CostKind) + 7215 getScalarizationOverhead(I, VF); 7216 } // end of switch. 
7217 } 7218 7219 char LoopVectorize::ID = 0; 7220 7221 static const char lv_name[] = "Loop Vectorization"; 7222 7223 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7224 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7225 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7226 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7227 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7228 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7229 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7230 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7231 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7232 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7233 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7234 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7235 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7236 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7237 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7238 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7239 7240 namespace llvm { 7241 7242 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7243 7244 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7245 bool VectorizeOnlyWhenForced) { 7246 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7247 } 7248 7249 } // end namespace llvm 7250 7251 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7252 // Check if the pointer operand of a load or store instruction is 7253 // consecutive. 7254 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7255 return Legal->isConsecutivePtr(Ptr); 7256 return false; 7257 } 7258 7259 void LoopVectorizationCostModel::collectValuesToIgnore() { 7260 // Ignore ephemeral values. 7261 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7262 7263 // Ignore type-promoting instructions we identified during reduction 7264 // detection. 7265 for (auto &Reduction : Legal->getReductionVars()) { 7266 RecurrenceDescriptor &RedDes = Reduction.second; 7267 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7268 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7269 } 7270 // Ignore type-casting instructions we identified during induction 7271 // detection. 7272 for (auto &Induction : Legal->getInductionVars()) { 7273 InductionDescriptor &IndDes = Induction.second; 7274 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7275 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7276 } 7277 } 7278 7279 void LoopVectorizationCostModel::collectInLoopReductions() { 7280 for (auto &Reduction : Legal->getReductionVars()) { 7281 PHINode *Phi = Reduction.first; 7282 RecurrenceDescriptor &RdxDesc = Reduction.second; 7283 7284 // We don't collect reductions that are type promoted (yet). 7285 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7286 continue; 7287 7288 // If the target would prefer this reduction to happen "in-loop", then we 7289 // want to record it as such. 7290 unsigned Opcode = RdxDesc.getRecurrenceBinOp(); 7291 if (!PreferInLoopReductions && 7292 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7293 TargetTransformInfo::ReductionFlags())) 7294 continue; 7295 7296 // Check that we can correctly put the reductions into the loop, by 7297 // finding the chain of operations that leads from the phi to the loop 7298 // exit value. 
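// For example (illustrative IR), an integer add reduction forms the chain
//   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
//   %sum.next = add i32 %sum, %val
// where %sum.next feeds both the phi and the value used after the loop.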
7299 SmallVector<Instruction *, 4> ReductionOperations = 7300 RdxDesc.getReductionOpChain(Phi, TheLoop); 7301 bool InLoop = !ReductionOperations.empty(); 7302 if (InLoop) 7303 InLoopReductionChains[Phi] = ReductionOperations; 7304 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7305 << " reduction for phi: " << *Phi << "\n"); 7306 } 7307 } 7308 7309 // TODO: we could return a pair of values that specify the max VF and 7310 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7311 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7312 // doesn't have a cost model that can choose which plan to execute if 7313 // more than one is generated. 7314 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7315 LoopVectorizationCostModel &CM) { 7316 unsigned WidestType; 7317 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7318 return WidestVectorRegBits / WidestType; 7319 } 7320 7321 VectorizationFactor 7322 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7323 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7324 ElementCount VF = UserVF; 7325 // Outer loop handling: They may require CFG and instruction level 7326 // transformations before even evaluating whether vectorization is profitable. 7327 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7328 // the vectorization pipeline. 7329 if (!OrigLoop->isInnermost()) { 7330 // If the user doesn't provide a vectorization factor, determine a 7331 // reasonable one. 7332 if (UserVF.isZero()) { 7333 VF = ElementCount::getFixed( 7334 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 7335 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7336 7337 // Make sure we have a VF > 1 for stress testing. 7338 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7339 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7340 << "overriding computed VF.\n"); 7341 VF = ElementCount::getFixed(4); 7342 } 7343 } 7344 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7345 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7346 "VF needs to be a power of two"); 7347 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7348 << "VF " << VF << " to build VPlans.\n"); 7349 buildVPlans(VF, VF); 7350 7351 // For VPlan build stress testing, we bail out after VPlan construction. 7352 if (VPlanBuildStressTest) 7353 return VectorizationFactor::Disabled(); 7354 7355 return {VF, 0 /*Cost*/}; 7356 } 7357 7358 LLVM_DEBUG( 7359 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7360 "VPlan-native path.\n"); 7361 return VectorizationFactor::Disabled(); 7362 } 7363 7364 Optional<VectorizationFactor> 7365 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7366 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7367 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7368 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7369 return None; 7370 7371 // Invalidate interleave groups if all blocks of loop will be predicated. 
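// (With fold-tail by masking every member access of an interleave group
// becomes a masked access, e.g. a group covering A[2*i] and A[2*i+1] would
// need a masked wide load plus shuffles, so the groups can only be kept when
// the target supports masked-interleaved accesses.)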
7372 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7373 !useMaskedInterleavedAccesses(*TTI)) { 7374 LLVM_DEBUG( 7375 dbgs() 7376 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7377 "which requires masked-interleaved support.\n"); 7378 if (CM.InterleaveInfo.invalidateGroups()) 7379 // Invalidating interleave groups also requires invalidating all decisions 7380 // based on them, which includes widening decisions and uniform and scalar 7381 // values. 7382 CM.invalidateCostModelingDecisions(); 7383 } 7384 7385 ElementCount MaxVF = MaybeMaxVF.getValue(); 7386 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7387 7388 if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) { 7389 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7390 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7391 "VF needs to be a power of two"); 7392 // Collect the instructions (and their associated costs) that will be more 7393 // profitable to scalarize. 7394 CM.selectUserVectorizationFactor(UserVF); 7395 CM.collectInLoopReductions(); 7396 buildVPlansWithVPRecipes(UserVF, UserVF); 7397 LLVM_DEBUG(printPlans(dbgs())); 7398 return {{UserVF, 0}}; 7399 } 7400 7401 assert(!MaxVF.isScalable() && 7402 "Scalable vectors not yet supported beyond this point"); 7403 7404 for (ElementCount VF = ElementCount::getFixed(1); 7405 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7406 // Collect Uniform and Scalar instructions after vectorization with VF. 7407 CM.collectUniformsAndScalars(VF); 7408 7409 // Collect the instructions (and their associated costs) that will be more 7410 // profitable to scalarize. 7411 if (VF.isVector()) 7412 CM.collectInstsToScalarize(VF); 7413 } 7414 7415 CM.collectInLoopReductions(); 7416 7417 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7418 LLVM_DEBUG(printPlans(dbgs())); 7419 if (MaxVF.isScalar()) 7420 return VectorizationFactor::Disabled(); 7421 7422 // Select the optimal vectorization factor. 7423 return CM.selectVectorizationFactor(MaxVF); 7424 } 7425 7426 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7427 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7428 << '\n'); 7429 BestVF = VF; 7430 BestUF = UF; 7431 7432 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7433 return !Plan->hasVF(VF); 7434 }); 7435 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7436 } 7437 7438 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7439 DominatorTree *DT) { 7440 // Perform the actual loop transformation. 7441 7442 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7443 VPCallbackILV CallbackILV(ILV); 7444 7445 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7446 7447 VPTransformState State{*BestVF, BestUF, LI, 7448 DT, ILV.Builder, ILV.VectorLoopValueMap, 7449 &ILV, CallbackILV}; 7450 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7451 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7452 State.CanonicalIV = ILV.Induction; 7453 7454 ILV.printDebugTracesAtStart(); 7455 7456 //===------------------------------------------------===// 7457 // 7458 // Notice: any optimization or new instruction that go 7459 // into the code below should also be implemented in 7460 // the cost-model. 7461 // 7462 //===------------------------------------------------===// 7463 7464 // 2. Copy and widen instructions from the old loop into the new loop. 
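// Only the VPlan selected by setBestPlan() remains at this point; executing
// it below emits the widened instructions into the skeleton built in step 1.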
7465 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7466 VPlans.front()->execute(&State); 7467 7468 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7469 // predication, updating analyses. 7470 ILV.fixVectorizedLoop(); 7471 7472 ILV.printDebugTracesAtEnd(); 7473 } 7474 7475 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7476 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7477 7478 // We create new control-flow for the vectorized loop, so the original exit 7479 // conditions will be dead after vectorization if it's only used by the 7480 // terminator 7481 SmallVector<BasicBlock*> ExitingBlocks; 7482 OrigLoop->getExitingBlocks(ExitingBlocks); 7483 for (auto *BB : ExitingBlocks) { 7484 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7485 if (!Cmp || !Cmp->hasOneUse()) 7486 continue; 7487 7488 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7489 if (!DeadInstructions.insert(Cmp).second) 7490 continue; 7491 7492 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7493 // TODO: can recurse through operands in general 7494 for (Value *Op : Cmp->operands()) { 7495 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7496 DeadInstructions.insert(cast<Instruction>(Op)); 7497 } 7498 } 7499 7500 // We create new "steps" for induction variable updates to which the original 7501 // induction variables map. An original update instruction will be dead if 7502 // all its users except the induction variable are dead. 7503 auto *Latch = OrigLoop->getLoopLatch(); 7504 for (auto &Induction : Legal->getInductionVars()) { 7505 PHINode *Ind = Induction.first; 7506 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7507 7508 // If the tail is to be folded by masking, the primary induction variable, 7509 // if exists, isn't dead: it will be used for masking. Don't kill it. 7510 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7511 continue; 7512 7513 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7514 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7515 })) 7516 DeadInstructions.insert(IndUpdate); 7517 7518 // We record as "Dead" also the type-casting instructions we had identified 7519 // during induction analysis. We don't need any handling for them in the 7520 // vectorized loop because we have proven that, under a proper runtime 7521 // test guarding the vectorized loop, the value of the phi, and the casted 7522 // value of the phi, are the same. The last instruction in this casting chain 7523 // will get its scalar/vector/widened def from the scalar/vector/widened def 7524 // of the respective phi node. Any other casts in the induction def-use chain 7525 // have no other uses outside the phi update chain, and will be ignored. 7526 InductionDescriptor &IndDes = Induction.second; 7527 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7528 DeadInstructions.insert(Casts.begin(), Casts.end()); 7529 } 7530 } 7531 7532 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7533 7534 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7535 7536 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7537 Instruction::BinaryOps BinOp) { 7538 // When unrolling and the VF is 1, we only need to add a simple scalar. 
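// i.e. the result is simply Val + StartIdx * Step, using the fast-math FP
// sequence below when the value is floating point.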
7539 Type *Ty = Val->getType(); 7540 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7541 7542 if (Ty->isFloatingPointTy()) { 7543 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7544 7545 // Floating point operations had to be 'fast' to enable the unrolling. 7546 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7547 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7548 } 7549 Constant *C = ConstantInt::get(Ty, StartIdx); 7550 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7551 } 7552 7553 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7554 SmallVector<Metadata *, 4> MDs; 7555 // Reserve first location for self reference to the LoopID metadata node. 7556 MDs.push_back(nullptr); 7557 bool IsUnrollMetadata = false; 7558 MDNode *LoopID = L->getLoopID(); 7559 if (LoopID) { 7560 // First find existing loop unrolling disable metadata. 7561 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7562 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7563 if (MD) { 7564 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7565 IsUnrollMetadata = 7566 S && S->getString().startswith("llvm.loop.unroll.disable"); 7567 } 7568 MDs.push_back(LoopID->getOperand(i)); 7569 } 7570 } 7571 7572 if (!IsUnrollMetadata) { 7573 // Add runtime unroll disable metadata. 7574 LLVMContext &Context = L->getHeader()->getContext(); 7575 SmallVector<Metadata *, 1> DisableOperands; 7576 DisableOperands.push_back( 7577 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7578 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7579 MDs.push_back(DisableNode); 7580 MDNode *NewLoopID = MDNode::get(Context, MDs); 7581 // Set operand 0 to refer to the loop id itself. 7582 NewLoopID->replaceOperandWith(0, NewLoopID); 7583 L->setLoopID(NewLoopID); 7584 } 7585 } 7586 7587 //===--------------------------------------------------------------------===// 7588 // EpilogueVectorizerMainLoop 7589 //===--------------------------------------------------------------------===// 7590 7591 /// This function is partially responsible for generating the control flow 7592 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7593 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7594 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7595 Loop *Lp = createVectorLoopSkeleton(""); 7596 7597 // Generate the code to check the minimum iteration count of the vector 7598 // epilogue (see below). 7599 EPI.EpilogueIterationCountCheck = 7600 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7601 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7602 7603 // Generate the code to check any assumptions that we've made for SCEV 7604 // expressions. 7605 BasicBlock *SavedPreHeader = LoopVectorPreHeader; 7606 emitSCEVChecks(Lp, LoopScalarPreHeader); 7607 7608 // If a safety check was generated save it. 7609 if (SavedPreHeader != LoopVectorPreHeader) 7610 EPI.SCEVSafetyCheck = SavedPreHeader; 7611 7612 // Generate the code that checks at runtime if arrays overlap. We put the 7613 // checks into a separate block to make the more common case of few elements 7614 // faster. 7615 SavedPreHeader = LoopVectorPreHeader; 7616 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7617 7618 // If a safety check was generated save/overwite it. 
7619 if (SavedPreHeader != LoopVectorPreHeader) 7620 EPI.MemSafetyCheck = SavedPreHeader; 7621 7622 // Generate the iteration count check for the main loop, *after* the check 7623 // for the epilogue loop, so that the path-length is shorter for the case 7624 // that goes directly through the vector epilogue. The longer-path length for 7625 // the main loop is compensated for, by the gain from vectorizing the larger 7626 // trip count. Note: the branch will get updated later on when we vectorize 7627 // the epilogue. 7628 EPI.MainLoopIterationCountCheck = 7629 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7630 7631 // Generate the induction variable. 7632 OldInduction = Legal->getPrimaryInduction(); 7633 Type *IdxTy = Legal->getWidestInductionType(); 7634 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7635 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7636 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7637 EPI.VectorTripCount = CountRoundDown; 7638 Induction = 7639 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7640 getDebugLocFromInstOrOperands(OldInduction)); 7641 7642 // Skip induction resume value creation here because they will be created in 7643 // the second pass. If we created them here, they wouldn't be used anyway, 7644 // because the vplan in the second pass still contains the inductions from the 7645 // original loop. 7646 7647 return completeLoopSkeleton(Lp, OrigLoopID); 7648 } 7649 7650 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7651 LLVM_DEBUG({ 7652 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7653 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7654 << ", Main Loop UF:" << EPI.MainLoopUF 7655 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7656 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7657 }); 7658 } 7659 7660 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7661 DEBUG_WITH_TYPE(VerboseDebug, { 7662 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 7663 }); 7664 } 7665 7666 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 7667 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 7668 assert(L && "Expected valid Loop."); 7669 assert(Bypass && "Expected valid bypass basic block."); 7670 unsigned VFactor = 7671 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 7672 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7673 Value *Count = getOrCreateTripCount(L); 7674 // Reuse existing vector loop preheader for TC checks. 7675 // Note that new preheader block is generated for vector loop. 7676 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7677 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7678 7679 // Generate code to check if the loop's trip count is less than VF * UF of the 7680 // main vector loop. 7681 auto P = 7682 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7683 7684 Value *CheckMinIters = Builder.CreateICmp( 7685 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7686 "min.iters.check"); 7687 7688 if (!ForEpilogue) 7689 TCCheckBlock->setName("vector.main.loop.iter.check"); 7690 7691 // Create new preheader for vector loop. 
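// (For illustration: with a main-loop VF of 8 and UF of 2, the compare above
// sends trip counts that cannot fill 16 iterations to the bypass block.)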
7692 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7693 DT, LI, nullptr, "vector.ph"); 7694 7695 if (ForEpilogue) { 7696 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7697 DT->getNode(Bypass)->getIDom()) && 7698 "TC check is expected to dominate Bypass"); 7699 7700 // Update dominator for Bypass & LoopExit. 7701 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7702 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7703 7704 LoopBypassBlocks.push_back(TCCheckBlock); 7705 7706 // Save the trip count so we don't have to regenerate it in the 7707 // vec.epilog.iter.check. This is safe to do because the trip count 7708 // generated here dominates the vector epilog iter check. 7709 EPI.TripCount = Count; 7710 } 7711 7712 ReplaceInstWithInst( 7713 TCCheckBlock->getTerminator(), 7714 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7715 7716 return TCCheckBlock; 7717 } 7718 7719 //===--------------------------------------------------------------------===// 7720 // EpilogueVectorizerEpilogueLoop 7721 //===--------------------------------------------------------------------===// 7722 7723 /// This function is partially responsible for generating the control flow 7724 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7725 BasicBlock * 7726 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7727 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7728 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 7729 7730 // Now, compare the remaining count and if there aren't enough iterations to 7731 // execute the vectorized epilogue skip to the scalar part. 7732 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7733 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7734 LoopVectorPreHeader = 7735 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7736 LI, nullptr, "vec.epilog.ph"); 7737 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 7738 VecEpilogueIterationCountCheck); 7739 7740 // Adjust the control flow taking the state info from the main loop 7741 // vectorization into account. 7742 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7743 "expected this to be saved from the previous pass."); 7744 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7745 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7746 7747 DT->changeImmediateDominator(LoopVectorPreHeader, 7748 EPI.MainLoopIterationCountCheck); 7749 7750 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7751 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7752 7753 if (EPI.SCEVSafetyCheck) 7754 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7755 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7756 if (EPI.MemSafetyCheck) 7757 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7758 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7759 7760 DT->changeImmediateDominator( 7761 VecEpilogueIterationCountCheck, 7762 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7763 7764 DT->changeImmediateDominator(LoopScalarPreHeader, 7765 EPI.EpilogueIterationCountCheck); 7766 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 7767 7768 // Keep track of bypass blocks, as they feed start values to the induction 7769 // phis in the scalar loop preheader. 
7770 if (EPI.SCEVSafetyCheck)
7771 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7772 if (EPI.MemSafetyCheck)
7773 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7774 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7775
7776 // Generate a resume induction for the vector epilogue and put it in the
7777 // vector epilogue preheader.
7778 Type *IdxTy = Legal->getWidestInductionType();
7779 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7780 LoopVectorPreHeader->getFirstNonPHI());
7781 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7782 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7783 EPI.MainLoopIterationCountCheck);
7784
7785 // Generate the induction variable.
7786 OldInduction = Legal->getPrimaryInduction();
7787 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7788 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7789 Value *StartIdx = EPResumeVal;
7790 Induction =
7791 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7792 getDebugLocFromInstOrOperands(OldInduction));
7793
7794 // Generate induction resume values. These variables save the new starting
7795 // indexes for the scalar loop. They are used to test if there are any tail
7796 // iterations left once the vector loop has completed.
7797 // Note that when the vectorized epilogue is skipped due to the iteration count
7798 // check, the resume value for the induction variable comes from
7799 // the trip count of the main vector loop, hence passing the AdditionalBypass
7800 // argument.
7801 createInductionResumeValues(Lp, CountRoundDown,
7802 {VecEpilogueIterationCountCheck,
7803 EPI.VectorTripCount} /* AdditionalBypass */);
7804
7805 AddRuntimeUnrollDisableMetaData(Lp);
7806 return completeLoopSkeleton(Lp, OrigLoopID);
7807 }
7808
7809 BasicBlock *
7810 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7811 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7812
7813 assert(EPI.TripCount &&
7814 "Expected trip count to have been saved in the first pass.");
7815 assert(
7816 (!isa<Instruction>(EPI.TripCount) ||
7817 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7818 "saved trip count does not dominate insertion point.");
7819 Value *TC = EPI.TripCount;
7820 IRBuilder<> Builder(Insert->getTerminator());
7821 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7822
7823 // Generate code to check if the loop's trip count is less than VF * UF of the
7824 // vector epilogue loop.
7825 auto P =
7826 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7827 7828 Value *CheckMinIters = Builder.CreateICmp( 7829 P, Count, 7830 ConstantInt::get(Count->getType(), 7831 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 7832 "min.epilog.iters.check"); 7833 7834 ReplaceInstWithInst( 7835 Insert->getTerminator(), 7836 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7837 7838 LoopBypassBlocks.push_back(Insert); 7839 return Insert; 7840 } 7841 7842 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7843 LLVM_DEBUG({ 7844 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7845 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7846 << ", Main Loop UF:" << EPI.MainLoopUF 7847 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7848 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7849 }); 7850 } 7851 7852 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7853 DEBUG_WITH_TYPE(VerboseDebug, { 7854 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 7855 }); 7856 } 7857 7858 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7859 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7860 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7861 bool PredicateAtRangeStart = Predicate(Range.Start); 7862 7863 for (ElementCount TmpVF = Range.Start * 2; 7864 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7865 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7866 Range.End = TmpVF; 7867 break; 7868 } 7869 7870 return PredicateAtRangeStart; 7871 } 7872 7873 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7874 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7875 /// of VF's starting at a given VF and extending it as much as possible. Each 7876 /// vectorization decision can potentially shorten this sub-range during 7877 /// buildVPlan(). 7878 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7879 ElementCount MaxVF) { 7880 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7881 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7882 VFRange SubRange = {VF, MaxVFPlusOne}; 7883 VPlans.push_back(buildVPlan(SubRange)); 7884 VF = SubRange.End; 7885 } 7886 } 7887 7888 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7889 VPlanPtr &Plan) { 7890 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7891 7892 // Look for cached value. 7893 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7894 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7895 if (ECEntryIt != EdgeMaskCache.end()) 7896 return ECEntryIt->second; 7897 7898 VPValue *SrcMask = createBlockInMask(Src, Plan); 7899 7900 // The terminator has to be a branch inst! 7901 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7902 assert(BI && "Unexpected terminator found"); 7903 7904 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7905 return EdgeMaskCache[Edge] = SrcMask; 7906 7907 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 7908 assert(EdgeMask && "No Edge Mask found for condition"); 7909 7910 if (BI->getSuccessor(0) != Dst) 7911 EdgeMask = Builder.createNot(EdgeMask); 7912 7913 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 
7914 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 7915 7916 return EdgeMaskCache[Edge] = EdgeMask; 7917 } 7918 7919 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 7920 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 7921 7922 // Look for cached value. 7923 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 7924 if (BCEntryIt != BlockMaskCache.end()) 7925 return BCEntryIt->second; 7926 7927 // All-one mask is modelled as no-mask following the convention for masked 7928 // load/store/gather/scatter. Initialize BlockMask to no-mask. 7929 VPValue *BlockMask = nullptr; 7930 7931 if (OrigLoop->getHeader() == BB) { 7932 if (!CM.blockNeedsPredication(BB)) 7933 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 7934 7935 // Create the block in mask as the first non-phi instruction in the block. 7936 VPBuilder::InsertPointGuard Guard(Builder); 7937 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 7938 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 7939 7940 // Introduce the early-exit compare IV <= BTC to form header block mask. 7941 // This is used instead of IV < TC because TC may wrap, unlike BTC. 7942 // Start by constructing the desired canonical IV. 7943 VPValue *IV = nullptr; 7944 if (Legal->getPrimaryInduction()) 7945 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 7946 else { 7947 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 7948 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 7949 IV = IVRecipe->getVPValue(); 7950 } 7951 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 7952 bool TailFolded = !CM.isScalarEpilogueAllowed(); 7953 7954 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 7955 // While ActiveLaneMask is a binary op that consumes the loop tripcount 7956 // as a second argument, we only pass the IV here and extract the 7957 // tripcount from the transform state where codegen of the VP instructions 7958 // happen. 7959 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 7960 } else { 7961 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 7962 } 7963 return BlockMaskCache[BB] = BlockMask; 7964 } 7965 7966 // This is the block mask. We OR all incoming edges. 7967 for (auto *Predecessor : predecessors(BB)) { 7968 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 7969 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 7970 return BlockMaskCache[BB] = EdgeMask; 7971 7972 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
7973 BlockMask = EdgeMask; 7974 continue; 7975 } 7976 7977 BlockMask = Builder.createOr(BlockMask, EdgeMask); 7978 } 7979 7980 return BlockMaskCache[BB] = BlockMask; 7981 } 7982 7983 VPWidenMemoryInstructionRecipe * 7984 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 7985 VPlanPtr &Plan) { 7986 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7987 "Must be called with either a load or store"); 7988 7989 auto willWiden = [&](ElementCount VF) -> bool { 7990 if (VF.isScalar()) 7991 return false; 7992 LoopVectorizationCostModel::InstWidening Decision = 7993 CM.getWideningDecision(I, VF); 7994 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 7995 "CM decision should be taken at this point."); 7996 if (Decision == LoopVectorizationCostModel::CM_Interleave) 7997 return true; 7998 if (CM.isScalarAfterVectorization(I, VF) || 7999 CM.isProfitableToScalarize(I, VF)) 8000 return false; 8001 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8002 }; 8003 8004 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8005 return nullptr; 8006 8007 VPValue *Mask = nullptr; 8008 if (Legal->isMaskRequired(I)) 8009 Mask = createBlockInMask(I->getParent(), Plan); 8010 8011 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8012 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8013 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8014 8015 StoreInst *Store = cast<StoreInst>(I); 8016 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8017 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8018 } 8019 8020 VPWidenIntOrFpInductionRecipe * 8021 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const { 8022 // Check if this is an integer or fp induction. If so, build the recipe that 8023 // produces its scalar and vector values. 8024 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8025 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8026 II.getKind() == InductionDescriptor::IK_FpInduction) 8027 return new VPWidenIntOrFpInductionRecipe(Phi); 8028 8029 return nullptr; 8030 } 8031 8032 VPWidenIntOrFpInductionRecipe * 8033 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, 8034 VFRange &Range) const { 8035 // Optimize the special case where the source is a constant integer 8036 // induction variable. Notice that we can only optimize the 'trunc' case 8037 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8038 // (c) other casts depend on pointer size. 8039 8040 // Determine whether \p K is a truncation based on an induction variable that 8041 // can be optimized. 8042 auto isOptimizableIVTruncate = 8043 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8044 return [=](ElementCount VF) -> bool { 8045 return CM.isOptimizableIVTruncate(K, VF); 8046 }; 8047 }; 8048 8049 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8050 isOptimizableIVTruncate(I), Range)) 8051 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8052 I); 8053 return nullptr; 8054 } 8055 8056 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8057 // We know that all PHIs in non-header blocks are converted into selects, so 8058 // we don't have to worry about the insertion order and we can just use the 8059 // builder. At this point we generate the predication tree. There may be 8060 // duplications since this is a simple recursive scan, but future 8061 // optimizations will clean it up. 
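// For example (illustrative IR), a two-way merge such as
//   %r = phi i32 [ %a, %then ], [ %b, %else ]
// becomes a blend of %a and %b guarded by the corresponding edge masks,
// which lowers to selects in the vectorized loop.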
8062 8063 SmallVector<VPValue *, 2> Operands; 8064 unsigned NumIncoming = Phi->getNumIncomingValues(); 8065 for (unsigned In = 0; In < NumIncoming; In++) { 8066 VPValue *EdgeMask = 8067 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8068 assert((EdgeMask || NumIncoming == 1) && 8069 "Multiple predecessors with one having a full mask"); 8070 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8071 if (EdgeMask) 8072 Operands.push_back(EdgeMask); 8073 } 8074 return new VPBlendRecipe(Phi, Operands); 8075 } 8076 8077 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8078 VPlan &Plan) const { 8079 8080 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8081 [this, CI](ElementCount VF) { 8082 return CM.isScalarWithPredication(CI, VF); 8083 }, 8084 Range); 8085 8086 if (IsPredicated) 8087 return nullptr; 8088 8089 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8090 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8091 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8092 ID == Intrinsic::pseudoprobe)) 8093 return nullptr; 8094 8095 auto willWiden = [&](ElementCount VF) -> bool { 8096 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8097 // The following case may be scalarized depending on the VF. 8098 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8099 // version of the instruction. 8100 // Is it beneficial to perform intrinsic call compared to lib call? 8101 bool NeedToScalarize = false; 8102 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8103 bool UseVectorIntrinsic = 8104 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 8105 return UseVectorIntrinsic || !NeedToScalarize; 8106 }; 8107 8108 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8109 return nullptr; 8110 8111 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8112 } 8113 8114 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8115 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8116 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8117 // Instruction should be widened, unless it is scalar after vectorization, 8118 // scalarization is profitable or it is predicated. 
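// Note that getDecisionAndClampRange also shrinks the VF sub-range at the
// first VF where this answer changes, so one recipe decision covers the
// whole remaining sub-range.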
8119 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8120 return CM.isScalarAfterVectorization(I, VF) || 8121 CM.isProfitableToScalarize(I, VF) || 8122 CM.isScalarWithPredication(I, VF); 8123 }; 8124 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8125 Range); 8126 } 8127 8128 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8129 auto IsVectorizableOpcode = [](unsigned Opcode) { 8130 switch (Opcode) { 8131 case Instruction::Add: 8132 case Instruction::And: 8133 case Instruction::AShr: 8134 case Instruction::BitCast: 8135 case Instruction::FAdd: 8136 case Instruction::FCmp: 8137 case Instruction::FDiv: 8138 case Instruction::FMul: 8139 case Instruction::FNeg: 8140 case Instruction::FPExt: 8141 case Instruction::FPToSI: 8142 case Instruction::FPToUI: 8143 case Instruction::FPTrunc: 8144 case Instruction::FRem: 8145 case Instruction::FSub: 8146 case Instruction::ICmp: 8147 case Instruction::IntToPtr: 8148 case Instruction::LShr: 8149 case Instruction::Mul: 8150 case Instruction::Or: 8151 case Instruction::PtrToInt: 8152 case Instruction::SDiv: 8153 case Instruction::Select: 8154 case Instruction::SExt: 8155 case Instruction::Shl: 8156 case Instruction::SIToFP: 8157 case Instruction::SRem: 8158 case Instruction::Sub: 8159 case Instruction::Trunc: 8160 case Instruction::UDiv: 8161 case Instruction::UIToFP: 8162 case Instruction::URem: 8163 case Instruction::Xor: 8164 case Instruction::ZExt: 8165 return true; 8166 } 8167 return false; 8168 }; 8169 8170 if (!IsVectorizableOpcode(I->getOpcode())) 8171 return nullptr; 8172 8173 // Success: widen this instruction. 8174 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8175 } 8176 8177 VPBasicBlock *VPRecipeBuilder::handleReplication( 8178 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8179 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8180 VPlanPtr &Plan) { 8181 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8182 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8183 Range); 8184 8185 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8186 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8187 Range); 8188 8189 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8190 IsUniform, IsPredicated); 8191 setRecipe(I, Recipe); 8192 Plan->addVPValue(I, Recipe); 8193 8194 // Find if I uses a predicated instruction. If so, it will use its scalar 8195 // value. Avoid hoisting the insert-element which packs the scalar value into 8196 // a vector value, as that happens iff all users use the vector value. 8197 for (auto &Op : I->operands()) 8198 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8199 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8200 PredInst2Recipe[PredInst]->setAlsoPack(false); 8201 8202 // Finalize the recipe for Instr, first if it is not predicated. 8203 if (!IsPredicated) { 8204 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8205 VPBB->appendRecipe(Recipe); 8206 return VPBB; 8207 } 8208 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8209 assert(VPBB->getSuccessors().empty() && 8210 "VPBB has successors when handling predicated replication."); 8211 // Record predicated instructions for above packing optimizations. 
8212 PredInst2Recipe[I] = Recipe; 8213 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8214 VPBlockUtils::insertBlockAfter(Region, VPBB); 8215 auto *RegSucc = new VPBasicBlock(); 8216 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8217 return RegSucc; 8218 } 8219 8220 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8221 VPRecipeBase *PredRecipe, 8222 VPlanPtr &Plan) { 8223 // Instructions marked for predication are replicated and placed under an 8224 // if-then construct to prevent side-effects. 8225 8226 // Generate recipes to compute the block mask for this region. 8227 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8228 8229 // Build the triangular if-then region. 8230 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8231 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8232 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8233 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8234 auto *PHIRecipe = Instr->getType()->isVoidTy() 8235 ? nullptr 8236 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8237 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8238 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8239 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8240 8241 // Note: first set Entry as region entry and then connect successors starting 8242 // from it in order, to propagate the "parent" of each VPBasicBlock. 8243 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8244 VPBlockUtils::connectBlocks(Pred, Exit); 8245 8246 return Region; 8247 } 8248 8249 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8250 VFRange &Range, 8251 VPlanPtr &Plan) { 8252 // First, check for specific widening recipes that deal with calls, memory 8253 // operations, inductions and Phi nodes. 8254 if (auto *CI = dyn_cast<CallInst>(Instr)) 8255 return tryToWidenCall(CI, Range, *Plan); 8256 8257 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8258 return tryToWidenMemory(Instr, Range, Plan); 8259 8260 VPRecipeBase *Recipe; 8261 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8262 if (Phi->getParent() != OrigLoop->getHeader()) 8263 return tryToBlend(Phi, Plan); 8264 if ((Recipe = tryToOptimizeInductionPHI(Phi))) 8265 return Recipe; 8266 return new VPWidenPHIRecipe(Phi); 8267 } 8268 8269 if (isa<TruncInst>(Instr) && 8270 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range))) 8271 return Recipe; 8272 8273 if (!shouldWiden(Instr, Range)) 8274 return nullptr; 8275 8276 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8277 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()), 8278 OrigLoop); 8279 8280 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8281 bool InvariantCond = 8282 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8283 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()), 8284 InvariantCond); 8285 } 8286 8287 return tryToWiden(Instr, *Plan); 8288 } 8289 8290 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8291 ElementCount MaxVF) { 8292 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8293 8294 // Collect instructions from the original loop that will become trivially dead 8295 // in the vectorized loop. We don't need to vectorize these instructions. 
For 8296 // example, original induction update instructions can become dead because we 8297 // separately emit induction "steps" when generating code for the new loop. 8298 // Similarly, we create a new latch condition when setting up the structure 8299 // of the new loop, so the old one can become dead. 8300 SmallPtrSet<Instruction *, 4> DeadInstructions; 8301 collectTriviallyDeadInstructions(DeadInstructions); 8302 8303 // Add assume instructions we need to drop to DeadInstructions, to prevent 8304 // them from being added to the VPlan. 8305 // TODO: We only need to drop assumes in blocks that get flattened. If the 8306 // control flow is preserved, we should keep them. 8307 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8308 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8309 8310 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8311 // Dead instructions do not need sinking. Remove them from SinkAfter. 8312 for (Instruction *I : DeadInstructions) 8313 SinkAfter.erase(I); 8314 8315 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8316 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8317 VFRange SubRange = {VF, MaxVFPlusOne}; 8318 VPlans.push_back( 8319 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8320 VF = SubRange.End; 8321 } 8322 } 8323 8324 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8325 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8326 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 8327 8328 // Hold a mapping from predicated instructions to their recipes, in order to 8329 // fix their AlsoPack behavior if a user is determined to replicate and use a 8330 // scalar instead of vector value. 8331 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 8332 8333 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8334 8335 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8336 8337 // --------------------------------------------------------------------------- 8338 // Pre-construction: record ingredients whose recipes we'll need to further 8339 // process after constructing the initial VPlan. 8340 // --------------------------------------------------------------------------- 8341 8342 // Mark instructions we'll need to sink later and their targets as 8343 // ingredients whose recipe we'll need to record. 8344 for (auto &Entry : SinkAfter) { 8345 RecipeBuilder.recordRecipeOf(Entry.first); 8346 RecipeBuilder.recordRecipeOf(Entry.second); 8347 } 8348 for (auto &Reduction : CM.getInLoopReductionChains()) { 8349 PHINode *Phi = Reduction.first; 8350 RecurrenceDescriptor::RecurrenceKind Kind = 8351 Legal->getReductionVars()[Phi].getRecurrenceKind(); 8352 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8353 8354 RecipeBuilder.recordRecipeOf(Phi); 8355 for (auto &R : ReductionOperations) { 8356 RecipeBuilder.recordRecipeOf(R); 8357 // For min/max reductions, where we have a pair of icmp/select, we also 8358 // need to record the ICmp recipe, so it can be removed later.
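// For example (illustrative IR, not taken from this file), a signed-max
// reduction is expressed as the icmp/select pair
//   %cmp = icmp sgt i32 %x, %max.phi
//   %max.next = select i1 %cmp, i32 %x, i32 %max.phi
// where only the select belongs to the recorded reduction chain; the icmp is
// recorded here so its recipe can be erased once the select is replaced by a
// reduction recipe in adjustRecipesForInLoopReductions.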
8359 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8360 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8361 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8362 } 8363 } 8364 } 8365 8366 // For each interleave group which is relevant for this (possibly trimmed) 8367 // Range, add it to the set of groups to be later applied to the VPlan and add 8368 // placeholders for its members' Recipes which we'll be replacing with a 8369 // single VPInterleaveRecipe. 8370 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8371 auto applyIG = [IG, this](ElementCount VF) -> bool { 8372 return (VF.isVector() && // Query is illegal for VF == 1 8373 CM.getWideningDecision(IG->getInsertPos(), VF) == 8374 LoopVectorizationCostModel::CM_Interleave); 8375 }; 8376 if (!getDecisionAndClampRange(applyIG, Range)) 8377 continue; 8378 InterleaveGroups.insert(IG); 8379 for (unsigned i = 0; i < IG->getFactor(); i++) 8380 if (Instruction *Member = IG->getMember(i)) 8381 RecipeBuilder.recordRecipeOf(Member); 8382 }; 8383 8384 // --------------------------------------------------------------------------- 8385 // Build initial VPlan: Scan the body of the loop in a topological order to 8386 // visit each basic block after having visited its predecessor basic blocks. 8387 // --------------------------------------------------------------------------- 8388 8389 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 8390 auto Plan = std::make_unique<VPlan>(); 8391 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 8392 Plan->setEntry(VPBB); 8393 8394 // Scan the body of the loop in a topological order to visit each basic block 8395 // after having visited its predecessor basic blocks. 8396 LoopBlocksDFS DFS(OrigLoop); 8397 DFS.perform(LI); 8398 8399 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8400 // Relevant instructions from basic block BB will be grouped into VPRecipe 8401 // ingredients and fill a new VPBasicBlock. 8402 unsigned VPBBsForBB = 0; 8403 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 8404 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 8405 VPBB = FirstVPBBForBB; 8406 Builder.setInsertPoint(VPBB); 8407 8408 // Introduce each ingredient into VPlan. 8409 // TODO: Model and preserve debug intrinsics in VPlan. 8410 for (Instruction &I : BB->instructionsWithoutDebug()) { 8411 Instruction *Instr = &I; 8412 8413 // First filter out irrelevant instructions, to ensure no recipes are 8414 // built for them. 8415 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8416 continue; 8417 8418 if (auto Recipe = 8419 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 8420 // Check if the recipe can be converted to a VPValue. We need the extra 8421 // down-casting step until VPRecipeBase inherits from VPValue. 8422 VPValue *MaybeVPValue = Recipe->toVPValue(); 8423 if (!Instr->getType()->isVoidTy() && MaybeVPValue) 8424 Plan->addVPValue(Instr, MaybeVPValue); 8425 8426 RecipeBuilder.setRecipe(Instr, Recipe); 8427 VPBB->appendRecipe(Recipe); 8428 continue; 8429 } 8430 8431 // Otherwise, if all widening options failed, Instruction is to be 8432 // replicated. This may create a successor for VPBB. 8433 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 8434 Instr, Range, VPBB, PredInst2Recipe, Plan); 8435 if (NextVPBB != VPBB) { 8436 VPBB = NextVPBB; 8437 VPBB->setName(BB->hasName() ? BB->getName() + "." 
+ Twine(VPBBsForBB++) 8438 : ""); 8439 } 8440 } 8441 } 8442 8443 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 8444 // may also be empty, such as the last one VPBB, reflecting original 8445 // basic-blocks with no recipes. 8446 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 8447 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 8448 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 8449 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 8450 delete PreEntry; 8451 8452 // --------------------------------------------------------------------------- 8453 // Transform initial VPlan: Apply previously taken decisions, in order, to 8454 // bring the VPlan to its final state. 8455 // --------------------------------------------------------------------------- 8456 8457 // Apply Sink-After legal constraints. 8458 for (auto &Entry : SinkAfter) { 8459 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8460 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8461 Sink->moveAfter(Target); 8462 } 8463 8464 // Interleave memory: for each Interleave Group we marked earlier as relevant 8465 // for this VPlan, replace the Recipes widening its memory instructions with a 8466 // single VPInterleaveRecipe at its insertion point. 8467 for (auto IG : InterleaveGroups) { 8468 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8469 RecipeBuilder.getRecipe(IG->getInsertPos())); 8470 SmallVector<VPValue *, 4> StoredValues; 8471 for (unsigned i = 0; i < IG->getFactor(); ++i) 8472 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8473 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8474 8475 (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8476 Recipe->getMask())) 8477 ->insertBefore(Recipe); 8478 8479 for (unsigned i = 0; i < IG->getFactor(); ++i) 8480 if (Instruction *Member = IG->getMember(i)) { 8481 if (!Member->getType()->isVoidTy()) { 8482 VPValue *OriginalV = Plan->getVPValue(Member); 8483 Plan->removeVPValueFor(Member); 8484 OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member)); 8485 } 8486 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8487 } 8488 } 8489 8490 // Adjust the recipes for any inloop reductions. 8491 if (Range.Start.isVector()) 8492 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8493 8494 // Finally, if tail is folded by masking, introduce selects between the phi 8495 // and the live-out instruction of each reduction, at the end of the latch. 
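// For example (illustrative only), for a summation reduction with a folded
// tail the latch conceptually ends with
//   %rdx.select = select <block-in mask>, %rdx.next, %rdx.phi
// so that lanes disabled by the tail mask keep the value coming from the phi
// instead of a partial result computed for an inactive lane.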
8496 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8497 Builder.setInsertPoint(VPBB); 8498 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8499 for (auto &Reduction : Legal->getReductionVars()) { 8500 if (CM.isInLoopReduction(Reduction.first)) 8501 continue; 8502 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8503 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8504 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8505 } 8506 } 8507 8508 std::string PlanName; 8509 raw_string_ostream RSO(PlanName); 8510 ElementCount VF = Range.Start; 8511 Plan->addVF(VF); 8512 RSO << "Initial VPlan for VF={" << VF; 8513 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8514 Plan->addVF(VF); 8515 RSO << "," << VF; 8516 } 8517 RSO << "},UF>=1"; 8518 RSO.flush(); 8519 Plan->setName(PlanName); 8520 8521 return Plan; 8522 } 8523 8524 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8525 // Outer loop handling: They may require CFG and instruction level 8526 // transformations before even evaluating whether vectorization is profitable. 8527 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8528 // the vectorization pipeline. 8529 assert(!OrigLoop->isInnermost()); 8530 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8531 8532 // Create new empty VPlan 8533 auto Plan = std::make_unique<VPlan>(); 8534 8535 // Build hierarchical CFG 8536 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8537 HCFGBuilder.buildHierarchicalCFG(); 8538 8539 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 8540 VF *= 2) 8541 Plan->addVF(VF); 8542 8543 if (EnableVPlanPredication) { 8544 VPlanPredicator VPP(*Plan); 8545 VPP.predicate(); 8546 8547 // Avoid running transformation to recipes until masked code generation in 8548 // VPlan-native path is in place. 8549 return Plan; 8550 } 8551 8552 SmallPtrSet<Instruction *, 1> DeadInstructions; 8553 VPlanTransforms::VPInstructionsToVPRecipes( 8554 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 8555 return Plan; 8556 } 8557 8558 // Adjust the recipes for any inloop reductions. The chain of instructions 8559 // leading from the loop exit instr to the phi needs to be converted to 8560 // reductions, with one operand being vector and the other being the scalar 8561 // reduction chain. 8562 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 8563 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 8564 for (auto &Reduction : CM.getInLoopReductionChains()) { 8565 PHINode *Phi = Reduction.first; 8566 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8567 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8568 8569 // ReductionOperations are ordered top-down from the phi's use to the 8570 // LoopExitValue. We keep track of the previous item (the Chain) to tell 8571 // which of the two operands will remain scalar and which will be reduced. 8572 // For minmax the chain will be the select instructions.
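// For example (illustrative), for  s += a[i] * b[i]  the chain recorded for
// the phi of 's' is just the add; its non-chain operand, the mul, becomes the
// vector operand of the reduction. For an integer max reduction the chain
// holds the selects, each fed by its icmp.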
8573 Instruction *Chain = Phi; 8574 for (Instruction *R : ReductionOperations) { 8575 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 8576 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind(); 8577 8578 VPValue *ChainOp = Plan->getVPValue(Chain); 8579 unsigned FirstOpId; 8580 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8581 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8582 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 8583 "Expected to replace a VPWidenSelectSC"); 8584 FirstOpId = 1; 8585 } else { 8586 assert(isa<VPWidenRecipe>(WidenRecipe) && 8587 "Expected to replace a VPWidenSC"); 8588 FirstOpId = 0; 8589 } 8590 unsigned VecOpId = 8591 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 8592 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 8593 8594 auto *CondOp = CM.foldTailByMasking() 8595 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8596 : nullptr; 8597 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8598 &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI); 8599 WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe); 8600 Plan->removeVPValueFor(R); 8601 Plan->addVPValue(R, RedRecipe); 8602 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8603 WidenRecipe->eraseFromParent(); 8604 8605 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8606 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8607 VPRecipeBase *CompareRecipe = 8608 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8609 assert(isa<VPWidenRecipe>(CompareRecipe) && 8610 "Expected to replace a VPWidenSC"); 8611 assert(CompareRecipe->toVPValue()->getNumUsers() == 0 && 8612 "Expected no remaining users"); 8613 CompareRecipe->eraseFromParent(); 8614 } 8615 Chain = R; 8616 } 8617 } 8618 } 8619 8620 Value* LoopVectorizationPlanner::VPCallbackILV:: 8621 getOrCreateVectorValues(Value *V, unsigned Part) { 8622 return ILV.getOrCreateVectorValue(V, Part); 8623 } 8624 8625 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8626 Value *V, const VPIteration &Instance) { 8627 return ILV.getOrCreateScalarValue(V, Instance); 8628 } 8629 8630 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8631 VPSlotTracker &SlotTracker) const { 8632 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8633 IG->getInsertPos()->printAsOperand(O, false); 8634 O << ", "; 8635 getAddr()->printAsOperand(O, SlotTracker); 8636 VPValue *Mask = getMask(); 8637 if (Mask) { 8638 O << ", "; 8639 Mask->printAsOperand(O, SlotTracker); 8640 } 8641 for (unsigned i = 0; i < IG->getFactor(); ++i) 8642 if (Instruction *I = IG->getMember(i)) 8643 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8644 } 8645 8646 void VPWidenCallRecipe::execute(VPTransformState &State) { 8647 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8648 *this, State); 8649 } 8650 8651 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8652 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8653 this, *this, InvariantCond, State); 8654 } 8655 8656 void VPWidenRecipe::execute(VPTransformState &State) { 8657 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8658 } 8659 8660 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8661 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8662 *this, State.UF, State.VF, IsPtrLoopInvariant, 8663 IsIndexLoopInvariant, State); 8664 } 8665 8666 void 
VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8667 assert(!State.Instance && "Int or FP induction being replicated."); 8668 State.ILV->widenIntOrFpInduction(IV, Trunc); 8669 } 8670 8671 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8672 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 8673 } 8674 8675 void VPBlendRecipe::execute(VPTransformState &State) { 8676 State.ILV->setDebugLocFromInst(State.Builder, Phi); 8677 // We know that all PHIs in non-header blocks are converted into 8678 // selects, so we don't have to worry about the insertion order and we 8679 // can just use the builder. 8680 // At this point we generate the predication tree. There may be 8681 // duplications since this is a simple recursive scan, but future 8682 // optimizations will clean it up. 8683 8684 unsigned NumIncoming = getNumIncomingValues(); 8685 8686 // Generate a sequence of selects of the form: 8687 // SELECT(Mask3, In3, 8688 // SELECT(Mask2, In2, 8689 // SELECT(Mask1, In1, 8690 // In0))) 8691 // Note that Mask0 is never used: lanes for which no path reaches this phi and 8692 // are essentially undef are taken from In0. 8693 InnerLoopVectorizer::VectorParts Entry(State.UF); 8694 for (unsigned In = 0; In < NumIncoming; ++In) { 8695 for (unsigned Part = 0; Part < State.UF; ++Part) { 8696 // We might have single edge PHIs (blocks) - use an identity 8697 // 'select' for the first PHI operand. 8698 Value *In0 = State.get(getIncomingValue(In), Part); 8699 if (In == 0) 8700 Entry[Part] = In0; // Initialize with the first incoming value. 8701 else { 8702 // Select between the current value and the previous incoming edge 8703 // based on the incoming mask. 8704 Value *Cond = State.get(getMask(In), Part); 8705 Entry[Part] = 8706 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 8707 } 8708 } 8709 } 8710 for (unsigned Part = 0; Part < State.UF; ++Part) 8711 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 8712 } 8713 8714 void VPInterleaveRecipe::execute(VPTransformState &State) { 8715 assert(!State.Instance && "Interleave group being replicated."); 8716 State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(), 8717 getMask()); 8718 } 8719 8720 void VPReductionRecipe::execute(VPTransformState &State) { 8721 assert(!State.Instance && "Reduction being replicated."); 8722 for (unsigned Part = 0; Part < State.UF; ++Part) { 8723 RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind(); 8724 Value *NewVecOp = State.get(getVecOp(), Part); 8725 if (VPValue *Cond = getCondOp()) { 8726 Value *NewCond = State.get(Cond, Part); 8727 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 8728 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 8729 Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType()); 8730 Constant *IdenVec = 8731 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 8732 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 8733 NewVecOp = Select; 8734 } 8735 Value *NewRed = 8736 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN); 8737 Value *PrevInChain = State.get(getChainOp(), Part); 8738 Value *NextInChain; 8739 if (Kind == RecurrenceDescriptor::RK_IntegerMinMax || 8740 Kind == RecurrenceDescriptor::RK_FloatMinMax) { 8741 NextInChain = 8742 createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(), 8743 NewRed, PrevInChain); 8744 } else { 8745 NextInChain = State.Builder.CreateBinOp( 8746 
(Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 8747 PrevInChain); 8748 } 8749 State.set(this, getUnderlyingInstr(), NextInChain, Part); 8750 } 8751 } 8752 8753 void VPReplicateRecipe::execute(VPTransformState &State) { 8754 if (State.Instance) { // Generate a single instance. 8755 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 8756 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, 8757 *State.Instance, IsPredicated, State); 8758 // Insert scalar instance packing it into a vector. 8759 if (AlsoPack && State.VF.isVector()) { 8760 // If we're constructing lane 0, initialize to start from undef. 8761 if (State.Instance->Lane == 0) { 8762 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 8763 Value *Undef = UndefValue::get( 8764 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 8765 State.ValueMap.setVectorValue(getUnderlyingInstr(), 8766 State.Instance->Part, Undef); 8767 } 8768 State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(), 8769 *State.Instance); 8770 } 8771 return; 8772 } 8773 8774 // Generate scalar instances for all VF lanes of all UF parts, unless the 8775 // instruction is uniform, in which case generate only the first lane for each 8776 // of the UF parts. 8777 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); 8778 assert((!State.VF.isScalable() || IsUniform) && 8779 "Can't scalarize a scalable vector"); 8780 for (unsigned Part = 0; Part < State.UF; ++Part) 8781 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 8782 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane}, 8783 IsPredicated, State); 8784 } 8785 8786 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 8787 assert(State.Instance && "Branch on Mask works only on single instance."); 8788 8789 unsigned Part = State.Instance->Part; 8790 unsigned Lane = State.Instance->Lane; 8791 8792 Value *ConditionBit = nullptr; 8793 VPValue *BlockInMask = getMask(); 8794 if (BlockInMask) { 8795 ConditionBit = State.get(BlockInMask, Part); 8796 if (ConditionBit->getType()->isVectorTy()) 8797 ConditionBit = State.Builder.CreateExtractElement( 8798 ConditionBit, State.Builder.getInt32(Lane)); 8799 } else // Block in mask is all-one. 8800 ConditionBit = State.Builder.getTrue(); 8801 8802 // Replace the temporary unreachable terminator with a new conditional branch, 8803 // whose two destinations will be set later when they are created.
8804 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 8805 assert(isa<UnreachableInst>(CurrentTerminator) && 8806 "Expected to replace unreachable terminator with conditional branch."); 8807 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 8808 CondBr->setSuccessor(0, nullptr); 8809 ReplaceInstWithInst(CurrentTerminator, CondBr); 8810 } 8811 8812 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 8813 assert(State.Instance && "Predicated instruction PHI works per instance."); 8814 Instruction *ScalarPredInst = 8815 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 8816 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 8817 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 8818 assert(PredicatingBB && "Predicated block has no single predecessor."); 8819 8820 // By current pack/unpack logic we need to generate only a single phi node: if 8821 // a vector value for the predicated instruction exists at this point it means 8822 // the instruction has vector users only, and a phi for the vector value is 8823 // needed. In this case the recipe of the predicated instruction is marked to 8824 // also do that packing, thereby "hoisting" the insert-element sequence. 8825 // Otherwise, a phi node for the scalar value is needed. 8826 unsigned Part = State.Instance->Part; 8827 Instruction *PredInst = 8828 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 8829 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 8830 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 8831 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 8832 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 8833 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 8834 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 8835 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 8836 } else { 8837 Type *PredInstType = PredInst->getType(); 8838 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 8839 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 8840 Phi->addIncoming(ScalarPredInst, PredicatedBB); 8841 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 8842 } 8843 } 8844 8845 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 8846 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 8847 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 8848 StoredValue ? nullptr : toVPValue(), 8849 getAddr(), StoredValue, getMask()); 8850 } 8851 8852 // Determine how to lower the scalar epilogue, which depends on 1) optimising 8853 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 8854 // predication, and 4) a TTI hook that analyses whether the loop is suitable 8855 // for predication. 8856 static ScalarEpilogueLowering getScalarEpilogueLowering( 8857 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 8858 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 8859 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 8860 LoopVectorizationLegality &LVL) { 8861 // 1) OptSize takes precedence over all other options, i.e. if this is set, 8862 // don't look at hints or options, and don't request a scalar epilogue. 
8863 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 8864 // LoopAccessInfo (due to code dependency and not being able to reliably get 8865 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 8866 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 8867 // versioning when the vectorization is forced, unlike hasOptSize. So revert 8868 // back to the old way and vectorize with versioning when forced. See D81345.) 8869 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 8870 PGSOQueryType::IRPass) && 8871 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 8872 return CM_ScalarEpilogueNotAllowedOptSize; 8873 8874 // 2) If set, obey the directives 8875 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 8876 switch (PreferPredicateOverEpilogue) { 8877 case PreferPredicateTy::ScalarEpilogue: 8878 return CM_ScalarEpilogueAllowed; 8879 case PreferPredicateTy::PredicateElseScalarEpilogue: 8880 return CM_ScalarEpilogueNotNeededUsePredicate; 8881 case PreferPredicateTy::PredicateOrDontVectorize: 8882 return CM_ScalarEpilogueNotAllowedUsePredicate; 8883 }; 8884 } 8885 8886 // 3) If set, obey the hints 8887 switch (Hints.getPredicate()) { 8888 case LoopVectorizeHints::FK_Enabled: 8889 return CM_ScalarEpilogueNotNeededUsePredicate; 8890 case LoopVectorizeHints::FK_Disabled: 8891 return CM_ScalarEpilogueAllowed; 8892 }; 8893 8894 // 4) if the TTI hook indicates this is profitable, request predication. 8895 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 8896 LVL.getLAI())) 8897 return CM_ScalarEpilogueNotNeededUsePredicate; 8898 8899 return CM_ScalarEpilogueAllowed; 8900 } 8901 8902 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, 8903 unsigned Part) { 8904 set(Def, V, Part); 8905 ILV->setVectorValue(IRDef, Part, V); 8906 } 8907 8908 // Process the loop in the VPlan-native vectorization path. This path builds 8909 // VPlan upfront in the vectorization pipeline, which allows to apply 8910 // VPlan-to-VPlan transformations from the very beginning without modifying the 8911 // input LLVM IR. 8912 static bool processLoopInVPlanNativePath( 8913 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 8914 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 8915 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 8916 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 8917 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 8918 8919 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 8920 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 8921 return false; 8922 } 8923 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 8924 Function *F = L->getHeader()->getParent(); 8925 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 8926 8927 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 8928 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 8929 8930 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 8931 &Hints, IAI); 8932 // Use the planner for outer loop vectorization. 8933 // TODO: CM is not used at this point inside the planner. Turn CM into an 8934 // optional argument if we don't need it in the future. 8935 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 8936 8937 // Get user vectorization factor. 8938 ElementCount UserVF = Hints.getWidth(); 8939 8940 // Plan how to best vectorize, return the best VF and its cost. 
8941 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 8942 8943 // If we are stress testing VPlan builds, do not attempt to generate vector 8944 // code. Masked vector code generation support will follow soon. 8945 // Also, do not attempt to vectorize if no vector code will be produced. 8946 if (VPlanBuildStressTest || EnableVPlanPredication || 8947 VectorizationFactor::Disabled() == VF) 8948 return false; 8949 8950 LVP.setBestPlan(VF.Width, 1); 8951 8952 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 8953 &CM, BFI, PSI); 8954 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 8955 << L->getHeader()->getParent()->getName() << "\"\n"); 8956 LVP.executePlan(LB, DT); 8957 8958 // Mark the loop as already vectorized to avoid vectorizing again. 8959 Hints.setAlreadyVectorized(); 8960 8961 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 8962 return true; 8963 } 8964 8965 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 8966 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 8967 !EnableLoopInterleaving), 8968 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 8969 !EnableLoopVectorization) {} 8970 8971 bool LoopVectorizePass::processLoop(Loop *L) { 8972 assert((EnableVPlanNativePath || L->isInnermost()) && 8973 "VPlan-native path is not enabled. Only process inner loops."); 8974 8975 #ifndef NDEBUG 8976 const std::string DebugLocStr = getDebugLocString(L); 8977 #endif /* NDEBUG */ 8978 8979 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 8980 << L->getHeader()->getParent()->getName() << "\" from " 8981 << DebugLocStr << "\n"); 8982 8983 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 8984 8985 LLVM_DEBUG( 8986 dbgs() << "LV: Loop hints:" 8987 << " force=" 8988 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 8989 ? "disabled" 8990 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 8991 ? "enabled" 8992 : "?")) 8993 << " width=" << Hints.getWidth() 8994 << " unroll=" << Hints.getInterleave() << "\n"); 8995 8996 // Function containing loop 8997 Function *F = L->getHeader()->getParent(); 8998 8999 // Looking at the diagnostic output is the only way to determine if a loop 9000 // was vectorized (other than looking at the IR or machine code), so it 9001 // is important to generate an optimization remark for each loop. Most of 9002 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9003 // generated as OptimizationRemark and OptimizationRemarkMissed are 9004 // less verbose reporting vectorized loops and unvectorized loops that may 9005 // benefit from vectorization, respectively. 9006 9007 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9008 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9009 return false; 9010 } 9011 9012 PredicatedScalarEvolution PSE(*SE, *L); 9013 9014 // Check if it is legal to vectorize the loop. 9015 LoopVectorizationRequirements Requirements(*ORE); 9016 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9017 &Requirements, &Hints, DB, AC, BFI, PSI); 9018 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9019 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9020 Hints.emitRemarkWithHints(); 9021 return false; 9022 } 9023 9024 // Check the function attributes and profiles to find out if this function 9025 // should be optimized for size. 
9026 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9027 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9028 9029 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9030 // here. They may require CFG and instruction level transformations before 9031 // even evaluating whether vectorization is profitable. Since we cannot modify 9032 // the incoming IR, we need to build VPlan upfront in the vectorization 9033 // pipeline. 9034 if (!L->isInnermost()) 9035 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9036 ORE, BFI, PSI, Hints); 9037 9038 assert(L->isInnermost() && "Inner loop expected."); 9039 9040 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9041 // count by optimizing for size, to minimize overheads. 9042 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9043 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9044 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9045 << "This loop is worth vectorizing only if no scalar " 9046 << "iteration overheads are incurred."); 9047 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9048 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9049 else { 9050 LLVM_DEBUG(dbgs() << "\n"); 9051 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9052 } 9053 } 9054 9055 // Check the function attributes to see if implicit floats are allowed. 9056 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9057 // an integer loop and the vector instructions selected are purely integer 9058 // vector instructions? 9059 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9060 reportVectorizationFailure( 9061 "Can't vectorize when the NoImplicitFloat attribute is used", 9062 "loop not vectorized due to NoImplicitFloat attribute", 9063 "NoImplicitFloat", ORE, L); 9064 Hints.emitRemarkWithHints(); 9065 return false; 9066 } 9067 9068 // Check if the target supports potentially unsafe FP vectorization. 9069 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9070 // for the target we're vectorizing for, to make sure none of the 9071 // additional fp-math flags can help. 9072 if (Hints.isPotentiallyUnsafe() && 9073 TTI->isFPVectorizationPotentiallyUnsafe()) { 9074 reportVectorizationFailure( 9075 "Potentially unsafe FP op prevents vectorization", 9076 "loop not vectorized due to unsafe FP support.", 9077 "UnsafeFP", ORE, L); 9078 Hints.emitRemarkWithHints(); 9079 return false; 9080 } 9081 9082 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9083 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9084 9085 // If an override option has been passed in for interleaved accesses, use it. 9086 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9087 UseInterleaved = EnableInterleavedMemAccesses; 9088 9089 // Analyze interleaved memory accesses. 9090 if (UseInterleaved) { 9091 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9092 } 9093 9094 // Use the cost model. 9095 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9096 F, &Hints, IAI); 9097 CM.collectValuesToIgnore(); 9098 9099 // Use the planner for vectorization. 9100 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9101 9102 // Get user vectorization factor and interleave count. 
9103 ElementCount UserVF = Hints.getWidth(); 9104 unsigned UserIC = Hints.getInterleave(); 9105 9106 // Plan how to best vectorize, return the best VF and its cost. 9107 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9108 9109 VectorizationFactor VF = VectorizationFactor::Disabled(); 9110 unsigned IC = 1; 9111 9112 if (MaybeVF) { 9113 VF = *MaybeVF; 9114 // Select the interleave count. 9115 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 9116 } 9117 9118 // Identify the diagnostic messages that should be produced. 9119 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 9120 bool VectorizeLoop = true, InterleaveLoop = true; 9121 if (Requirements.doesNotMeet(F, L, Hints)) { 9122 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 9123 "requirements.\n"); 9124 Hints.emitRemarkWithHints(); 9125 return false; 9126 } 9127 9128 if (VF.Width.isScalar()) { 9129 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 9130 VecDiagMsg = std::make_pair( 9131 "VectorizationNotBeneficial", 9132 "the cost-model indicates that vectorization is not beneficial"); 9133 VectorizeLoop = false; 9134 } 9135 9136 if (!MaybeVF && UserIC > 1) { 9137 // Tell the user interleaving was avoided up-front, despite being explicitly 9138 // requested. 9139 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 9140 "interleaving should be avoided up front\n"); 9141 IntDiagMsg = std::make_pair( 9142 "InterleavingAvoided", 9143 "Ignoring UserIC, because interleaving was avoided up front"); 9144 InterleaveLoop = false; 9145 } else if (IC == 1 && UserIC <= 1) { 9146 // Tell the user interleaving is not beneficial. 9147 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 9148 IntDiagMsg = std::make_pair( 9149 "InterleavingNotBeneficial", 9150 "the cost-model indicates that interleaving is not beneficial"); 9151 InterleaveLoop = false; 9152 if (UserIC == 1) { 9153 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 9154 IntDiagMsg.second += 9155 " and is explicitly disabled or interleave count is set to 1"; 9156 } 9157 } else if (IC > 1 && UserIC == 1) { 9158 // Tell the user interleaving is beneficial, but it is explicitly disabled. 9159 LLVM_DEBUG( 9160 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 9161 IntDiagMsg = std::make_pair( 9162 "InterleavingBeneficialButDisabled", 9163 "the cost-model indicates that interleaving is beneficial " 9164 "but is explicitly disabled or interleave count is set to 1"); 9165 InterleaveLoop = false; 9166 } 9167 9168 // Override IC if user provided an interleave count. 9169 IC = UserIC > 0 ? UserIC : IC; 9170 9171 // Emit diagnostic messages, if any. 9172 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 9173 if (!VectorizeLoop && !InterleaveLoop) { 9174 // Do not vectorize or interleave the loop.
9175 ORE->emit([&]() { 9176 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9177 L->getStartLoc(), L->getHeader()) 9178 << VecDiagMsg.second; 9179 }); 9180 ORE->emit([&]() { 9181 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9182 L->getStartLoc(), L->getHeader()) 9183 << IntDiagMsg.second; 9184 }); 9185 return false; 9186 } else if (!VectorizeLoop && InterleaveLoop) { 9187 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9188 ORE->emit([&]() { 9189 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9190 L->getStartLoc(), L->getHeader()) 9191 << VecDiagMsg.second; 9192 }); 9193 } else if (VectorizeLoop && !InterleaveLoop) { 9194 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9195 << ") in " << DebugLocStr << '\n'); 9196 ORE->emit([&]() { 9197 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9198 L->getStartLoc(), L->getHeader()) 9199 << IntDiagMsg.second; 9200 }); 9201 } else if (VectorizeLoop && InterleaveLoop) { 9202 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9203 << ") in " << DebugLocStr << '\n'); 9204 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9205 } 9206 9207 LVP.setBestPlan(VF.Width, IC); 9208 9209 using namespace ore; 9210 bool DisableRuntimeUnroll = false; 9211 MDNode *OrigLoopID = L->getLoopID(); 9212 9213 if (!VectorizeLoop) { 9214 assert(IC > 1 && "interleave count should not be 1 or 0"); 9215 // If we decided that it is not legal to vectorize the loop, then 9216 // interleave it. 9217 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9218 BFI, PSI); 9219 LVP.executePlan(Unroller, DT); 9220 9221 ORE->emit([&]() { 9222 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9223 L->getHeader()) 9224 << "interleaved loop (interleaved count: " 9225 << NV("InterleaveCount", IC) << ")"; 9226 }); 9227 } else { 9228 // If we decided that it is *legal* to vectorize the loop, then do it. 9229 9230 // Consider vectorizing the epilogue too if it's profitable. 9231 VectorizationFactor EpilogueVF = 9232 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9233 if (EpilogueVF.Width.isVector()) { 9234 9235 // The first pass vectorizes the main loop and creates a scalar epilogue 9236 // to be vectorized by executing the plan (potentially with a different 9237 // factor) again shortly afterwards. 9238 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9239 EpilogueVF.Width.getKnownMinValue(), 1); 9240 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9241 &LVL, &CM, BFI, PSI); 9242 9243 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9244 LVP.executePlan(MainILV, DT); 9245 ++LoopsVectorized; 9246 9247 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9248 formLCSSARecursively(*L, *DT, LI, SE); 9249 9250 // Second pass vectorizes the epilogue and adjusts the control flow 9251 // edges from the first pass. 
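// Conceptually (illustrative shape only; the generated block names differ),
// the combined result of the two passes is:
//   main vector loop      - runs at MainLoopVF x MainLoopUF
//   epilogue vector loop  - handles remaining full EpilogueVF-wide chunks
//   scalar remainder loop - handles the final iterations, fewer than EpilogueVF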
9252 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 9253 EPI.MainLoopVF = EPI.EpilogueVF; 9254 EPI.MainLoopUF = EPI.EpilogueUF; 9255 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 9256 ORE, EPI, &LVL, &CM, BFI, PSI); 9257 LVP.executePlan(EpilogILV, DT); 9258 ++LoopsEpilogueVectorized; 9259 9260 if (!MainILV.areSafetyChecksAdded()) 9261 DisableRuntimeUnroll = true; 9262 } else { 9263 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 9264 &LVL, &CM, BFI, PSI); 9265 LVP.executePlan(LB, DT); 9266 ++LoopsVectorized; 9267 9268 // Add metadata to disable runtime unrolling a scalar loop when there are 9269 // no runtime checks about strides and memory. A scalar loop that is 9270 // rarely used is not worth unrolling. 9271 if (!LB.areSafetyChecksAdded()) 9272 DisableRuntimeUnroll = true; 9273 } 9274 9275 // Report the vectorization decision. 9276 ORE->emit([&]() { 9277 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 9278 L->getHeader()) 9279 << "vectorized loop (vectorization width: " 9280 << NV("VectorizationFactor", VF.Width) 9281 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 9282 }); 9283 } 9284 9285 Optional<MDNode *> RemainderLoopID = 9286 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 9287 LLVMLoopVectorizeFollowupEpilogue}); 9288 if (RemainderLoopID.hasValue()) { 9289 L->setLoopID(RemainderLoopID.getValue()); 9290 } else { 9291 if (DisableRuntimeUnroll) 9292 AddRuntimeUnrollDisableMetaData(L); 9293 9294 // Mark the loop as already vectorized to avoid vectorizing again. 9295 Hints.setAlreadyVectorized(); 9296 } 9297 9298 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9299 return true; 9300 } 9301 9302 LoopVectorizeResult LoopVectorizePass::runImpl( 9303 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 9304 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 9305 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 9306 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 9307 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 9308 SE = &SE_; 9309 LI = &LI_; 9310 TTI = &TTI_; 9311 DT = &DT_; 9312 BFI = &BFI_; 9313 TLI = TLI_; 9314 AA = &AA_; 9315 AC = &AC_; 9316 GetLAA = &GetLAA_; 9317 DB = &DB_; 9318 ORE = &ORE_; 9319 PSI = PSI_; 9320 9321 // Don't attempt if 9322 // 1. the target claims to have no vector registers, and 9323 // 2. interleaving won't help ILP. 9324 // 9325 // The second condition is necessary because, even if the target has no 9326 // vector registers, loop vectorization may still enable scalar 9327 // interleaving. 9328 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 9329 TTI->getMaxInterleaveFactor(1) < 2) 9330 return LoopVectorizeResult(false, false); 9331 9332 bool Changed = false, CFGChanged = false; 9333 9334 // The vectorizer requires loops to be in simplified form. 9335 // Since simplification may add new inner loops, it has to run before the 9336 // legality and profitability checks. This means running the loop vectorizer 9337 // will simplify all loops, regardless of whether anything end up being 9338 // vectorized. 9339 for (auto &L : *LI) 9340 Changed |= CFGChanged |= 9341 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9342 9343 // Build up a worklist of inner-loops to vectorize. This is necessary as 9344 // the act of vectorizing or partially unrolling a loop creates new loops 9345 // and can invalidate iterators across the loops. 
9346 SmallVector<Loop *, 8> Worklist; 9347 9348 for (Loop *L : *LI) 9349 collectSupportedLoops(*L, LI, ORE, Worklist); 9350 9351 LoopsAnalyzed += Worklist.size(); 9352 9353 // Now walk the identified inner loops. 9354 while (!Worklist.empty()) { 9355 Loop *L = Worklist.pop_back_val(); 9356 9357 // For the inner loops we actually process, form LCSSA to simplify the 9358 // transform. 9359 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9360 9361 Changed |= CFGChanged |= processLoop(L); 9362 } 9363 9364 // Process each loop nest in the function. 9365 return LoopVectorizeResult(Changed, CFGChanged); 9366 } 9367 9368 PreservedAnalyses LoopVectorizePass::run(Function &F, 9369 FunctionAnalysisManager &AM) { 9370 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9371 auto &LI = AM.getResult<LoopAnalysis>(F); 9372 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9373 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9374 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9375 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9376 auto &AA = AM.getResult<AAManager>(F); 9377 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9378 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9379 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9380 MemorySSA *MSSA = EnableMSSALoopDependency 9381 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9382 : nullptr; 9383 9384 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9385 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9386 [&](Loop &L) -> const LoopAccessInfo & { 9387 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9388 TLI, TTI, nullptr, MSSA}; 9389 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9390 }; 9391 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9392 ProfileSummaryInfo *PSI = 9393 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9394 LoopVectorizeResult Result = 9395 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9396 if (!Result.MadeAnyChange) 9397 return PreservedAnalyses::all(); 9398 PreservedAnalyses PA; 9399 9400 // We currently do not preserve loopinfo/dominator analyses with outer loop 9401 // vectorization. Until this is addressed, mark these analyses as preserved 9402 // only for non-VPlan-native path. 9403 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9404 if (!EnableVPlanNativePath) { 9405 PA.preserve<LoopAnalysis>(); 9406 PA.preserve<DominatorTreeAnalysis>(); 9407 } 9408 PA.preserve<BasicAA>(); 9409 PA.preserve<GlobalsAA>(); 9410 if (!Result.MadeCFGChange) 9411 PA.preserveSet<CFGAnalyses>(); 9412 return PA; 9413 } 9414
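// Illustration (not part of this file): a minimal sketch of running this pass
// through the new pass manager. The registration calls below are the standard
// PassBuilder setup; the wrapper function name is made up.
#if 0
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

static void runLoopVectorizeSketch(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;

  // Register all analyses and wire the proxies between the four managers.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // Run LoopVectorizePass on every function in the module.
  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::LoopVectorizePass(llvm::LoopVectorizeOptions()));
  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}
#endif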