//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is an ongoing development effort to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
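//
// For illustration, the core transformation: with a vectorization factor of 4,
// a scalar loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is rewritten so that each 'wide' iteration loads, adds and stores four
// elements using <4 x ...> vector instructions and advances the induction
// variable by 4; the left-over iterations run in a scalar (or vectorized)
// epilogue loop.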
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// The option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; it lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value."
             " Mostly useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path.
// It must be used in conjunction with -enable-vplan-native-path.
// -vplan-verify-hcfg can also be used to enable the verification of the
// H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L as defined
/// by the following procedure:
/// 1) Returns exact trip count if it is known.
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if it is known.
/// 4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
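  // (getSmallConstantTripCount returns 0 when no exact constant trip count is
  // known, so the check below also serves as the validity test.)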
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
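  /// For illustration: a loop-varying scalar
  ///   %r = select i1 %c, i32 %a, i32 %b
  /// becomes, at VF = 4,
  ///   %r = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b
  /// whereas a loop-invariant condition (\p InvariantCond) keeps the single
  /// scalar i1 condition and only widens the two select operands.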
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPValue *StartV, VPValue *Def,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize the interleaved access group \p Group with the base
  /// address given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
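  /// For illustration: with VF = 4, UF = 1 and a unit step, the scalar steps
  /// produced for a lane-by-lane (scalarized) user are ScalarIV, ScalarIV + 1,
  /// ScalarIV + 2 and ScalarIV + 3, materialized as individual scalar adds
  /// rather than a single vector add.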
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - the original one (if \p EntryVal is a phi corresponding to
  /// the original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
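  /// For example, PSE may record an assumption such as "the induction does not
  /// wrap" or "two SCEVs are equal"; the corresponding runtime predicates are
  /// later materialized by emitSCEVChecks().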
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
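  // (That is: the main vector loop, the vectorized epilogue loop, and the
  // final scalar remainder loop.)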
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
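/// For illustration: with Step = 2 and a fixed VF of 4 this returns the
/// constant 8; with a scalable VF of <vscale x 4> it returns 8 * vscale
/// (via CreateVScale).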
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag,
                             TheLoop, I)
            << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
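/// For illustration: on a target where gather loads are expensive, a strided
/// load may be costed high enough that the model prefers scalarizing it, or
/// selects VF = 1 (no vectorization) for the loop altogether.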
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for the user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
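  /// For illustration: querying VFs {4, 8} yields one RegisterUsage entry per
  /// VF, and the interleaving heuristic compares the reported maximum number
  /// of live vector values against the target's register count.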
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
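  /// For example: setWideningDecision(SI, VF, CM_Scalarize, Cost) records that
  /// the store SI will be scalarized at this VF, and the saved cost is later
  /// retrieved via getWideningCost().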
1358 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1359 InstructionCost Cost) {
1360 assert(VF.isVector() && "Expected VF >=2");
1361 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1362 }
1363
1364 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1365 /// interleaving group \p Grp and vector width \p VF.
1366 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1367 ElementCount VF, InstWidening W,
1368 InstructionCost Cost) {
1369 assert(VF.isVector() && "Expected VF >=2");
1370 // Broadcast this decision to all instructions inside the group, but assign
1371 // the cost to only one instruction (the insert position).
1372 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1373 if (auto *I = Grp->getMember(i)) {
1374 if (Grp->getInsertPos() == I)
1375 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1376 else
1377 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1378 }
1379 }
1380 }
1381
1382 /// Return the cost model decision for the given instruction \p I and vector
1383 /// width \p VF. Return CM_Unknown if this instruction did not pass
1384 /// through the cost modeling.
1385 InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1386 assert(VF.isVector() && "Expected VF to be a vector VF");
1387 // Cost model is not run in the VPlan-native path - return conservative
1388 // result until this changes.
1389 if (EnableVPlanNativePath)
1390 return CM_GatherScatter;
1391
1392 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1393 auto Itr = WideningDecisions.find(InstOnVF);
1394 if (Itr == WideningDecisions.end())
1395 return CM_Unknown;
1396 return Itr->second.first;
1397 }
1398
1399 /// Return the vectorization cost for the given instruction \p I and vector
1400 /// width \p VF.
1401 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1402 assert(VF.isVector() && "Expected VF >=2");
1403 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1404 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1405 "The cost is not calculated");
1406 return WideningDecisions[InstOnVF].second;
1407 }
1408
1409 /// Return true if instruction \p I is an optimizable truncate whose operand
1410 /// is an induction variable. Such a truncate will be removed by adding a new
1411 /// induction variable with the destination type.
1412 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1413 // If the instruction is not a truncate, return false.
1414 auto *Trunc = dyn_cast<TruncInst>(I);
1415 if (!Trunc)
1416 return false;
1417
1418 // Get the source and destination types of the truncate.
1419 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1420 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1421
1422 // If the truncate is free for the given types, return false. Replacing a
1423 // free truncate with an induction variable would add an induction variable
1424 // update instruction to each iteration of the loop. We exclude from this
1425 // check the primary induction variable since it will need an update
1426 // instruction regardless.
1427 Value *Op = Trunc->getOperand(0);
1428 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1429 return false;
1430
1431 // If the truncated value is not an induction variable, return false.
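// For example (illustrative only): given a primary induction `i64 %iv` that is
// only used through `%t = trunc i64 %iv to i32`, and assuming the truncate is
// not free for the target, %iv is an induction phi, so this returns true and
// the vectorizer can introduce a dedicated i32 induction variable instead of
// widening the trunc.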
1432 return Legal->isInductionPhi(Op); 1433 } 1434 1435 /// Collects the instructions to scalarize for each predicated instruction in 1436 /// the loop. 1437 void collectInstsToScalarize(ElementCount VF); 1438 1439 /// Collect Uniform and Scalar values for the given \p VF. 1440 /// The sets depend on CM decision for Load/Store instructions 1441 /// that may be vectorized as interleave, gather-scatter or scalarized. 1442 void collectUniformsAndScalars(ElementCount VF) { 1443 // Do the analysis once. 1444 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1445 return; 1446 setCostBasedWideningDecision(VF); 1447 collectLoopUniforms(VF); 1448 collectLoopScalars(VF); 1449 } 1450 1451 /// Returns true if the target machine supports masked store operation 1452 /// for the given \p DataType and kind of access to \p Ptr. 1453 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) { 1454 return Legal->isConsecutivePtr(Ptr) && 1455 TTI.isLegalMaskedStore(DataType, Alignment); 1456 } 1457 1458 /// Returns true if the target machine supports masked load operation 1459 /// for the given \p DataType and kind of access to \p Ptr. 1460 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) { 1461 return Legal->isConsecutivePtr(Ptr) && 1462 TTI.isLegalMaskedLoad(DataType, Alignment); 1463 } 1464 1465 /// Returns true if the target machine supports masked scatter operation 1466 /// for the given \p DataType. 1467 bool isLegalMaskedScatter(Type *DataType, Align Alignment) { 1468 return TTI.isLegalMaskedScatter(DataType, Alignment); 1469 } 1470 1471 /// Returns true if the target machine supports masked gather operation 1472 /// for the given \p DataType. 1473 bool isLegalMaskedGather(Type *DataType, Align Alignment) { 1474 return TTI.isLegalMaskedGather(DataType, Alignment); 1475 } 1476 1477 /// Returns true if the target machine can represent \p V as a masked gather 1478 /// or scatter operation. 1479 bool isLegalGatherOrScatter(Value *V) { 1480 bool LI = isa<LoadInst>(V); 1481 bool SI = isa<StoreInst>(V); 1482 if (!LI && !SI) 1483 return false; 1484 auto *Ty = getMemInstValueType(V); 1485 Align Align = getLoadStoreAlignment(V); 1486 return (LI && isLegalMaskedGather(Ty, Align)) || 1487 (SI && isLegalMaskedScatter(Ty, Align)); 1488 } 1489 1490 /// Returns true if the target machine supports all of the reduction 1491 /// variables found for the given VF. 1492 bool canVectorizeReductions(ElementCount VF) { 1493 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1494 RecurrenceDescriptor RdxDesc = Reduction.second; 1495 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1496 })); 1497 } 1498 1499 /// Returns true if \p I is an instruction that will be scalarized with 1500 /// predication. Such instructions include conditional stores and 1501 /// instructions that may divide by zero. 1502 /// If a non-zero VF has been calculated, we check if I will be scalarized 1503 /// predication for that VF. 1504 bool isScalarWithPredication(Instruction *I, 1505 ElementCount VF = ElementCount::getFixed(1)); 1506 1507 // Returns true if \p I is an instruction that will be predicated either 1508 // through scalar predication or masked load/store or masked gather/scatter. 1509 // Superset of instructions that return true for isScalarWithPredication. 1510 bool isPredicatedInst(Instruction *I) { 1511 if (!blockNeedsPredication(I->getParent())) 1512 return false; 1513 // Loads and stores that need some form of masked operation are predicated 1514 // instructions. 
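// For example (illustrative only): in a loop body such as
//   if (cond[i]) { A[i] = B[i] / C[i]; }
// both the store to A[i] and the division are predicated instructions - the
// store needs a mask (or scalar predication), and the division must not
// execute for lanes whose predicate is false, since it could otherwise divide
// by zero.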
1515 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1516 return Legal->isMaskRequired(I); 1517 return isScalarWithPredication(I); 1518 } 1519 1520 /// Returns true if \p I is a memory instruction with consecutive memory 1521 /// access that can be widened. 1522 bool 1523 memoryInstructionCanBeWidened(Instruction *I, 1524 ElementCount VF = ElementCount::getFixed(1)); 1525 1526 /// Returns true if \p I is a memory instruction in an interleaved-group 1527 /// of memory accesses that can be vectorized with wide vector loads/stores 1528 /// and shuffles. 1529 bool 1530 interleavedAccessCanBeWidened(Instruction *I, 1531 ElementCount VF = ElementCount::getFixed(1)); 1532 1533 /// Check if \p Instr belongs to any interleaved access group. 1534 bool isAccessInterleaved(Instruction *Instr) { 1535 return InterleaveInfo.isInterleaved(Instr); 1536 } 1537 1538 /// Get the interleaved access group that \p Instr belongs to. 1539 const InterleaveGroup<Instruction> * 1540 getInterleavedAccessGroup(Instruction *Instr) { 1541 return InterleaveInfo.getInterleaveGroup(Instr); 1542 } 1543 1544 /// Returns true if we're required to use a scalar epilogue for at least 1545 /// the final iteration of the original loop. 1546 bool requiresScalarEpilogue() const { 1547 if (!isScalarEpilogueAllowed()) 1548 return false; 1549 // If we might exit from anywhere but the latch, must run the exiting 1550 // iteration in scalar form. 1551 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1552 return true; 1553 return InterleaveInfo.requiresScalarEpilogue(); 1554 } 1555 1556 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1557 /// loop hint annotation. 1558 bool isScalarEpilogueAllowed() const { 1559 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1560 } 1561 1562 /// Returns true if all loop blocks should be masked to fold tail loop. 1563 bool foldTailByMasking() const { return FoldTailByMasking; } 1564 1565 bool blockNeedsPredication(BasicBlock *BB) { 1566 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1567 } 1568 1569 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1570 /// nodes to the chain of instructions representing the reductions. Uses a 1571 /// MapVector to ensure deterministic iteration order. 1572 using ReductionChainMap = 1573 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1574 1575 /// Return the chain of instructions representing an inloop reduction. 1576 const ReductionChainMap &getInLoopReductionChains() const { 1577 return InLoopReductionChains; 1578 } 1579 1580 /// Returns true if the Phi is part of an inloop reduction. 1581 bool isInLoopReduction(PHINode *Phi) const { 1582 return InLoopReductionChains.count(Phi); 1583 } 1584 1585 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1586 /// with factor VF. Return the cost of the instruction, including 1587 /// scalarization overhead if it's needed. 1588 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1589 1590 /// Estimate cost of a call instruction CI if it were vectorized with factor 1591 /// VF. Return the cost of the instruction, including scalarization overhead 1592 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1593 /// scalarized - 1594 /// i.e. either vector version isn't available, or is too expensive. 1595 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1596 bool &NeedToScalarize); 1597 1598 /// Invalidates decisions already taken by the cost model. 
1599 void invalidateCostModelingDecisions() {
1600 WideningDecisions.clear();
1601 Uniforms.clear();
1602 Scalars.clear();
1603 }
1604
1605 private:
1606 unsigned NumPredStores = 0;
1607
1608 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1609 /// than zero. One is returned if vectorization should best be avoided due
1610 /// to cost.
1611 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1612 ElementCount UserVF);
1613
1614 /// The vectorization cost is a combination of the cost itself and a boolean
1615 /// indicating whether any of the contributing operations will actually
1616 /// operate on
1617 /// vector values after type legalization in the backend. If this latter value
1618 /// is
1619 /// false, then all operations will be scalarized (i.e. no vectorization has
1620 /// actually taken place).
1621 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1622
1623 /// Returns the expected execution cost. The unit of the cost does
1624 /// not matter because we use the 'cost' units to compare different
1625 /// vector widths. The cost that is returned is *not* normalized by
1626 /// the vectorization factor.
1627 VectorizationCostTy expectedCost(ElementCount VF);
1628
1629 /// Returns the execution time cost of an instruction for a given vector
1630 /// width. Vector width of one means scalar.
1631 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1632
1633 /// The cost-computation logic from getInstructionCost which provides
1634 /// the vector type as an output parameter.
1635 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1636 Type *&VectorTy);
1637
1638 /// Return the cost of instructions in an inloop reduction pattern, if I is
1639 /// part of that pattern.
1640 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1641 Type *VectorTy,
1642 TTI::TargetCostKind CostKind);
1643
1644 /// Calculate vectorization cost of memory instruction \p I.
1645 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1646
1647 /// The cost computation for a scalarized memory instruction.
1648 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1649
1650 /// The cost computation for an interleave group of memory instructions.
1651 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1652
1653 /// The cost computation for a Gather/Scatter instruction.
1654 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1655
1656 /// The cost computation for widening instruction \p I with consecutive
1657 /// memory access.
1658 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1659
1660 /// The cost calculation for Load/Store instruction \p I with a uniform pointer:
1661 /// Load: scalar load + broadcast.
1662 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1663 /// element).
1664 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1665
1666 /// Estimate the overhead of scalarizing an instruction. This is a
1667 /// convenience wrapper for the type-based getScalarizationOverhead API.
1668 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
1669
1670 /// Returns whether the instruction is a load or store and will be emitted
1671 /// as a vector operation.
1672 bool isConsecutiveLoadOrStore(Instruction *I);
1673
1674 /// Returns true if an artificially high cost for emulated masked memrefs
1675 /// should be used.
1676 bool useEmulatedMaskMemRefHack(Instruction *I);
1677
1678 /// Map of scalar integer values to the smallest bitwidth they can be legally
1679 /// represented as. The vector equivalents of these values should be truncated
1680 /// to this type.
1681 MapVector<Instruction *, uint64_t> MinBWs;
1682
1683 /// A type representing the costs for instructions if they were to be
1684 /// scalarized rather than vectorized. The entries are Instruction-Cost
1685 /// pairs.
1686 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1687
1688 /// A set containing all BasicBlocks that are known to be present after
1689 /// vectorization as predicated blocks.
1690 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1691
1692 /// Records whether it is allowed to have the original scalar loop execute at
1693 /// least once. This may be needed as a fallback loop in case runtime
1694 /// aliasing/dependence checks fail, or to handle the tail/remainder
1695 /// iterations when the trip count is unknown or doesn't divide by the VF,
1696 /// or as a peel-loop to handle gaps in interleave-groups.
1697 /// Under optsize and when the trip count is very small we don't allow any
1698 /// iterations to execute in the scalar loop.
1699 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1700
1701 /// All blocks of the loop are to be masked in order to fold the tail of the
1702 /// scalar iterations.
1702 bool FoldTailByMasking = false;
1703
1704 /// A map holding scalar costs for different vectorization factors. The
1705 /// presence of a cost for an instruction in the mapping indicates that the
1706 /// instruction will be scalarized when vectorizing with the associated
1707 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1708 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1709
1710 /// Holds the instructions known to be uniform after vectorization.
1711 /// The data is collected per VF.
1712 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1713
1714 /// Holds the instructions known to be scalar after vectorization.
1715 /// The data is collected per VF.
1716 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1717
1718 /// Holds the instructions (address computations) that are forced to be
1719 /// scalarized.
1720 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1721
1722 /// PHINodes of the reductions that should be expanded in-loop, along with
1723 /// their associated chains of reduction operations, in program order from top
1724 /// (PHI) to bottom.
1725 ReductionChainMap InLoopReductionChains;
1726
1727 /// A map of inloop reduction operations and their immediate chain operand.
1728 /// FIXME: This can be removed once reductions can be costed correctly in
1729 /// vplan. This was added to allow quick lookup of the inloop operations,
1730 /// without having to loop through InLoopReductionChains.
1731 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1732
1733 /// Returns the expected difference in cost from scalarizing the expression
1734 /// feeding a predicated instruction \p PredInst. The instructions to
1735 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1736 /// non-negative return value implies the expression will be scalarized.
1737 /// Currently, only single-use chains are considered for scalarization.
1738 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1739 ElementCount VF); 1740 1741 /// Collect the instructions that are uniform after vectorization. An 1742 /// instruction is uniform if we represent it with a single scalar value in 1743 /// the vectorized loop corresponding to each vector iteration. Examples of 1744 /// uniform instructions include pointer operands of consecutive or 1745 /// interleaved memory accesses. Note that although uniformity implies an 1746 /// instruction will be scalar, the reverse is not true. In general, a 1747 /// scalarized instruction will be represented by VF scalar values in the 1748 /// vectorized loop, each corresponding to an iteration of the original 1749 /// scalar loop. 1750 void collectLoopUniforms(ElementCount VF); 1751 1752 /// Collect the instructions that are scalar after vectorization. An 1753 /// instruction is scalar if it is known to be uniform or will be scalarized 1754 /// during vectorization. Non-uniform scalarized instructions will be 1755 /// represented by VF values in the vectorized loop, each corresponding to an 1756 /// iteration of the original scalar loop. 1757 void collectLoopScalars(ElementCount VF); 1758 1759 /// Keeps cost model vectorization decision and cost for instructions. 1760 /// Right now it is used for memory instructions only. 1761 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1762 std::pair<InstWidening, InstructionCost>>; 1763 1764 DecisionList WideningDecisions; 1765 1766 /// Returns true if \p V is expected to be vectorized and it needs to be 1767 /// extracted. 1768 bool needsExtract(Value *V, ElementCount VF) const { 1769 Instruction *I = dyn_cast<Instruction>(V); 1770 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1771 TheLoop->isLoopInvariant(I)) 1772 return false; 1773 1774 // Assume we can vectorize V (and hence we need extraction) if the 1775 // scalars are not computed yet. This can happen, because it is called 1776 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1777 // the scalars are collected. That should be a safe assumption in most 1778 // cases, because we check if the operands have vectorizable types 1779 // beforehand in LoopVectorizationLegality. 1780 return Scalars.find(VF) == Scalars.end() || 1781 !isScalarAfterVectorization(I, VF); 1782 }; 1783 1784 /// Returns a range containing only operands needing to be extracted. 1785 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1786 ElementCount VF) { 1787 return SmallVector<Value *, 4>(make_filter_range( 1788 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1789 } 1790 1791 /// Determines if we have the infrastructure to vectorize loop \p L and its 1792 /// epilogue, assuming the main loop is vectorized by \p VF. 1793 bool isCandidateForEpilogueVectorization(const Loop &L, 1794 const ElementCount VF) const; 1795 1796 /// Returns true if epilogue vectorization is considered profitable, and 1797 /// false otherwise. 1798 /// \p VF is the vectorization factor chosen for the original loop. 1799 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1800 1801 public: 1802 /// The loop that we evaluate. 1803 Loop *TheLoop; 1804 1805 /// Predicated scalar evolution analysis. 1806 PredicatedScalarEvolution &PSE; 1807 1808 /// Loop Info analysis. 1809 LoopInfo *LI; 1810 1811 /// Vectorization legality. 1812 LoopVectorizationLegality *Legal; 1813 1814 /// Vector target information. 
1815 const TargetTransformInfo &TTI;
1816
1817 /// Target Library Info.
1818 const TargetLibraryInfo *TLI;
1819
1820 /// Demanded bits analysis.
1821 DemandedBits *DB;
1822
1823 /// Assumption cache.
1824 AssumptionCache *AC;
1825
1826 /// Interface to emit optimization remarks.
1827 OptimizationRemarkEmitter *ORE;
1828
/// The function that contains the loop to vectorize.
1829 const Function *TheFunction;
1830
1831 /// Loop Vectorize Hint.
1832 const LoopVectorizeHints *Hints;
1833
1834 /// The interleaved access information contains groups of interleaved accesses
1835 /// with the same stride and close to each other.
1836 InterleavedAccessInfo &InterleaveInfo;
1837
1838 /// Values to ignore in the cost model.
1839 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1840
1841 /// Values to ignore in the cost model when VF > 1.
1842 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1843
1844 /// Profitable vector factors.
1845 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1846 };
1847 } // end namespace llvm
1848
1849 /// Helper struct to manage generating runtime checks for vectorization.
1850 ///
1851 /// The runtime checks are created up-front in temporary blocks, un-linked from
1852 /// the existing IR, to allow a better estimate of their cost. After deciding to
1853 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1854 /// temporary blocks are completely removed.
1855 class GeneratedRTChecks {
1856 /// Basic block which contains the generated SCEV checks, if any.
1857 BasicBlock *SCEVCheckBlock = nullptr;
1858
1859 /// The value representing the result of the generated SCEV checks. If it is
1860 /// nullptr, either no SCEV checks have been generated or they have been used.
1861 Value *SCEVCheckCond = nullptr;
1862
1863 /// Basic block which contains the generated memory runtime checks, if any.
1864 BasicBlock *MemCheckBlock = nullptr;
1865
1866 /// The value representing the result of the generated memory runtime checks.
1867 /// If it is nullptr, either no memory runtime checks have been generated or
1868 /// they have been used.
1869 Instruction *MemRuntimeCheckCond = nullptr;
1870
1871 DominatorTree *DT;
1872 LoopInfo *LI;
1873
1874 SCEVExpander SCEVExp;
1875 SCEVExpander MemCheckExp;
1876
1877 public:
1878 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1879 const DataLayout &DL)
1880 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1881 MemCheckExp(SE, DL, "scev.check") {}
1882
1883 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1884 /// accurately estimate the cost of the runtime checks. The blocks are
1885 /// un-linked from the IR and are added back during vector code generation. If
1886 /// there is no vector code generation, the check blocks are removed
1887 /// completely.
1888 void Create(Loop *L, const LoopAccessInfo &LAI,
1889 const SCEVUnionPredicate &UnionPred) {
1890
1891 BasicBlock *LoopHeader = L->getHeader();
1892 BasicBlock *Preheader = L->getLoopPreheader();
1893
1894 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1895 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1896 // may be used by SCEVExpander. The blocks will be un-linked from their
1897 // predecessors and removed from LI & DT at the end of the function.
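// If both kinds of checks are needed, the temporary layout built below is
// roughly (illustrative):
//
//   Preheader -> vector.scevcheck -> vector.memcheck -> original successor
//
// with each check block only created when the corresponding predicate or
// pointer checks are actually required.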
1898 if (!UnionPred.isAlwaysTrue()) { 1899 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1900 nullptr, "vector.scevcheck"); 1901 1902 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1903 &UnionPred, SCEVCheckBlock->getTerminator()); 1904 } 1905 1906 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1907 if (RtPtrChecking.Need) { 1908 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1909 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1910 "vector.memcheck"); 1911 1912 std::tie(std::ignore, MemRuntimeCheckCond) = 1913 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1914 RtPtrChecking.getChecks(), MemCheckExp); 1915 assert(MemRuntimeCheckCond && 1916 "no RT checks generated although RtPtrChecking " 1917 "claimed checks are required"); 1918 } 1919 1920 if (!MemCheckBlock && !SCEVCheckBlock) 1921 return; 1922 1923 // Unhook the temporary block with the checks, update various places 1924 // accordingly. 1925 if (SCEVCheckBlock) 1926 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1927 if (MemCheckBlock) 1928 MemCheckBlock->replaceAllUsesWith(Preheader); 1929 1930 if (SCEVCheckBlock) { 1931 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1932 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1933 Preheader->getTerminator()->eraseFromParent(); 1934 } 1935 if (MemCheckBlock) { 1936 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1937 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1938 Preheader->getTerminator()->eraseFromParent(); 1939 } 1940 1941 DT->changeImmediateDominator(LoopHeader, Preheader); 1942 if (MemCheckBlock) { 1943 DT->eraseNode(MemCheckBlock); 1944 LI->removeBlock(MemCheckBlock); 1945 } 1946 if (SCEVCheckBlock) { 1947 DT->eraseNode(SCEVCheckBlock); 1948 LI->removeBlock(SCEVCheckBlock); 1949 } 1950 } 1951 1952 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1953 /// unused. 1954 ~GeneratedRTChecks() { 1955 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1956 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1957 if (!SCEVCheckCond) 1958 SCEVCleaner.markResultUsed(); 1959 1960 if (!MemRuntimeCheckCond) 1961 MemCheckCleaner.markResultUsed(); 1962 1963 if (MemRuntimeCheckCond) { 1964 auto &SE = *MemCheckExp.getSE(); 1965 // Memory runtime check generation creates compares that use expanded 1966 // values. Remove them before running the SCEVExpanderCleaners. 1967 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1968 if (MemCheckExp.isInsertedInstruction(&I)) 1969 continue; 1970 SE.forgetValue(&I); 1971 SE.eraseValueFromMap(&I); 1972 I.eraseFromParent(); 1973 } 1974 } 1975 MemCheckCleaner.cleanup(); 1976 SCEVCleaner.cleanup(); 1977 1978 if (SCEVCheckCond) 1979 SCEVCheckBlock->eraseFromParent(); 1980 if (MemRuntimeCheckCond) 1981 MemCheckBlock->eraseFromParent(); 1982 } 1983 1984 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1985 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1986 /// depending on the generated condition. 
1987 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 1988 BasicBlock *LoopVectorPreHeader, 1989 BasicBlock *LoopExitBlock) { 1990 if (!SCEVCheckCond) 1991 return nullptr; 1992 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 1993 if (C->isZero()) 1994 return nullptr; 1995 1996 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 1997 1998 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 1999 // Create new preheader for vector loop. 2000 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2001 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2002 2003 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2004 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2005 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2006 SCEVCheckBlock); 2007 2008 DT->addNewBlock(SCEVCheckBlock, Pred); 2009 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2010 2011 ReplaceInstWithInst( 2012 SCEVCheckBlock->getTerminator(), 2013 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2014 // Mark the check as used, to prevent it from being removed during cleanup. 2015 SCEVCheckCond = nullptr; 2016 return SCEVCheckBlock; 2017 } 2018 2019 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2020 /// the branches to branch to the vector preheader or \p Bypass, depending on 2021 /// the generated condition. 2022 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2023 BasicBlock *LoopVectorPreHeader) { 2024 // Check if we generated code that checks in runtime if arrays overlap. 2025 if (!MemRuntimeCheckCond) 2026 return nullptr; 2027 2028 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2029 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2030 MemCheckBlock); 2031 2032 DT->addNewBlock(MemCheckBlock, Pred); 2033 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2034 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2035 2036 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2037 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2038 2039 ReplaceInstWithInst( 2040 MemCheckBlock->getTerminator(), 2041 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2042 MemCheckBlock->getTerminator()->setDebugLoc( 2043 Pred->getTerminator()->getDebugLoc()); 2044 2045 // Mark the check as used, to prevent it from being removed during cleanup. 2046 MemRuntimeCheckCond = nullptr; 2047 return MemCheckBlock; 2048 } 2049 }; 2050 2051 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2052 // vectorization. The loop needs to be annotated with #pragma omp simd 2053 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2054 // vector length information is not provided, vectorization is not considered 2055 // explicit. Interleave hints are not allowed either. These limitations will be 2056 // relaxed in the future. 2057 // Please, note that we are currently forced to abuse the pragma 'clang 2058 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2059 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2060 // provides *explicit vectorization hints* (LV can bypass legal checks and 2061 // assume that vectorization is legal). However, both hints are implemented 2062 // using the same metadata (llvm.loop.vectorize, processed by 2063 // LoopVectorizeHints). This will be fixed in the future when the native IR 2064 // representation for pragma 'omp simd' is introduced. 
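// For example (illustrative only), an outer loop annotated as follows would be
// considered for explicit outer-loop vectorization when the VPlan-native path
// is enabled:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)       // outer loop handled here
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];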
2065 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2066 OptimizationRemarkEmitter *ORE) { 2067 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2068 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2069 2070 // Only outer loops with an explicit vectorization hint are supported. 2071 // Unannotated outer loops are ignored. 2072 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2073 return false; 2074 2075 Function *Fn = OuterLp->getHeader()->getParent(); 2076 if (!Hints.allowVectorization(Fn, OuterLp, 2077 true /*VectorizeOnlyWhenForced*/)) { 2078 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2079 return false; 2080 } 2081 2082 if (Hints.getInterleave() > 1) { 2083 // TODO: Interleave support is future work. 2084 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2085 "outer loops.\n"); 2086 Hints.emitRemarkWithHints(); 2087 return false; 2088 } 2089 2090 return true; 2091 } 2092 2093 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2094 OptimizationRemarkEmitter *ORE, 2095 SmallVectorImpl<Loop *> &V) { 2096 // Collect inner loops and outer loops without irreducible control flow. For 2097 // now, only collect outer loops that have explicit vectorization hints. If we 2098 // are stress testing the VPlan H-CFG construction, we collect the outermost 2099 // loop of every loop nest. 2100 if (L.isInnermost() || VPlanBuildStressTest || 2101 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2102 LoopBlocksRPO RPOT(&L); 2103 RPOT.perform(LI); 2104 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2105 V.push_back(&L); 2106 // TODO: Collect inner loops inside marked outer loops in case 2107 // vectorization fails for the outer loop. Do not invoke 2108 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2109 // already known to be reducible. We can use an inherited attribute for 2110 // that. 2111 return; 2112 } 2113 } 2114 for (Loop *InnerL : L) 2115 collectSupportedLoops(*InnerL, LI, ORE, V); 2116 } 2117 2118 namespace { 2119 2120 /// The LoopVectorize Pass. 2121 struct LoopVectorize : public FunctionPass { 2122 /// Pass identification, replacement for typeid 2123 static char ID; 2124 2125 LoopVectorizePass Impl; 2126 2127 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2128 bool VectorizeOnlyWhenForced = false) 2129 : FunctionPass(ID), 2130 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2131 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2132 } 2133 2134 bool runOnFunction(Function &F) override { 2135 if (skipFunction(F)) 2136 return false; 2137 2138 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2139 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2140 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2141 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2142 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2143 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2144 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2145 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2146 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2147 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2148 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2149 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2150 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2151 2152 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2153 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2154 2155 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2156 GetLAA, *ORE, PSI).MadeAnyChange; 2157 } 2158 2159 void getAnalysisUsage(AnalysisUsage &AU) const override { 2160 AU.addRequired<AssumptionCacheTracker>(); 2161 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2162 AU.addRequired<DominatorTreeWrapperPass>(); 2163 AU.addRequired<LoopInfoWrapperPass>(); 2164 AU.addRequired<ScalarEvolutionWrapperPass>(); 2165 AU.addRequired<TargetTransformInfoWrapperPass>(); 2166 AU.addRequired<AAResultsWrapperPass>(); 2167 AU.addRequired<LoopAccessLegacyAnalysis>(); 2168 AU.addRequired<DemandedBitsWrapperPass>(); 2169 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2170 AU.addRequired<InjectTLIMappingsLegacy>(); 2171 2172 // We currently do not preserve loopinfo/dominator analyses with outer loop 2173 // vectorization. Until this is addressed, mark these analyses as preserved 2174 // only for non-VPlan-native path. 2175 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2176 if (!EnableVPlanNativePath) { 2177 AU.addPreserved<LoopInfoWrapperPass>(); 2178 AU.addPreserved<DominatorTreeWrapperPass>(); 2179 } 2180 2181 AU.addPreserved<BasicAAWrapperPass>(); 2182 AU.addPreserved<GlobalsAAWrapperPass>(); 2183 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2184 } 2185 }; 2186 2187 } // end anonymous namespace 2188 2189 //===----------------------------------------------------------------------===// 2190 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2191 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2192 //===----------------------------------------------------------------------===// 2193 2194 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2195 // We need to place the broadcast of invariant variables outside the loop, 2196 // but only if it's proven safe to do so. Else, broadcast will be inside 2197 // vector loop body. 2198 Instruction *Instr = dyn_cast<Instruction>(V); 2199 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2200 (!Instr || 2201 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2202 // Place the code for broadcasting invariant variables in the new preheader. 2203 IRBuilder<>::InsertPointGuard Guard(Builder); 2204 if (SafeToHoist) 2205 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2206 2207 // Broadcast the scalar into all locations in the vector. 
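// E.g. for VF = 4 and a uniform i32 %x, the splat below produces IR along the
// lines of (names are illustrative):
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer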
2208 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2209 2210 return Shuf; 2211 } 2212 2213 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2214 const InductionDescriptor &II, Value *Step, Value *Start, 2215 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2216 VPTransformState &State) { 2217 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2218 "Expected either an induction phi-node or a truncate of it!"); 2219 2220 // Construct the initial value of the vector IV in the vector loop preheader 2221 auto CurrIP = Builder.saveIP(); 2222 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2223 if (isa<TruncInst>(EntryVal)) { 2224 assert(Start->getType()->isIntegerTy() && 2225 "Truncation requires an integer type"); 2226 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2227 Step = Builder.CreateTrunc(Step, TruncType); 2228 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2229 } 2230 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2231 Value *SteppedStart = 2232 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2233 2234 // We create vector phi nodes for both integer and floating-point induction 2235 // variables. Here, we determine the kind of arithmetic we will perform. 2236 Instruction::BinaryOps AddOp; 2237 Instruction::BinaryOps MulOp; 2238 if (Step->getType()->isIntegerTy()) { 2239 AddOp = Instruction::Add; 2240 MulOp = Instruction::Mul; 2241 } else { 2242 AddOp = II.getInductionOpcode(); 2243 MulOp = Instruction::FMul; 2244 } 2245 2246 // Multiply the vectorization factor by the step using integer or 2247 // floating-point arithmetic as appropriate. 2248 Value *ConstVF = 2249 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2250 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2251 2252 // Create a vector splat to use in the induction update. 2253 // 2254 // FIXME: If the step is non-constant, we create the vector splat with 2255 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2256 // handle a constant vector splat. 2257 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2258 Value *SplatVF = isa<Constant>(Mul) 2259 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2260 : Builder.CreateVectorSplat(VF, Mul); 2261 Builder.restoreIP(CurrIP); 2262 2263 // We may need to add the step a number of times, depending on the unroll 2264 // factor. The last of those goes into the PHI. 2265 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2266 &*LoopVectorBody->getFirstInsertionPt()); 2267 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2268 Instruction *LastInduction = VecInd; 2269 for (unsigned Part = 0; Part < UF; ++Part) { 2270 State.set(Def, LastInduction, Part); 2271 2272 if (isa<TruncInst>(EntryVal)) 2273 addMetadata(LastInduction, EntryVal); 2274 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2275 State, Part); 2276 2277 LastInduction = cast<Instruction>(addFastMathFlag( 2278 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 2279 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2280 } 2281 2282 // Move the last step to the end of the latch block. This ensures consistent 2283 // placement of all induction updates. 
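// E.g. (illustrative) for an i32 IV starting at 0 with unit step, VF = 4 and
// UF = 2, the code built above amounts to:
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %preheader ],
//                                 [ %vec.ind.next, %latch ]
//   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
// and the final update (%vec.ind.next) is what gets moved into the latch below.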
2284 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2285 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2286 auto *ICmp = cast<Instruction>(Br->getCondition()); 2287 LastInduction->moveBefore(ICmp); 2288 LastInduction->setName("vec.ind.next"); 2289 2290 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2291 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2292 } 2293 2294 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2295 return Cost->isScalarAfterVectorization(I, VF) || 2296 Cost->isProfitableToScalarize(I, VF); 2297 } 2298 2299 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2300 if (shouldScalarizeInstruction(IV)) 2301 return true; 2302 auto isScalarInst = [&](User *U) -> bool { 2303 auto *I = cast<Instruction>(U); 2304 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2305 }; 2306 return llvm::any_of(IV->users(), isScalarInst); 2307 } 2308 2309 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2310 const InductionDescriptor &ID, const Instruction *EntryVal, 2311 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2312 unsigned Part, unsigned Lane) { 2313 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2314 "Expected either an induction phi-node or a truncate of it!"); 2315 2316 // This induction variable is not the phi from the original loop but the 2317 // newly-created IV based on the proof that casted Phi is equal to the 2318 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2319 // re-uses the same InductionDescriptor that original IV uses but we don't 2320 // have to do any recording in this case - that is done when original IV is 2321 // processed. 2322 if (isa<TruncInst>(EntryVal)) 2323 return; 2324 2325 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2326 if (Casts.empty()) 2327 return; 2328 // Only the first Cast instruction in the Casts vector is of interest. 2329 // The rest of the Casts (if exist) have no uses outside the 2330 // induction update chain itself. 2331 if (Lane < UINT_MAX) 2332 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2333 else 2334 State.set(CastDef, VectorLoopVal, Part); 2335 } 2336 2337 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2338 TruncInst *Trunc, VPValue *Def, 2339 VPValue *CastDef, 2340 VPTransformState &State) { 2341 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2342 "Primary induction variable must have an integer type"); 2343 2344 auto II = Legal->getInductionVars().find(IV); 2345 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2346 2347 auto ID = II->second; 2348 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2349 2350 // The value from the original loop to which we are mapping the new induction 2351 // variable. 2352 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2353 2354 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2355 2356 // Generate code for the induction step. 
Note that induction steps are 2357 // required to be loop-invariant 2358 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2359 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2360 "Induction step should be loop invariant"); 2361 if (PSE.getSE()->isSCEVable(IV->getType())) { 2362 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2363 return Exp.expandCodeFor(Step, Step->getType(), 2364 LoopVectorPreHeader->getTerminator()); 2365 } 2366 return cast<SCEVUnknown>(Step)->getValue(); 2367 }; 2368 2369 // The scalar value to broadcast. This is derived from the canonical 2370 // induction variable. If a truncation type is given, truncate the canonical 2371 // induction variable and step. Otherwise, derive these values from the 2372 // induction descriptor. 2373 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2374 Value *ScalarIV = Induction; 2375 if (IV != OldInduction) { 2376 ScalarIV = IV->getType()->isIntegerTy() 2377 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2378 : Builder.CreateCast(Instruction::SIToFP, Induction, 2379 IV->getType()); 2380 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2381 ScalarIV->setName("offset.idx"); 2382 } 2383 if (Trunc) { 2384 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2385 assert(Step->getType()->isIntegerTy() && 2386 "Truncation requires an integer step"); 2387 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2388 Step = Builder.CreateTrunc(Step, TruncType); 2389 } 2390 return ScalarIV; 2391 }; 2392 2393 // Create the vector values from the scalar IV, in the absence of creating a 2394 // vector IV. 2395 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2396 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2397 for (unsigned Part = 0; Part < UF; ++Part) { 2398 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2399 Value *EntryPart = 2400 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2401 ID.getInductionOpcode()); 2402 State.set(Def, EntryPart, Part); 2403 if (Trunc) 2404 addMetadata(EntryPart, Trunc); 2405 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2406 State, Part); 2407 } 2408 }; 2409 2410 // Now do the actual transformations, and start with creating the step value. 2411 Value *Step = CreateStepValue(ID.getStep()); 2412 if (VF.isZero() || VF.isScalar()) { 2413 Value *ScalarIV = CreateScalarIV(Step); 2414 CreateSplatIV(ScalarIV, Step); 2415 return; 2416 } 2417 2418 // Determine if we want a scalar version of the induction variable. This is 2419 // true if the induction variable itself is not widened, or if it has at 2420 // least one user in the loop that is not widened. 2421 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2422 if (!NeedsScalarIV) { 2423 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2424 State); 2425 return; 2426 } 2427 2428 // Try to create a new independent vector induction variable. If we can't 2429 // create the phi node, we will splat the scalar induction variable in each 2430 // loop iteration. 2431 if (!shouldScalarizeInstruction(EntryVal)) { 2432 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2433 State); 2434 Value *ScalarIV = CreateScalarIV(Step); 2435 // Create scalar steps that can be used by instructions we will later 2436 // scalarize. Note that the addition of the scalar steps will not increase 2437 // the number of instructions in the loop in the common case prior to 2438 // InstCombine. 
We will be trading one vector extract for each scalar step. 2439 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2440 return; 2441 } 2442 2443 // All IV users are scalar instructions, so only emit a scalar IV, not a 2444 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2445 // predicate used by the masked loads/stores. 2446 Value *ScalarIV = CreateScalarIV(Step); 2447 if (!Cost->isScalarEpilogueAllowed()) 2448 CreateSplatIV(ScalarIV, Step); 2449 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2450 } 2451 2452 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2453 Instruction::BinaryOps BinOp) { 2454 // Create and check the types. 2455 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2456 int VLen = ValVTy->getNumElements(); 2457 2458 Type *STy = Val->getType()->getScalarType(); 2459 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2460 "Induction Step must be an integer or FP"); 2461 assert(Step->getType() == STy && "Step has wrong type"); 2462 2463 SmallVector<Constant *, 8> Indices; 2464 2465 if (STy->isIntegerTy()) { 2466 // Create a vector of consecutive numbers from zero to VF. 2467 for (int i = 0; i < VLen; ++i) 2468 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2469 2470 // Add the consecutive indices to the vector value. 2471 Constant *Cv = ConstantVector::get(Indices); 2472 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2473 Step = Builder.CreateVectorSplat(VLen, Step); 2474 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2475 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2476 // which can be found from the original scalar operations. 2477 Step = Builder.CreateMul(Cv, Step); 2478 return Builder.CreateAdd(Val, Step, "induction"); 2479 } 2480 2481 // Floating point induction. 2482 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2483 "Binary Opcode should be specified for FP induction"); 2484 // Create a vector of consecutive numbers from zero to VF. 2485 for (int i = 0; i < VLen; ++i) 2486 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2487 2488 // Add the consecutive indices to the vector value. 2489 Constant *Cv = ConstantVector::get(Indices); 2490 2491 Step = Builder.CreateVectorSplat(VLen, Step); 2492 2493 // Floating point operations had to be 'fast' to enable the induction. 2494 FastMathFlags Flags; 2495 Flags.setFast(); 2496 2497 Value *MulOp = Builder.CreateFMul(Cv, Step); 2498 if (isa<Instruction>(MulOp)) 2499 // Have to check, MulOp may be a constant 2500 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2501 2502 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2503 if (isa<Instruction>(BOp)) 2504 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2505 return BOp; 2506 } 2507 2508 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2509 Instruction *EntryVal, 2510 const InductionDescriptor &ID, 2511 VPValue *Def, VPValue *CastDef, 2512 VPTransformState &State) { 2513 // We shouldn't have to build scalar steps if we aren't vectorizing. 2514 assert(VF.isVector() && "VF should be greater than one"); 2515 // Get the value type and ensure it and the step have the same integer type. 
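// For example (illustrative): with VF = 4, UF = 1 and a non-uniform user, the
// loop nest below emits four scalar steps,
//   %scalar.iv + 0 * %step, %scalar.iv + 1 * %step,
//   %scalar.iv + 2 * %step, %scalar.iv + 3 * %step,
// one per lane; a user that is uniform after vectorization only needs the
// lane-0 value.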
2516 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2517 assert(ScalarIVTy == Step->getType() && 2518 "Val and Step should have the same type"); 2519 2520 // We build scalar steps for both integer and floating-point induction 2521 // variables. Here, we determine the kind of arithmetic we will perform. 2522 Instruction::BinaryOps AddOp; 2523 Instruction::BinaryOps MulOp; 2524 if (ScalarIVTy->isIntegerTy()) { 2525 AddOp = Instruction::Add; 2526 MulOp = Instruction::Mul; 2527 } else { 2528 AddOp = ID.getInductionOpcode(); 2529 MulOp = Instruction::FMul; 2530 } 2531 2532 // Determine the number of scalars we need to generate for each unroll 2533 // iteration. If EntryVal is uniform, we only need to generate the first 2534 // lane. Otherwise, we generate all VF values. 2535 unsigned Lanes = 2536 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2537 ? 1 2538 : VF.getKnownMinValue(); 2539 assert((!VF.isScalable() || Lanes == 1) && 2540 "Should never scalarize a scalable vector"); 2541 // Compute the scalar steps and save the results in State. 2542 for (unsigned Part = 0; Part < UF; ++Part) { 2543 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2544 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2545 ScalarIVTy->getScalarSizeInBits()); 2546 Value *StartIdx = 2547 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2548 if (ScalarIVTy->isFloatingPointTy()) 2549 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2550 StartIdx = addFastMathFlag(Builder.CreateBinOp( 2551 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); 2552 // The step returned by `createStepForVF` is a runtime-evaluated value 2553 // when VF is scalable. Otherwise, it should be folded into a Constant. 2554 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2555 "Expected StartIdx to be folded to a constant when VF is not " 2556 "scalable"); 2557 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2558 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2559 State.set(Def, Add, VPIteration(Part, Lane)); 2560 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2561 Part, Lane); 2562 } 2563 } 2564 } 2565 2566 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2567 const VPIteration &Instance, 2568 VPTransformState &State) { 2569 Value *ScalarInst = State.get(Def, Instance); 2570 Value *VectorValue = State.get(Def, Instance.Part); 2571 VectorValue = Builder.CreateInsertElement( 2572 VectorValue, ScalarInst, State.Builder.getInt32(Instance.Lane)); 2573 State.set(Def, VectorValue, Instance.Part); 2574 } 2575 2576 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2577 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2578 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2579 SmallVector<int, 8> ShuffleMask; 2580 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2581 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2582 2583 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2584 } 2585 2586 // Return whether we allow using masked interleave-groups (for dealing with 2587 // strided loads/stores that reside in predicated blocks, or for dealing 2588 // with gaps). 2589 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2590 // If an override option has been passed in for interleaved accesses, use it. 
2591 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2592 return EnableMaskedInterleavedMemAccesses; 2593 2594 return TTI.enableMaskedInterleavedAccessVectorization(); 2595 } 2596 2597 // Try to vectorize the interleave group that \p Instr belongs to. 2598 // 2599 // E.g. Translate following interleaved load group (factor = 3): 2600 // for (i = 0; i < N; i+=3) { 2601 // R = Pic[i]; // Member of index 0 2602 // G = Pic[i+1]; // Member of index 1 2603 // B = Pic[i+2]; // Member of index 2 2604 // ... // do something to R, G, B 2605 // } 2606 // To: 2607 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2608 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2609 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2610 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2611 // 2612 // Or translate following interleaved store group (factor = 3): 2613 // for (i = 0; i < N; i+=3) { 2614 // ... do something to R, G, B 2615 // Pic[i] = R; // Member of index 0 2616 // Pic[i+1] = G; // Member of index 1 2617 // Pic[i+2] = B; // Member of index 2 2618 // } 2619 // To: 2620 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2621 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2622 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2623 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2624 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2625 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2626 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2627 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2628 VPValue *BlockInMask) { 2629 Instruction *Instr = Group->getInsertPos(); 2630 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2631 2632 // Prepare for the vector type of the interleaved load/store. 2633 Type *ScalarTy = getMemInstValueType(Instr); 2634 unsigned InterleaveFactor = Group->getFactor(); 2635 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2636 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2637 2638 // Prepare for the new pointers. 2639 SmallVector<Value *, 2> AddrParts; 2640 unsigned Index = Group->getIndex(Instr); 2641 2642 // TODO: extend the masked interleaved-group support to reversed access. 2643 assert((!BlockInMask || !Group->isReverse()) && 2644 "Reversed masked interleave-group not supported."); 2645 2646 // If the group is reverse, adjust the index to refer to the last vector lane 2647 // instead of the first. We adjust the index from the first vector lane, 2648 // rather than directly getting the pointer for lane VF - 1, because the 2649 // pointer operand of the interleaved access is supposed to be uniform. For 2650 // uniform instructions, we're only required to generate a value for the 2651 // first vector lane in each unroll iteration. 2652 assert(!VF.isScalable() && 2653 "scalable vector reverse operation is not implemented"); 2654 if (Group->isReverse()) 2655 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2656 2657 for (unsigned Part = 0; Part < UF; Part++) { 2658 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2659 setDebugLocFromInst(Builder, AddrPart); 2660 2661 // Notice current instruction could be any index. Need to adjust the address 2662 // to the member of index 0. 2663 // 2664 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2665 // b = A[i]; // Member of index 0 2666 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2667 // 2668 // E.g. A[i+1] = a; // Member of index 1 2669 // A[i] = b; // Member of index 0 2670 // A[i+2] = c; // Member of index 2 (Current instruction) 2671 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2672 2673 bool InBounds = false; 2674 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2675 InBounds = gep->isInBounds(); 2676 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2677 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2678 2679 // Cast to the vector pointer type. 2680 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2681 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2682 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2683 } 2684 2685 setDebugLocFromInst(Builder, Instr); 2686 Value *PoisonVec = PoisonValue::get(VecTy); 2687 2688 Value *MaskForGaps = nullptr; 2689 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2690 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2691 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2692 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2693 } 2694 2695 // Vectorize the interleaved load group. 2696 if (isa<LoadInst>(Instr)) { 2697 // For each unroll part, create a wide load for the group. 2698 SmallVector<Value *, 2> NewLoads; 2699 for (unsigned Part = 0; Part < UF; Part++) { 2700 Instruction *NewLoad; 2701 if (BlockInMask || MaskForGaps) { 2702 assert(useMaskedInterleavedAccesses(*TTI) && 2703 "masked interleaved groups are not allowed."); 2704 Value *GroupMask = MaskForGaps; 2705 if (BlockInMask) { 2706 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2707 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2708 Value *ShuffledMask = Builder.CreateShuffleVector( 2709 BlockInMaskPart, 2710 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2711 "interleaved.mask"); 2712 GroupMask = MaskForGaps 2713 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2714 MaskForGaps) 2715 : ShuffledMask; 2716 } 2717 NewLoad = 2718 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2719 GroupMask, PoisonVec, "wide.masked.vec"); 2720 } 2721 else 2722 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2723 Group->getAlign(), "wide.vec"); 2724 Group->addMetadata(NewLoad); 2725 NewLoads.push_back(NewLoad); 2726 } 2727 2728 // For each member in the group, shuffle out the appropriate data from the 2729 // wide loads. 2730 unsigned J = 0; 2731 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2732 Instruction *Member = Group->getMember(I); 2733 2734 // Skip the gaps in the group. 2735 if (!Member) 2736 continue; 2737 2738 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2739 auto StrideMask = 2740 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2741 for (unsigned Part = 0; Part < UF; Part++) { 2742 Value *StridedVec = Builder.CreateShuffleVector( 2743 NewLoads[Part], StrideMask, "strided.vec"); 2744 2745 // If this member has different type, cast the result type. 
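// Illustrative sketch (assumed example, not from the original source): with
// VF = 4 and a group mixing i32 and float members of equal size, the strided
// value extracted above would be rewritten by createBitOrPointerCast, e.g.
//   %strided.cast = bitcast <4 x i32> %strided.vec to <4 x float>
// before being recorded for the float member's users.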
2746 if (Member->getType() != ScalarTy) { 2747 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2748 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2749 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2750 } 2751 2752 if (Group->isReverse()) 2753 StridedVec = reverseVector(StridedVec); 2754 2755 State.set(VPDefs[J], StridedVec, Part); 2756 } 2757 ++J; 2758 } 2759 return; 2760 } 2761 2762 // The sub vector type for current instruction. 2763 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2764 auto *SubVT = VectorType::get(ScalarTy, VF); 2765 2766 // Vectorize the interleaved store group. 2767 for (unsigned Part = 0; Part < UF; Part++) { 2768 // Collect the stored vector from each member. 2769 SmallVector<Value *, 4> StoredVecs; 2770 for (unsigned i = 0; i < InterleaveFactor; i++) { 2771 // Interleaved store group doesn't allow a gap, so each index has a member 2772 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2773 2774 Value *StoredVec = State.get(StoredValues[i], Part); 2775 2776 if (Group->isReverse()) 2777 StoredVec = reverseVector(StoredVec); 2778 2779 // If this member has different type, cast it to a unified type. 2780 2781 if (StoredVec->getType() != SubVT) 2782 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2783 2784 StoredVecs.push_back(StoredVec); 2785 } 2786 2787 // Concatenate all vectors into a wide vector. 2788 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2789 2790 // Interleave the elements in the wide vector. 2791 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2792 Value *IVec = Builder.CreateShuffleVector( 2793 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2794 "interleaved.vec"); 2795 2796 Instruction *NewStoreInstr; 2797 if (BlockInMask) { 2798 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2799 Value *ShuffledMask = Builder.CreateShuffleVector( 2800 BlockInMaskPart, 2801 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2802 "interleaved.mask"); 2803 NewStoreInstr = Builder.CreateMaskedStore( 2804 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2805 } 2806 else 2807 NewStoreInstr = 2808 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2809 2810 Group->addMetadata(NewStoreInstr); 2811 } 2812 } 2813 2814 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2815 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2816 VPValue *StoredValue, VPValue *BlockInMask) { 2817 // Attempt to issue a wide load. 
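// Illustrative sketch of the widening decisions handled below (shorthand only,
// assuming VF = 4, UF = 1; names are made up, not actual generated output):
//   CM_Widen:         %wide.load = load <4 x i32>, <4 x i32>* %vec.ptr
//   CM_Widen_Reverse: the pointer is moved back to the lane VF-1 address, and
//                     both the loaded vector and any mask are reversed
//   CM_GatherScatter: a masked gather/scatter intrinsic is emitted on a
//                     vector of pointers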
2818 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2819 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2820 2821 assert((LI || SI) && "Invalid Load/Store instruction"); 2822 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2823 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2824 2825 LoopVectorizationCostModel::InstWidening Decision = 2826 Cost->getWideningDecision(Instr, VF); 2827 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2828 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2829 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2830 "CM decision is not to widen the memory instruction"); 2831 2832 Type *ScalarDataTy = getMemInstValueType(Instr); 2833 2834 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2835 const Align Alignment = getLoadStoreAlignment(Instr); 2836 2837 // Determine if the pointer operand of the access is either consecutive or 2838 // reverse consecutive. 2839 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2840 bool ConsecutiveStride = 2841 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2842 bool CreateGatherScatter = 2843 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2844 2845 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2846 // gather/scatter. Otherwise Decision should have been to Scalarize. 2847 assert((ConsecutiveStride || CreateGatherScatter) && 2848 "The instruction should be scalarized"); 2849 (void)ConsecutiveStride; 2850 2851 VectorParts BlockInMaskParts(UF); 2852 bool isMaskRequired = BlockInMask; 2853 if (isMaskRequired) 2854 for (unsigned Part = 0; Part < UF; ++Part) 2855 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2856 2857 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2858 // Calculate the pointer for the specific unroll-part. 2859 GetElementPtrInst *PartPtr = nullptr; 2860 2861 bool InBounds = false; 2862 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2863 InBounds = gep->isInBounds(); 2864 2865 if (Reverse) { 2866 assert(!VF.isScalable() && 2867 "Reversing vectors is not yet supported for scalable vectors."); 2868 2869 // If the address is consecutive but reversed, then the 2870 // wide store needs to start at the last vector element. 2871 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2872 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2873 PartPtr->setIsInBounds(InBounds); 2874 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2875 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2876 PartPtr->setIsInBounds(InBounds); 2877 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2878 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2879 } else { 2880 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2881 PartPtr = cast<GetElementPtrInst>( 2882 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2883 PartPtr->setIsInBounds(InBounds); 2884 } 2885 2886 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2887 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2888 }; 2889 2890 // Handle Stores: 2891 if (SI) { 2892 setDebugLocFromInst(Builder, SI); 2893 2894 for (unsigned Part = 0; Part < UF; ++Part) { 2895 Instruction *NewSI = nullptr; 2896 Value *StoredVal = State.get(StoredValue, Part); 2897 if (CreateGatherScatter) { 2898 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2899 Value *VectorGep = State.get(Addr, Part); 2900 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2901 MaskPart); 2902 } else { 2903 if (Reverse) { 2904 // If we store to reverse consecutive memory locations, then we need 2905 // to reverse the order of elements in the stored value. 2906 StoredVal = reverseVector(StoredVal); 2907 // We don't want to update the value in the map as it might be used in 2908 // another expression. So don't call resetVectorValue(StoredVal). 2909 } 2910 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2911 if (isMaskRequired) 2912 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2913 BlockInMaskParts[Part]); 2914 else 2915 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2916 } 2917 addMetadata(NewSI, SI); 2918 } 2919 return; 2920 } 2921 2922 // Handle loads. 2923 assert(LI && "Must have a load instruction"); 2924 setDebugLocFromInst(Builder, LI); 2925 for (unsigned Part = 0; Part < UF; ++Part) { 2926 Value *NewLI; 2927 if (CreateGatherScatter) { 2928 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2929 Value *VectorGep = State.get(Addr, Part); 2930 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2931 nullptr, "wide.masked.gather"); 2932 addMetadata(NewLI, LI); 2933 } else { 2934 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2935 if (isMaskRequired) 2936 NewLI = Builder.CreateMaskedLoad( 2937 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2938 "wide.masked.load"); 2939 else 2940 NewLI = 2941 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2942 2943 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2944 addMetadata(NewLI, LI); 2945 if (Reverse) 2946 NewLI = reverseVector(NewLI); 2947 } 2948 2949 State.set(Def, NewLI, Part); 2950 } 2951 } 2952 2953 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 2954 VPUser &User, 2955 const VPIteration &Instance, 2956 bool IfPredicateInstr, 2957 VPTransformState &State) { 2958 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2959 2960 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2961 // the first lane and part. 2962 if (isa<NoAliasScopeDeclInst>(Instr)) 2963 if (!Instance.isFirstIteration()) 2964 return; 2965 2966 setDebugLocFromInst(Builder, Instr); 2967 2968 // Does this instruction return a value ? 2969 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2970 2971 Instruction *Cloned = Instr->clone(); 2972 if (!IsVoidRetTy) 2973 Cloned->setName(Instr->getName() + ".cloned"); 2974 2975 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2976 Builder.GetInsertPoint()); 2977 // Replace the operands of the cloned instructions with their scalar 2978 // equivalents in the new loop. 2979 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2980 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2981 auto InputInstance = Instance; 2982 if (!Operand || !OrigLoop->contains(Operand) || 2983 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2984 InputInstance.Lane = 0; 2985 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2986 Cloned->setOperand(op, NewOp); 2987 } 2988 addNewMetadata(Cloned, Instr); 2989 2990 // Place the cloned scalar in the new loop. 
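// For illustration only (assumed names): a predicated "%d = udiv i32 %x, %y"
// is cloned here once per (part, lane) as "%d.cloned = udiv i32 %x.s, %y.s",
// where %x.s / %y.s are the scalar per-lane values looked up from State above;
// predicated clones are also queued in PredicatedInstructions below so they
// can later be sunk into their if-blocks.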
2991 Builder.Insert(Cloned); 2992 2993 State.set(Def, Cloned, Instance); 2994 2995 // If we just cloned a new assumption, add it the assumption cache. 2996 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2997 if (II->getIntrinsicID() == Intrinsic::assume) 2998 AC->registerAssumption(II); 2999 3000 // End if-block. 3001 if (IfPredicateInstr) 3002 PredicatedInstructions.push_back(Cloned); 3003 } 3004 3005 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3006 Value *End, Value *Step, 3007 Instruction *DL) { 3008 BasicBlock *Header = L->getHeader(); 3009 BasicBlock *Latch = L->getLoopLatch(); 3010 // As we're just creating this loop, it's possible no latch exists 3011 // yet. If so, use the header as this will be a single block loop. 3012 if (!Latch) 3013 Latch = Header; 3014 3015 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3016 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3017 setDebugLocFromInst(Builder, OldInst); 3018 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3019 3020 Builder.SetInsertPoint(Latch->getTerminator()); 3021 setDebugLocFromInst(Builder, OldInst); 3022 3023 // Create i+1 and fill the PHINode. 3024 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3025 Induction->addIncoming(Start, L->getLoopPreheader()); 3026 Induction->addIncoming(Next, Latch); 3027 // Create the compare. 3028 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3029 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3030 3031 // Now we have two terminators. Remove the old one from the block. 3032 Latch->getTerminator()->eraseFromParent(); 3033 3034 return Induction; 3035 } 3036 3037 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3038 if (TripCount) 3039 return TripCount; 3040 3041 assert(L && "Create Trip Count for null loop."); 3042 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3043 // Find the loop boundaries. 3044 ScalarEvolution *SE = PSE.getSE(); 3045 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3046 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3047 "Invalid loop count"); 3048 3049 Type *IdxTy = Legal->getWidestInductionType(); 3050 assert(IdxTy && "No type for induction"); 3051 3052 // The exit count might have the type of i64 while the phi is i32. This can 3053 // happen if we have an induction variable that is sign extended before the 3054 // compare. The only way that we get a backedge taken count is that the 3055 // induction variable was signed and as such will not overflow. In such a case 3056 // truncation is legal. 3057 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3058 IdxTy->getPrimitiveSizeInBits()) 3059 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3060 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3061 3062 // Get the total trip count from the count by adding 1. 3063 const SCEV *ExitCount = SE->getAddExpr( 3064 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3065 3066 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3067 3068 // Expand the trip count and place the new instructions in the preheader. 3069 // Notice that the pre-header does not change, only the loop body. 3070 SCEVExpander Exp(*SE, DL, "induction"); 3071 3072 // Count holds the overall loop count (N). 
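// For illustration only (assumed loop shape): for "for (i = 0; i < n; ++i)"
// the backedge-taken count is roughly (n - 1), so the expression expanded
// below is ((n - 1) + 1), after the count has been truncated or zero-extended
// above to match the widest induction type.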
3073 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3074 L->getLoopPreheader()->getTerminator()); 3075 3076 if (TripCount->getType()->isPointerTy()) 3077 TripCount = 3078 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3079 L->getLoopPreheader()->getTerminator()); 3080 3081 return TripCount; 3082 } 3083 3084 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3085 if (VectorTripCount) 3086 return VectorTripCount; 3087 3088 Value *TC = getOrCreateTripCount(L); 3089 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3090 3091 Type *Ty = TC->getType(); 3092 // This is where we can make the step a runtime constant. 3093 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3094 3095 // If the tail is to be folded by masking, round the number of iterations N 3096 // up to a multiple of Step instead of rounding down. This is done by first 3097 // adding Step-1 and then rounding down. Note that it's ok if this addition 3098 // overflows: the vector induction variable will eventually wrap to zero given 3099 // that it starts at zero and its Step is a power of two; the loop will then 3100 // exit, with the last early-exit vector comparison also producing all-true. 3101 if (Cost->foldTailByMasking()) { 3102 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3103 "VF*UF must be a power of 2 when folding tail by masking"); 3104 assert(!VF.isScalable() && 3105 "Tail folding not yet supported for scalable vectors"); 3106 TC = Builder.CreateAdd( 3107 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3108 } 3109 3110 // Now we need to generate the expression for the part of the loop that the 3111 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3112 // iterations are not required for correctness, or N - Step, otherwise. Step 3113 // is equal to the vectorization factor (number of SIMD elements) times the 3114 // unroll factor (number of SIMD instructions). 3115 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3116 3117 // There are two cases where we need to ensure (at least) the last iteration 3118 // runs in the scalar remainder loop. Thus, if the step evenly divides 3119 // the trip count, we set the remainder to be equal to the step. If the step 3120 // does not evenly divide the trip count, no adjustment is necessary since 3121 // there will already be scalar iterations. Note that the minimum iterations 3122 // check ensures that N >= Step. The cases are: 3123 // 1) If there is a non-reversed interleaved group that may speculatively 3124 // access memory out-of-bounds. 3125 // 2) If any instruction may follow a conditionally taken exit. That is, if 3126 // the loop contains multiple exiting blocks, or a single exiting block 3127 // which is not the latch. 3128 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3129 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3130 R = Builder.CreateSelect(IsZero, Step, R); 3131 } 3132 3133 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3134 3135 return VectorTripCount; 3136 } 3137 3138 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3139 const DataLayout &DL) { 3140 // Verify that V is a vector type with same number of elements as DstVTy. 
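// Illustrative sketch (assumed types, not from the original source): when a
// single bitcast is illegal, e.g. casting <4 x i32*> to <4 x double> on a
// target with 64-bit pointers, the code below goes through an integer vector
// instead:
//   %v.int = ptrtoint <4 x i32*> %v to <4 x i64>
//   %v.fp  = bitcast <4 x i64> %v.int to <4 x double>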
3141 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3142 unsigned VF = DstFVTy->getNumElements(); 3143 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3144 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3145 Type *SrcElemTy = SrcVecTy->getElementType(); 3146 Type *DstElemTy = DstFVTy->getElementType(); 3147 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3148 "Vector elements must have same size"); 3149 3150 // Do a direct cast if element types are castable. 3151 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3152 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3153 } 3154 // V cannot be directly casted to desired vector type. 3155 // May happen when V is a floating point vector but DstVTy is a vector of 3156 // pointers or vice-versa. Handle this using a two-step bitcast using an 3157 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3158 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3159 "Only one type should be a pointer type"); 3160 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3161 "Only one type should be a floating point type"); 3162 Type *IntTy = 3163 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3164 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3165 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3166 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3167 } 3168 3169 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3170 BasicBlock *Bypass) { 3171 Value *Count = getOrCreateTripCount(L); 3172 // Reuse existing vector loop preheader for TC checks. 3173 // Note that new preheader block is generated for vector loop. 3174 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3175 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3176 3177 // Generate code to check if the loop's trip count is less than VF * UF, or 3178 // equal to it in case a scalar epilogue is required; this implies that the 3179 // vector trip count is zero. This check also covers the case where adding one 3180 // to the backedge-taken count overflowed leading to an incorrect trip count 3181 // of zero. In this case we will also jump to the scalar loop. 3182 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3183 : ICmpInst::ICMP_ULT; 3184 3185 // If tail is to be folded, vector loop takes care of all iterations. 3186 Value *CheckMinIters = Builder.getFalse(); 3187 if (!Cost->foldTailByMasking()) { 3188 Value *Step = 3189 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3190 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3191 } 3192 // Create new preheader for vector loop. 3193 LoopVectorPreHeader = 3194 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3195 "vector.ph"); 3196 3197 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3198 DT->getNode(Bypass)->getIDom()) && 3199 "TC check is expected to dominate Bypass"); 3200 3201 // Update dominator for Bypass & LoopExit. 
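// Illustrative sketch (assuming VF = 4, UF = 2, fixed-width vectors, and a
// required scalar epilogue; names are made up): once the branch below is
// installed, the TC check block ends with something like
//   %min.iters.check = icmp ule i64 %count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// so trip counts that cannot fill even one vector iteration bypass the vector
// loop entirely.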
3202 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3203 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3204 3205 ReplaceInstWithInst( 3206 TCCheckBlock->getTerminator(), 3207 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3208 LoopBypassBlocks.push_back(TCCheckBlock); 3209 } 3210 3211 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3212 3213 BasicBlock *const SCEVCheckBlock = 3214 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3215 if (!SCEVCheckBlock) 3216 return nullptr; 3217 3218 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3219 (OptForSizeBasedOnProfile && 3220 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3221 "Cannot SCEV check stride or overflow when optimizing for size"); 3222 3223 3224 // Update dominator only if this is first RT check. 3225 if (LoopBypassBlocks.empty()) { 3226 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3227 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3228 } 3229 3230 LoopBypassBlocks.push_back(SCEVCheckBlock); 3231 AddedSafetyChecks = true; 3232 return SCEVCheckBlock; 3233 } 3234 3235 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3236 BasicBlock *Bypass) { 3237 // VPlan-native path does not do any analysis for runtime checks currently. 3238 if (EnableVPlanNativePath) 3239 return nullptr; 3240 3241 BasicBlock *const MemCheckBlock = 3242 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3243 3244 // Check if we generated code that checks in runtime if arrays overlap. We put 3245 // the checks into a separate block to make the more common case of few 3246 // elements faster. 3247 if (!MemCheckBlock) 3248 return nullptr; 3249 3250 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3251 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3252 "Cannot emit memory checks when optimizing for size, unless forced " 3253 "to vectorize."); 3254 ORE->emit([&]() { 3255 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3256 L->getStartLoc(), L->getHeader()) 3257 << "Code-size may be reduced by not forcing " 3258 "vectorization, or by source-code modifications " 3259 "eliminating the need for runtime checks " 3260 "(e.g., adding 'restrict')."; 3261 }); 3262 } 3263 3264 LoopBypassBlocks.push_back(MemCheckBlock); 3265 3266 AddedSafetyChecks = true; 3267 3268 // We currently don't use LoopVersioning for the actual loop cloning but we 3269 // still use it to add the noalias metadata. 3270 LVer = std::make_unique<LoopVersioning>( 3271 *Legal->getLAI(), 3272 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3273 DT, PSE.getSE()); 3274 LVer->prepareNoAliasMetadata(); 3275 return MemCheckBlock; 3276 } 3277 3278 Value *InnerLoopVectorizer::emitTransformedIndex( 3279 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3280 const InductionDescriptor &ID) const { 3281 3282 SCEVExpander Exp(*SE, DL, "induction"); 3283 auto Step = ID.getStep(); 3284 auto StartValue = ID.getStartValue(); 3285 assert(Index->getType() == Step->getType() && 3286 "Index type does not match StepValue type"); 3287 3288 // Note: the IR at this point is broken. We cannot use SE to create any new 3289 // SCEV and then expand it, hoping that SCEV's simplification will give us 3290 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3291 // lead to various SCEV crashes. 
So all we can do is to use builder and rely 3292 // on InstCombine for future simplifications. Here we handle some trivial 3293 // cases only. 3294 auto CreateAdd = [&B](Value *X, Value *Y) { 3295 assert(X->getType() == Y->getType() && "Types don't match!"); 3296 if (auto *CX = dyn_cast<ConstantInt>(X)) 3297 if (CX->isZero()) 3298 return Y; 3299 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3300 if (CY->isZero()) 3301 return X; 3302 return B.CreateAdd(X, Y); 3303 }; 3304 3305 auto CreateMul = [&B](Value *X, Value *Y) { 3306 assert(X->getType() == Y->getType() && "Types don't match!"); 3307 if (auto *CX = dyn_cast<ConstantInt>(X)) 3308 if (CX->isOne()) 3309 return Y; 3310 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3311 if (CY->isOne()) 3312 return X; 3313 return B.CreateMul(X, Y); 3314 }; 3315 3316 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3317 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3318 // the DomTree is not kept up-to-date for additional blocks generated in the 3319 // vector loop. By using the header as insertion point, we guarantee that the 3320 // expanded instructions dominate all their uses. 3321 auto GetInsertPoint = [this, &B]() { 3322 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3323 if (InsertBB != LoopVectorBody && 3324 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3325 return LoopVectorBody->getTerminator(); 3326 return &*B.GetInsertPoint(); 3327 }; 3328 switch (ID.getKind()) { 3329 case InductionDescriptor::IK_IntInduction: { 3330 assert(Index->getType() == StartValue->getType() && 3331 "Index type does not match StartValue type"); 3332 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3333 return B.CreateSub(StartValue, Index); 3334 auto *Offset = CreateMul( 3335 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3336 return CreateAdd(StartValue, Offset); 3337 } 3338 case InductionDescriptor::IK_PtrInduction: { 3339 assert(isa<SCEVConstant>(Step) && 3340 "Expected constant step for pointer induction"); 3341 return B.CreateGEP( 3342 StartValue->getType()->getPointerElementType(), StartValue, 3343 CreateMul(Index, 3344 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3345 } 3346 case InductionDescriptor::IK_FpInduction: { 3347 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3348 auto InductionBinOp = ID.getInductionBinOp(); 3349 assert(InductionBinOp && 3350 (InductionBinOp->getOpcode() == Instruction::FAdd || 3351 InductionBinOp->getOpcode() == Instruction::FSub) && 3352 "Original bin op should be defined for FP induction"); 3353 3354 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3355 3356 // Floating point operations had to be 'fast' to enable the induction. 3357 FastMathFlags Flags; 3358 Flags.setFast(); 3359 3360 Value *MulExp = B.CreateFMul(StepValue, Index); 3361 if (isa<Instruction>(MulExp)) 3362 // We have to check, the MulExp may be a constant. 
3363 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3364 3365 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3366 "induction"); 3367 if (isa<Instruction>(BOp)) 3368 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3369 3370 return BOp; 3371 } 3372 case InductionDescriptor::IK_NoInduction: 3373 return nullptr; 3374 } 3375 llvm_unreachable("invalid enum"); 3376 } 3377 3378 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3379 LoopScalarBody = OrigLoop->getHeader(); 3380 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3381 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3382 assert(LoopExitBlock && "Must have an exit block"); 3383 assert(LoopVectorPreHeader && "Invalid loop structure"); 3384 3385 LoopMiddleBlock = 3386 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3387 LI, nullptr, Twine(Prefix) + "middle.block"); 3388 LoopScalarPreHeader = 3389 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3390 nullptr, Twine(Prefix) + "scalar.ph"); 3391 3392 // Set up branch from middle block to the exit and scalar preheader blocks. 3393 // completeLoopSkeleton will update the condition to use an iteration check, 3394 // if required to decide whether to execute the remainder. 3395 BranchInst *BrInst = 3396 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3397 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3398 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3399 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3400 3401 // We intentionally don't let SplitBlock to update LoopInfo since 3402 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3403 // LoopVectorBody is explicitly added to the correct place few lines later. 3404 LoopVectorBody = 3405 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3406 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3407 3408 // Update dominator for loop exit. 3409 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3410 3411 // Create and register the new vector loop. 3412 Loop *Lp = LI->AllocateLoop(); 3413 Loop *ParentLoop = OrigLoop->getParentLoop(); 3414 3415 // Insert the new loop into the loop nest and register the new basic blocks 3416 // before calling any utilities such as SCEV that require valid LoopInfo. 3417 if (ParentLoop) { 3418 ParentLoop->addChildLoop(Lp); 3419 } else { 3420 LI->addTopLevelLoop(Lp); 3421 } 3422 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3423 return Lp; 3424 } 3425 3426 void InnerLoopVectorizer::createInductionResumeValues( 3427 Loop *L, Value *VectorTripCount, 3428 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3429 assert(VectorTripCount && L && "Expected valid arguments"); 3430 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3431 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3432 "Inconsistent information about additional bypass."); 3433 // We are going to resume the execution of the scalar loop. 3434 // Go over all of the induction variables that we found and fix the 3435 // PHIs that are left in the scalar version of the loop. 3436 // The starting values of PHI nodes depend on the counter of the last 3437 // iteration in the vectorized loop. 3438 // If we come from a bypass edge then we need to start from the original 3439 // start value. 
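// Illustrative sketch (assumed names, a single canonical induction, and no
// additional bypass): the scalar preheader ends up with a resume phi like
//   %bc.resume.val = phi i64 [ %n.vec, %middle.block ],
//                            [ 0, %vector.memcheck ], [ 0, %entry ]
// i.e. the end value of the vector loop when it executed, or the original
// start value when a bypass edge was taken.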
3440 for (auto &InductionEntry : Legal->getInductionVars()) { 3441 PHINode *OrigPhi = InductionEntry.first; 3442 InductionDescriptor II = InductionEntry.second; 3443 3444 // Create phi nodes to merge from the backedge-taken check block. 3445 PHINode *BCResumeVal = 3446 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3447 LoopScalarPreHeader->getTerminator()); 3448 // Copy original phi DL over to the new one. 3449 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3450 Value *&EndValue = IVEndValues[OrigPhi]; 3451 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3452 if (OrigPhi == OldInduction) { 3453 // We know what the end value is. 3454 EndValue = VectorTripCount; 3455 } else { 3456 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3457 Type *StepType = II.getStep()->getType(); 3458 Instruction::CastOps CastOp = 3459 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3460 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3461 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3462 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3463 EndValue->setName("ind.end"); 3464 3465 // Compute the end value for the additional bypass (if applicable). 3466 if (AdditionalBypass.first) { 3467 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3468 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3469 StepType, true); 3470 CRD = 3471 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3472 EndValueFromAdditionalBypass = 3473 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3474 EndValueFromAdditionalBypass->setName("ind.end"); 3475 } 3476 } 3477 // The new PHI merges the original incoming value, in case of a bypass, 3478 // or the value at the end of the vectorized loop. 3479 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3480 3481 // Fix the scalar body counter (PHI node). 3482 // The old induction's phi node in the scalar body needs the truncated 3483 // value. 3484 for (BasicBlock *BB : LoopBypassBlocks) 3485 BCResumeVal->addIncoming(II.getStartValue(), BB); 3486 3487 if (AdditionalBypass.first) 3488 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3489 EndValueFromAdditionalBypass); 3490 3491 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3492 } 3493 } 3494 3495 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3496 MDNode *OrigLoopID) { 3497 assert(L && "Expected valid loop."); 3498 3499 // The trip counts should be cached by now. 3500 Value *Count = getOrCreateTripCount(L); 3501 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3502 3503 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3504 3505 // Add a check in the middle block to see if we have completed 3506 // all of the iterations in the first vector loop. 3507 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3508 // If tail is to be folded, we know we don't need to run the remainder. 3509 if (!Cost->foldTailByMasking()) { 3510 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3511 Count, VectorTripCount, "cmp.n", 3512 LoopMiddleBlock->getTerminator()); 3513 3514 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3515 // of the corresponding compare because they may have ended up with 3516 // different line numbers and we want to avoid awkward line stepping while 3517 // debugging. Eg. if the compare has got a line number inside the loop. 
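// For illustration only (assumed values): with a trip count of 10 and
// VF * UF = 4, the vector trip count is 8, so the "cmp.n" created above is
// false and the scalar remainder loop runs the last 2 iterations; with a trip
// count of 8 the compare is true and the middle block branches straight to
// the exit.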
3518 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3519 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3520 } 3521 3522 // Get ready to start creating new instructions into the vectorized body. 3523 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3524 "Inconsistent vector loop preheader"); 3525 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3526 3527 Optional<MDNode *> VectorizedLoopID = 3528 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3529 LLVMLoopVectorizeFollowupVectorized}); 3530 if (VectorizedLoopID.hasValue()) { 3531 L->setLoopID(VectorizedLoopID.getValue()); 3532 3533 // Do not setAlreadyVectorized if loop attributes have been defined 3534 // explicitly. 3535 return LoopVectorPreHeader; 3536 } 3537 3538 // Keep all loop hints from the original loop on the vector loop (we'll 3539 // replace the vectorizer-specific hints below). 3540 if (MDNode *LID = OrigLoop->getLoopID()) 3541 L->setLoopID(LID); 3542 3543 LoopVectorizeHints Hints(L, true, *ORE); 3544 Hints.setAlreadyVectorized(); 3545 3546 #ifdef EXPENSIVE_CHECKS 3547 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3548 LI->verify(*DT); 3549 #endif 3550 3551 return LoopVectorPreHeader; 3552 } 3553 3554 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3555 /* 3556 In this function we generate a new loop. The new loop will contain 3557 the vectorized instructions while the old loop will continue to run the 3558 scalar remainder. 3559 3560 [ ] <-- loop iteration number check. 3561 / | 3562 / v 3563 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3564 | / | 3565 | / v 3566 || [ ] <-- vector pre header. 3567 |/ | 3568 | v 3569 | [ ] \ 3570 | [ ]_| <-- vector loop. 3571 | | 3572 | v 3573 | -[ ] <--- middle-block. 3574 | / | 3575 | / v 3576 -|- >[ ] <--- new preheader. 3577 | | 3578 | v 3579 | [ ] \ 3580 | [ ]_| <-- old scalar loop to handle remainder. 3581 \ | 3582 \ v 3583 >[ ] <-- exit block. 3584 ... 3585 */ 3586 3587 // Get the metadata of the original loop before it gets modified. 3588 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3589 3590 // Create an empty vector loop, and prepare basic blocks for the runtime 3591 // checks. 3592 Loop *Lp = createVectorLoopSkeleton(""); 3593 3594 // Now, compare the new count to zero. If it is zero skip the vector loop and 3595 // jump to the scalar loop. This check also covers the case where the 3596 // backedge-taken count is uint##_max: adding one to it will overflow leading 3597 // to an incorrect trip count of zero. In this (rare) case we will also jump 3598 // to the scalar loop. 3599 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3600 3601 // Generate the code to check any assumptions that we've made for SCEV 3602 // expressions. 3603 emitSCEVChecks(Lp, LoopScalarPreHeader); 3604 3605 // Generate the code that checks in runtime if arrays overlap. We put the 3606 // checks into a separate block to make the more common case of few elements 3607 // faster. 3608 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3609 3610 // Some loops have a single integer induction variable, while other loops 3611 // don't. One example is c++ iterators that often have multiple pointer 3612 // induction variables. In the code below we also support a case where we 3613 // don't have a single induction variable. 3614 // 3615 // We try to obtain an induction variable from the original loop as hard 3616 // as possible. 
However if we don't find one that: 3617 // - is an integer 3618 // - counts from zero, stepping by one 3619 // - is the size of the widest induction variable type 3620 // then we create a new one. 3621 OldInduction = Legal->getPrimaryInduction(); 3622 Type *IdxTy = Legal->getWidestInductionType(); 3623 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3624 // The loop step is equal to the vectorization factor (num of SIMD elements) 3625 // times the unroll factor (num of SIMD instructions). 3626 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3627 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3628 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3629 Induction = 3630 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3631 getDebugLocFromInstOrOperands(OldInduction)); 3632 3633 // Emit phis for the new starting index of the scalar loop. 3634 createInductionResumeValues(Lp, CountRoundDown); 3635 3636 return completeLoopSkeleton(Lp, OrigLoopID); 3637 } 3638 3639 // Fix up external users of the induction variable. At this point, we are 3640 // in LCSSA form, with all external PHIs that use the IV having one input value, 3641 // coming from the remainder loop. We need those PHIs to also have a correct 3642 // value for the IV when arriving directly from the middle block. 3643 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3644 const InductionDescriptor &II, 3645 Value *CountRoundDown, Value *EndValue, 3646 BasicBlock *MiddleBlock) { 3647 // There are two kinds of external IV usages - those that use the value 3648 // computed in the last iteration (the PHI) and those that use the penultimate 3649 // value (the value that feeds into the phi from the loop latch). 3650 // We allow both, but they, obviously, have different values. 3651 3652 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3653 3654 DenseMap<Value *, Value *> MissingVals; 3655 3656 // An external user of the last iteration's value should see the value that 3657 // the remainder loop uses to initialize its own IV. 3658 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3659 for (User *U : PostInc->users()) { 3660 Instruction *UI = cast<Instruction>(U); 3661 if (!OrigLoop->contains(UI)) { 3662 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3663 MissingVals[UI] = EndValue; 3664 } 3665 } 3666 3667 // An external user of the penultimate value need to see EndValue - Step. 3668 // The simplest way to get this is to recompute it from the constituent SCEVs, 3669 // that is Start + (Step * (CRD - 1)). 3670 for (User *U : OrigPhi->users()) { 3671 auto *UI = cast<Instruction>(U); 3672 if (!OrigLoop->contains(UI)) { 3673 const DataLayout &DL = 3674 OrigLoop->getHeader()->getModule()->getDataLayout(); 3675 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3676 3677 IRBuilder<> B(MiddleBlock->getTerminator()); 3678 Value *CountMinusOne = B.CreateSub( 3679 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3680 Value *CMO = 3681 !II.getStep()->getType()->isIntegerTy() 3682 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3683 II.getStep()->getType()) 3684 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3685 CMO->setName("cast.cmo"); 3686 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3687 Escape->setName("ind.escape"); 3688 MissingVals[UI] = Escape; 3689 } 3690 } 3691 3692 for (auto &I : MissingVals) { 3693 PHINode *PHI = cast<PHINode>(I.first); 3694 // One corner case we have to handle is two IVs "chasing" each-other, 3695 // that is %IV2 = phi [...], [ %IV1, %latch ] 3696 // In this case, if IV1 has an external use, we need to avoid adding both 3697 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3698 // don't already have an incoming value for the middle block. 3699 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3700 PHI->addIncoming(I.second, MiddleBlock); 3701 } 3702 } 3703 3704 namespace { 3705 3706 struct CSEDenseMapInfo { 3707 static bool canHandle(const Instruction *I) { 3708 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3709 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3710 } 3711 3712 static inline Instruction *getEmptyKey() { 3713 return DenseMapInfo<Instruction *>::getEmptyKey(); 3714 } 3715 3716 static inline Instruction *getTombstoneKey() { 3717 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3718 } 3719 3720 static unsigned getHashValue(const Instruction *I) { 3721 assert(canHandle(I) && "Unknown instruction!"); 3722 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3723 I->value_op_end())); 3724 } 3725 3726 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3727 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3728 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3729 return LHS == RHS; 3730 return LHS->isIdenticalTo(RHS); 3731 } 3732 }; 3733 3734 } // end anonymous namespace 3735 3736 ///Perform cse of induction variable instructions. 3737 static void cse(BasicBlock *BB) { 3738 // Perform simple cse. 3739 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3740 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3741 Instruction *In = &*I++; 3742 3743 if (!CSEDenseMapInfo::canHandle(In)) 3744 continue; 3745 3746 // Check if we can replace this instruction with any of the 3747 // visited instructions. 3748 if (Instruction *V = CSEMap.lookup(In)) { 3749 In->replaceAllUsesWith(V); 3750 In->eraseFromParent(); 3751 continue; 3752 } 3753 3754 CSEMap[In] = In; 3755 } 3756 } 3757 3758 InstructionCost 3759 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3760 bool &NeedToScalarize) { 3761 Function *F = CI->getCalledFunction(); 3762 Type *ScalarRetTy = CI->getType(); 3763 SmallVector<Type *, 4> Tys, ScalarTys; 3764 for (auto &ArgOp : CI->arg_operands()) 3765 ScalarTys.push_back(ArgOp->getType()); 3766 3767 // Estimate cost of scalarized vector call. The source operands are assumed 3768 // to be vectors, so we need to extract individual elements from there, 3769 // execute VF scalar calls, and then gather the result into the vector return 3770 // value. 3771 InstructionCost ScalarCallCost = 3772 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3773 if (VF.isScalar()) 3774 return ScalarCallCost; 3775 3776 // Compute corresponding vector type for return value and arguments. 
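// Worked example (made-up costs, for illustration only): with VF = 4, a scalar
// call cost of 10 and a scalarization overhead of 6, the scalarized estimate
// computed below is 10 * 4 + 6 = 46; if VFDatabase also provides a vector
// variant costing, say, 20, NeedToScalarize is cleared and 20 is returned
// instead.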
3777 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3778 for (Type *ScalarTy : ScalarTys) 3779 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3780 3781 // Compute costs of unpacking argument values for the scalar calls and 3782 // packing the return values to a vector. 3783 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3784 3785 InstructionCost Cost = 3786 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3787 3788 // If we can't emit a vector call for this function, then the currently found 3789 // cost is the cost we need to return. 3790 NeedToScalarize = true; 3791 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3792 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3793 3794 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3795 return Cost; 3796 3797 // If the corresponding vector cost is cheaper, return its cost. 3798 InstructionCost VectorCallCost = 3799 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3800 if (VectorCallCost < Cost) { 3801 NeedToScalarize = false; 3802 Cost = VectorCallCost; 3803 } 3804 return Cost; 3805 } 3806 3807 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3808 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3809 return Elt; 3810 return VectorType::get(Elt, VF); 3811 } 3812 3813 InstructionCost 3814 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3815 ElementCount VF) { 3816 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3817 assert(ID && "Expected intrinsic call!"); 3818 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3819 FastMathFlags FMF; 3820 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3821 FMF = FPMO->getFastMathFlags(); 3822 3823 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3824 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3825 SmallVector<Type *> ParamTys; 3826 std::transform(FTy->param_begin(), FTy->param_end(), 3827 std::back_inserter(ParamTys), 3828 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3829 3830 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3831 dyn_cast<IntrinsicInst>(CI)); 3832 return TTI.getIntrinsicInstrCost(CostAttrs, 3833 TargetTransformInfo::TCK_RecipThroughput); 3834 } 3835 3836 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3837 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3838 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3839 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3840 } 3841 3842 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3843 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3844 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3845 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3846 } 3847 3848 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3849 // For every instruction `I` in MinBWs, truncate the operands, create a 3850 // truncated version of `I` and reextend its result. InstCombine runs 3851 // later and will remove any ext/trunc pairs. 3852 SmallPtrSet<Value *, 4> Erased; 3853 for (const auto &KV : Cost->getMinimalBitwidths()) { 3854 // If the value wasn't vectorized, we must maintain the original scalar 3855 // type. The absence of the value from State indicates that it 3856 // wasn't vectorized. 
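// Illustrative sketch (assumed bit width, not from the original source): if
// the cost model decided an i32 computation only needs 8 bits, the loop below
// rewrites
//   %a = add <4 x i32> %x, %y
// into a truncated operation plus a re-extension for InstCombine to fold:
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a.ext = zext <4 x i8> %a.tr to <4 x i32>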
3857 VPValue *Def = State.Plan->getVPValue(KV.first); 3858 if (!State.hasAnyVectorValue(Def)) 3859 continue; 3860 for (unsigned Part = 0; Part < UF; ++Part) { 3861 Value *I = State.get(Def, Part); 3862 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3863 continue; 3864 Type *OriginalTy = I->getType(); 3865 Type *ScalarTruncatedTy = 3866 IntegerType::get(OriginalTy->getContext(), KV.second); 3867 auto *TruncatedTy = FixedVectorType::get( 3868 ScalarTruncatedTy, 3869 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3870 if (TruncatedTy == OriginalTy) 3871 continue; 3872 3873 IRBuilder<> B(cast<Instruction>(I)); 3874 auto ShrinkOperand = [&](Value *V) -> Value * { 3875 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3876 if (ZI->getSrcTy() == TruncatedTy) 3877 return ZI->getOperand(0); 3878 return B.CreateZExtOrTrunc(V, TruncatedTy); 3879 }; 3880 3881 // The actual instruction modification depends on the instruction type, 3882 // unfortunately. 3883 Value *NewI = nullptr; 3884 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3885 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3886 ShrinkOperand(BO->getOperand(1))); 3887 3888 // Any wrapping introduced by shrinking this operation shouldn't be 3889 // considered undefined behavior. So, we can't unconditionally copy 3890 // arithmetic wrapping flags to NewI. 3891 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3892 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3893 NewI = 3894 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3895 ShrinkOperand(CI->getOperand(1))); 3896 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3897 NewI = B.CreateSelect(SI->getCondition(), 3898 ShrinkOperand(SI->getTrueValue()), 3899 ShrinkOperand(SI->getFalseValue())); 3900 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3901 switch (CI->getOpcode()) { 3902 default: 3903 llvm_unreachable("Unhandled cast!"); 3904 case Instruction::Trunc: 3905 NewI = ShrinkOperand(CI->getOperand(0)); 3906 break; 3907 case Instruction::SExt: 3908 NewI = B.CreateSExtOrTrunc( 3909 CI->getOperand(0), 3910 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3911 break; 3912 case Instruction::ZExt: 3913 NewI = B.CreateZExtOrTrunc( 3914 CI->getOperand(0), 3915 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3916 break; 3917 } 3918 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3919 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3920 ->getNumElements(); 3921 auto *O0 = B.CreateZExtOrTrunc( 3922 SI->getOperand(0), 3923 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3924 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3925 ->getNumElements(); 3926 auto *O1 = B.CreateZExtOrTrunc( 3927 SI->getOperand(1), 3928 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3929 3930 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3931 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3932 // Don't do anything with the operands, just extend the result. 
3933 continue; 3934 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3935 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3936 ->getNumElements(); 3937 auto *O0 = B.CreateZExtOrTrunc( 3938 IE->getOperand(0), 3939 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3940 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3941 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3942 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3943 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3944 ->getNumElements(); 3945 auto *O0 = B.CreateZExtOrTrunc( 3946 EE->getOperand(0), 3947 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3948 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3949 } else { 3950 // If we don't know what to do, be conservative and don't do anything. 3951 continue; 3952 } 3953 3954 // Lastly, extend the result. 3955 NewI->takeName(cast<Instruction>(I)); 3956 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3957 I->replaceAllUsesWith(Res); 3958 cast<Instruction>(I)->eraseFromParent(); 3959 Erased.insert(I); 3960 State.reset(Def, Res, Part); 3961 } 3962 } 3963 3964 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3965 for (const auto &KV : Cost->getMinimalBitwidths()) { 3966 // If the value wasn't vectorized, we must maintain the original scalar 3967 // type. The absence of the value from State indicates that it 3968 // wasn't vectorized. 3969 VPValue *Def = State.Plan->getVPValue(KV.first); 3970 if (!State.hasAnyVectorValue(Def)) 3971 continue; 3972 for (unsigned Part = 0; Part < UF; ++Part) { 3973 Value *I = State.get(Def, Part); 3974 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3975 if (Inst && Inst->use_empty()) { 3976 Value *NewI = Inst->getOperand(0); 3977 Inst->eraseFromParent(); 3978 State.reset(Def, NewI, Part); 3979 } 3980 } 3981 } 3982 } 3983 3984 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3985 // Insert truncates and extends for any truncated instructions as hints to 3986 // InstCombine. 3987 if (VF.isVector()) 3988 truncateToMinimalBitwidths(State); 3989 3990 // Fix widened non-induction PHIs by setting up the PHI operands. 3991 if (OrigPHIsToFix.size()) { 3992 assert(EnableVPlanNativePath && 3993 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3994 fixNonInductionPHIs(State); 3995 } 3996 3997 // At this point every instruction in the original loop is widened to a 3998 // vector form. Now we need to fix the recurrences in the loop. These PHI 3999 // nodes are currently empty because we did not want to introduce cycles. 4000 // This is the second stage of vectorizing recurrences. 4001 fixCrossIterationPHIs(State); 4002 4003 // Forget the original basic block. 4004 PSE.getSE()->forgetLoop(OrigLoop); 4005 4006 // Fix-up external users of the induction variables. 4007 for (auto &Entry : Legal->getInductionVars()) 4008 fixupIVUsers(Entry.first, Entry.second, 4009 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4010 IVEndValues[Entry.first], LoopMiddleBlock); 4011 4012 fixLCSSAPHIs(State); 4013 for (Instruction *PI : PredicatedInstructions) 4014 sinkScalarOperands(&*PI); 4015 4016 // Remove redundant induction instructions. 4017 cse(LoopVectorBody); 4018 4019 // Set/update profile weights for the vector and remainder loops as original 4020 // loop iterations are now distributed among them. Note that original loop 4021 // represented by LoopScalarBody becomes remainder loop after vectorization. 
4022 //
4023 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4024 // end up with a slightly less accurate result, but that should be OK since
4025 // the profile is not inherently precise anyway. Note also that a possible
4026 // bypass of the vector code caused by legality checks is ignored,
4027 // optimistically assigning all the weight to the vector loop.
4028 //
4029 // For scalable vectorization we can't know at compile time how many loop
4030 // iterations are handled in one vector iteration, so we instead assume a
4031 // pessimistic vscale of '1'.
4032 setProfileInfoAfterUnrolling(
4033 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4034 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4035 }
4036
4037 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4038 // In order to support recurrences we need to be able to vectorize Phi nodes.
4039 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4040 // stage #2: We now need to fix the recurrences by adding incoming edges to
4041 // the currently empty PHI nodes. At this point every instruction in the
4042 // original loop is widened to a vector form so we can use them to construct
4043 // the incoming edges.
4044 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
4045 // Handle first-order recurrences and reductions that need to be fixed.
4046 if (Legal->isFirstOrderRecurrence(&Phi))
4047 fixFirstOrderRecurrence(&Phi, State);
4048 else if (Legal->isReductionVariable(&Phi))
4049 fixReduction(&Phi, State);
4050 }
4051 }
4052
4053 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4054 VPTransformState &State) {
4055 // This is the second phase of vectorizing first-order recurrences. An
4056 // overview of the transformation is described below. Suppose we have the
4057 // following loop.
4058 //
4059 // for (int i = 0; i < n; ++i)
4060 // b[i] = a[i] - a[i - 1];
4061 //
4062 // There is a first-order recurrence on "a". For this loop, the shorthand
4063 // scalar IR looks like:
4064 //
4065 // scalar.ph:
4066 // s_init = a[-1]
4067 // br scalar.body
4068 //
4069 // scalar.body:
4070 // i = phi [0, scalar.ph], [i+1, scalar.body]
4071 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4072 // s2 = a[i]
4073 // b[i] = s2 - s1
4074 // br cond, scalar.body, ...
4075 //
4076 // In this example, s1 is a recurrence because its value depends on the
4077 // previous iteration. In the first phase of vectorization, we created a
4078 // temporary value for s1. We now complete the vectorization and produce the
4079 // shorthand vector IR shown below (for VF = 4, UF = 1).
4080 //
4081 // vector.ph:
4082 // v_init = vector(..., ..., ..., a[-1])
4083 // br vector.body
4084 //
4085 // vector.body:
4086 // i = phi [0, vector.ph], [i+4, vector.body]
4087 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4088 // v2 = a[i, i+1, i+2, i+3];
4089 // v3 = vector(v1(3), v2(0, 1, 2))
4090 // b[i, i+1, i+2, i+3] = v2 - v3
4091 // br cond, vector.body, middle.block
4092 //
4093 // middle.block:
4094 // x = v2(3)
4095 // br scalar.ph
4096 //
4097 // scalar.ph:
4098 // s_init = phi [x, middle.block], [a[-1], otherwise]
4099 // br scalar.body
4100 //
4101 // After the vector loop completes, we extract the next value of the
4102 // recurrence (x) to use as the initial value in the scalar loop.
4103
4104 // Get the original loop preheader and single loop latch.
4105 auto *Preheader = OrigLoop->getLoopPreheader(); 4106 auto *Latch = OrigLoop->getLoopLatch(); 4107 4108 // Get the initial and previous values of the scalar recurrence. 4109 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4110 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4111 4112 // Create a vector from the initial value. 4113 auto *VectorInit = ScalarInit; 4114 if (VF.isVector()) { 4115 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4116 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4117 VectorInit = Builder.CreateInsertElement( 4118 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4119 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4120 } 4121 4122 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4123 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4124 // We constructed a temporary phi node in the first phase of vectorization. 4125 // This phi node will eventually be deleted. 4126 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4127 4128 // Create a phi node for the new recurrence. The current value will either be 4129 // the initial value inserted into a vector or loop-varying vector value. 4130 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4131 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4132 4133 // Get the vectorized previous value of the last part UF - 1. It appears last 4134 // among all unrolled iterations, due to the order of their construction. 4135 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4136 4137 // Find and set the insertion point after the previous value if it is an 4138 // instruction. 4139 BasicBlock::iterator InsertPt; 4140 // Note that the previous value may have been constant-folded so it is not 4141 // guaranteed to be an instruction in the vector loop. 4142 // FIXME: Loop invariant values do not form recurrences. We should deal with 4143 // them earlier. 4144 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4145 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4146 else { 4147 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4148 if (isa<PHINode>(PreviousLastPart)) 4149 // If the previous value is a phi node, we should insert after all the phi 4150 // nodes in the block containing the PHI to avoid breaking basic block 4151 // verification. Note that the basic block may be different to 4152 // LoopVectorBody, in case we predicate the loop. 4153 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4154 else 4155 InsertPt = ++PreviousInst->getIterator(); 4156 } 4157 Builder.SetInsertPoint(&*InsertPt); 4158 4159 // We will construct a vector for the recurrence by combining the values for 4160 // the current and previous iterations. This is the required shuffle mask. 4161 assert(!VF.isScalable()); 4162 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4163 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4164 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4165 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4166 4167 // The vector from which to take the initial value for the current iteration 4168 // (actual or unrolled). Initially, this is the vector phi node. 4169 Value *Incoming = VecPhi; 4170 4171 // Shuffle the current and previous vector and update the vector parts. 
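  // For illustration, with a fixed VF of 4 the mask built above is
  // <3, 4, 5, 6>: lane 0 takes the last element of the incoming (previous
  // iteration) vector, and lanes 1-3 take the first three elements of the
  // current part, matching the v3 = vector(v1(3), v2(0, 1, 2)) example in the
  // function-level comment.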
4172 for (unsigned Part = 0; Part < UF; ++Part) { 4173 Value *PreviousPart = State.get(PreviousDef, Part); 4174 Value *PhiPart = State.get(PhiDef, Part); 4175 auto *Shuffle = 4176 VF.isVector() 4177 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4178 : Incoming; 4179 PhiPart->replaceAllUsesWith(Shuffle); 4180 cast<Instruction>(PhiPart)->eraseFromParent(); 4181 State.reset(PhiDef, Shuffle, Part); 4182 Incoming = PreviousPart; 4183 } 4184 4185 // Fix the latch value of the new recurrence in the vector loop. 4186 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4187 4188 // Extract the last vector element in the middle block. This will be the 4189 // initial value for the recurrence when jumping to the scalar loop. 4190 auto *ExtractForScalar = Incoming; 4191 if (VF.isVector()) { 4192 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4193 ExtractForScalar = Builder.CreateExtractElement( 4194 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4195 "vector.recur.extract"); 4196 } 4197 // Extract the second last element in the middle block if the 4198 // Phi is used outside the loop. We need to extract the phi itself 4199 // and not the last element (the phi update in the current iteration). This 4200 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4201 // when the scalar loop is not run at all. 4202 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4203 if (VF.isVector()) 4204 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4205 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4206 "vector.recur.extract.for.phi"); 4207 // When loop is unrolled without vectorizing, initialize 4208 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 4209 // `Incoming`. This is analogous to the vectorized case above: extracting the 4210 // second last element when VF > 1. 4211 else if (UF > 1) 4212 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4213 4214 // Fix the initial value of the original recurrence in the scalar loop. 4215 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4216 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4217 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4218 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4219 Start->addIncoming(Incoming, BB); 4220 } 4221 4222 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4223 Phi->setName("scalar.recur"); 4224 4225 // Finally, fix users of the recurrence outside the loop. The users will need 4226 // either the last value of the scalar recurrence or the last value of the 4227 // vector recurrence we extracted in the middle block. Since the loop is in 4228 // LCSSA form, we just need to find all the phi nodes for the original scalar 4229 // recurrence in the exit block, and then add an edge for the middle block. 4230 // Note that LCSSA does not imply single entry when the original scalar loop 4231 // had multiple exiting edges (as we always run the last iteration in the 4232 // scalar epilogue); in that case, the exiting path through middle will be 4233 // dynamically dead and the value picked for the phi doesn't matter. 
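  // For illustration (shorthand, names made up): an exit-block phi such as
  //   %s1.lcssa = phi [ %scalar.recur, %scalar.body ]
  // gains a second incoming value from the middle block, namely the
  // second-to-last element extracted above, so the phi is well-formed on the
  // path that skips the scalar remainder entirely.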
4234 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4235 if (any_of(LCSSAPhi.incoming_values(), 4236 [Phi](Value *V) { return V == Phi; })) 4237 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4238 } 4239 4240 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) { 4241 // Get it's reduction variable descriptor. 4242 assert(Legal->isReductionVariable(Phi) && 4243 "Unable to find the reduction variable"); 4244 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4245 4246 RecurKind RK = RdxDesc.getRecurrenceKind(); 4247 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4248 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4249 setDebugLocFromInst(Builder, ReductionStartValue); 4250 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4251 4252 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); 4253 // This is the vector-clone of the value that leaves the loop. 4254 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4255 4256 // Wrap flags are in general invalid after vectorization, clear them. 4257 clearReductionWrapFlags(RdxDesc, State); 4258 4259 // Fix the vector-loop phi. 4260 4261 // Reductions do not have to start at zero. They can start with 4262 // any loop invariant values. 4263 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4264 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4265 4266 for (unsigned Part = 0; Part < UF; ++Part) { 4267 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part); 4268 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part); 4269 cast<PHINode>(VecRdxPhi) 4270 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4271 } 4272 4273 // Before each round, move the insertion point right between 4274 // the PHIs and the values we are going to write. 4275 // This allows us to write both PHINodes and the extractelement 4276 // instructions. 4277 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4278 4279 setDebugLocFromInst(Builder, LoopExitInst); 4280 4281 // If tail is folded by masking, the vector value to leave the loop should be 4282 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4283 // instead of the former. For an inloop reduction the reduction will already 4284 // be predicated, and does not need to be handled here. 4285 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4286 for (unsigned Part = 0; Part < UF; ++Part) { 4287 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4288 Value *Sel = nullptr; 4289 for (User *U : VecLoopExitInst->users()) { 4290 if (isa<SelectInst>(U)) { 4291 assert(!Sel && "Reduction exit feeding two selects"); 4292 Sel = U; 4293 } else 4294 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4295 } 4296 assert(Sel && "Reduction exit feeds no select"); 4297 State.reset(LoopExitInstDef, Sel, Part); 4298 4299 // If the target can create a predicated operator for the reduction at no 4300 // extra cost in the loop (for example a predicated vadd), it can be 4301 // cheaper for the select to remain in the loop than be sunk out of it, 4302 // and so use the select value for the phi instead of the old 4303 // LoopExitValue. 
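      // For illustration (shorthand, not the exact IR emitted; the mask name
      // is made up): with tail folding the per-part update looks like
      //   %sel = select <VF x i1> %mask, %vec.rdx.next, %vec.phi
      // and when the target prefers predicated reduction selects, %sel
      // (rather than %vec.rdx.next) becomes the back-edge value of the
      // vector phi.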
4304 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4305 if (PreferPredicatedReductionSelect || 4306 TTI->preferPredicatedReductionSelect( 4307 RdxDesc.getOpcode(), Phi->getType(), 4308 TargetTransformInfo::ReductionFlags())) { 4309 auto *VecRdxPhi = 4310 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part)); 4311 VecRdxPhi->setIncomingValueForBlock( 4312 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4313 } 4314 } 4315 } 4316 4317 // If the vector reduction can be performed in a smaller type, we truncate 4318 // then extend the loop exit value to enable InstCombine to evaluate the 4319 // entire expression in the smaller type. 4320 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4321 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4322 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4323 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4324 Builder.SetInsertPoint( 4325 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4326 VectorParts RdxParts(UF); 4327 for (unsigned Part = 0; Part < UF; ++Part) { 4328 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4329 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4330 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4331 : Builder.CreateZExt(Trunc, VecTy); 4332 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4333 UI != RdxParts[Part]->user_end();) 4334 if (*UI != Trunc) { 4335 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4336 RdxParts[Part] = Extnd; 4337 } else { 4338 ++UI; 4339 } 4340 } 4341 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4342 for (unsigned Part = 0; Part < UF; ++Part) { 4343 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4344 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4345 } 4346 } 4347 4348 // Reduce all of the unrolled parts into a single vector. 4349 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4350 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4351 4352 // The middle block terminator has already been assigned a DebugLoc here (the 4353 // OrigLoop's single latch terminator). We want the whole middle block to 4354 // appear to execute on this line because: (a) it is all compiler generated, 4355 // (b) these instructions are always executed after evaluating the latch 4356 // conditional branch, and (c) other passes may add new predecessors which 4357 // terminate on this line. This is the easiest way to ensure we don't 4358 // accidentally cause an extra step back into the loop while debugging. 4359 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4360 { 4361 // Floating-point operations should have some FMF to enable the reduction. 4362 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4363 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4364 for (unsigned Part = 1; Part < UF; ++Part) { 4365 Value *RdxPart = State.get(LoopExitInstDef, Part); 4366 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4367 ReducedPartRdx = Builder.CreateBinOp( 4368 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4369 } else { 4370 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4371 } 4372 } 4373 } 4374 4375 // Create the reduction after the loop. Note that inloop reductions create the 4376 // target reduction in the loop using a Reduction recipe. 
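  // For illustration (shorthand): for an integer add reduction with UF = 2,
  // the loop above has already combined the unrolled parts as
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  // and createTargetReduction then collapses that vector into a single
  // scalar (e.g. via a horizontal vector-reduce add).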
4377 if (VF.isVector() && !IsInLoopReductionPhi) { 4378 ReducedPartRdx = 4379 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4380 // If the reduction can be performed in a smaller type, we need to extend 4381 // the reduction to the wider type before we branch to the original loop. 4382 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4383 ReducedPartRdx = 4384 RdxDesc.isSigned() 4385 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4386 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4387 } 4388 4389 // Create a phi node that merges control-flow from the backedge-taken check 4390 // block and the middle block. 4391 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4392 LoopScalarPreHeader->getTerminator()); 4393 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4394 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4395 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4396 4397 // Now, we need to fix the users of the reduction variable 4398 // inside and outside of the scalar remainder loop. 4399 4400 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4401 // in the exit blocks. See comment on analogous loop in 4402 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4403 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4404 if (any_of(LCSSAPhi.incoming_values(), 4405 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4406 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4407 4408 // Fix the scalar loop reduction variable with the incoming reduction sum 4409 // from the vector body and from the backedge value. 4410 int IncomingEdgeBlockIdx = 4411 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4412 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4413 // Pick the other block. 4414 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4415 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4416 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4417 } 4418 4419 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4420 VPTransformState &State) { 4421 RecurKind RK = RdxDesc.getRecurrenceKind(); 4422 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4423 return; 4424 4425 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4426 assert(LoopExitInstr && "null loop exit instruction"); 4427 SmallVector<Instruction *, 8> Worklist; 4428 SmallPtrSet<Instruction *, 8> Visited; 4429 Worklist.push_back(LoopExitInstr); 4430 Visited.insert(LoopExitInstr); 4431 4432 while (!Worklist.empty()) { 4433 Instruction *Cur = Worklist.pop_back_val(); 4434 if (isa<OverflowingBinaryOperator>(Cur)) 4435 for (unsigned Part = 0; Part < UF; ++Part) { 4436 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4437 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4438 } 4439 4440 for (User *U : Cur->users()) { 4441 Instruction *UI = cast<Instruction>(U); 4442 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4443 Visited.insert(UI).second) 4444 Worklist.push_back(UI); 4445 } 4446 } 4447 } 4448 4449 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4450 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4451 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4452 // Some phis were already hand updated by the reduction and recurrence 4453 // code above, leave them alone. 
4454 continue; 4455 4456 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4457 // Non-instruction incoming values will have only one value. 4458 unsigned LastLane = 0; 4459 if (isa<Instruction>(IncomingValue)) 4460 LastLane = Cost->isUniformAfterVectorization( 4461 cast<Instruction>(IncomingValue), VF) 4462 ? 0 4463 : VF.getKnownMinValue() - 1; 4464 assert((!VF.isScalable() || LastLane == 0) && 4465 "scalable vectors dont support non-uniform scalars yet"); 4466 // Can be a loop invariant incoming value or the last scalar value to be 4467 // extracted from the vectorized loop. 4468 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4469 Value *lastIncomingValue = 4470 OrigLoop->isLoopInvariant(IncomingValue) 4471 ? IncomingValue 4472 : State.get(State.Plan->getVPValue(IncomingValue), 4473 VPIteration(UF - 1, LastLane)); 4474 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4475 } 4476 } 4477 4478 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4479 // The basic block and loop containing the predicated instruction. 4480 auto *PredBB = PredInst->getParent(); 4481 auto *VectorLoop = LI->getLoopFor(PredBB); 4482 4483 // Initialize a worklist with the operands of the predicated instruction. 4484 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4485 4486 // Holds instructions that we need to analyze again. An instruction may be 4487 // reanalyzed if we don't yet know if we can sink it or not. 4488 SmallVector<Instruction *, 8> InstsToReanalyze; 4489 4490 // Returns true if a given use occurs in the predicated block. Phi nodes use 4491 // their operands in their corresponding predecessor blocks. 4492 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4493 auto *I = cast<Instruction>(U.getUser()); 4494 BasicBlock *BB = I->getParent(); 4495 if (auto *Phi = dyn_cast<PHINode>(I)) 4496 BB = Phi->getIncomingBlock( 4497 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4498 return BB == PredBB; 4499 }; 4500 4501 // Iteratively sink the scalarized operands of the predicated instruction 4502 // into the block we created for it. When an instruction is sunk, it's 4503 // operands are then added to the worklist. The algorithm ends after one pass 4504 // through the worklist doesn't sink a single instruction. 4505 bool Changed; 4506 do { 4507 // Add the instructions that need to be reanalyzed to the worklist, and 4508 // reset the changed indicator. 4509 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4510 InstsToReanalyze.clear(); 4511 Changed = false; 4512 4513 while (!Worklist.empty()) { 4514 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4515 4516 // We can't sink an instruction if it is a phi node, is already in the 4517 // predicated block, is not in the loop, or may have side effects. 4518 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4519 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4520 continue; 4521 4522 // It's legal to sink the instruction if all its uses occur in the 4523 // predicated block. Otherwise, there's nothing to do yet, and we may 4524 // need to reanalyze the instruction. 4525 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4526 InstsToReanalyze.push_back(I); 4527 continue; 4528 } 4529 4530 // Move the instruction to the beginning of the predicated block, and add 4531 // it's operands to the worklist. 
4532 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4533 Worklist.insert(I->op_begin(), I->op_end()); 4534 4535 // The sinking may have enabled other instructions to be sunk, so we will 4536 // need to iterate. 4537 Changed = true; 4538 } 4539 } while (Changed); 4540 } 4541 4542 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4543 for (PHINode *OrigPhi : OrigPHIsToFix) { 4544 VPWidenPHIRecipe *VPPhi = 4545 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4546 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4547 // Make sure the builder has a valid insert point. 4548 Builder.SetInsertPoint(NewPhi); 4549 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4550 VPValue *Inc = VPPhi->getIncomingValue(i); 4551 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4552 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4553 } 4554 } 4555 } 4556 4557 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4558 VPUser &Operands, unsigned UF, 4559 ElementCount VF, bool IsPtrLoopInvariant, 4560 SmallBitVector &IsIndexLoopInvariant, 4561 VPTransformState &State) { 4562 // Construct a vector GEP by widening the operands of the scalar GEP as 4563 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4564 // results in a vector of pointers when at least one operand of the GEP 4565 // is vector-typed. Thus, to keep the representation compact, we only use 4566 // vector-typed operands for loop-varying values. 4567 4568 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4569 // If we are vectorizing, but the GEP has only loop-invariant operands, 4570 // the GEP we build (by only using vector-typed operands for 4571 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4572 // produce a vector of pointers, we need to either arbitrarily pick an 4573 // operand to broadcast, or broadcast a clone of the original GEP. 4574 // Here, we broadcast a clone of the original. 4575 // 4576 // TODO: If at some point we decide to scalarize instructions having 4577 // loop-invariant operands, this special case will no longer be 4578 // required. We would add the scalarization decision to 4579 // collectLoopScalars() and teach getVectorValue() to broadcast 4580 // the lane-zero scalar value. 4581 auto *Clone = Builder.Insert(GEP->clone()); 4582 for (unsigned Part = 0; Part < UF; ++Part) { 4583 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4584 State.set(VPDef, EntryPart, Part); 4585 addMetadata(EntryPart, GEP); 4586 } 4587 } else { 4588 // If the GEP has at least one loop-varying operand, we are sure to 4589 // produce a vector of pointers. But if we are only unrolling, we want 4590 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4591 // produce with the code below will be scalar (if VF == 1) or vector 4592 // (otherwise). Note that for the unroll-only case, we still maintain 4593 // values in the vector mapping with initVector, as we do for other 4594 // instructions. 4595 for (unsigned Part = 0; Part < UF; ++Part) { 4596 // The pointer operand of the new GEP. If it's loop-invariant, we 4597 // won't broadcast it. 4598 auto *Ptr = IsPtrLoopInvariant 4599 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4600 : State.get(Operands.getOperand(0), Part); 4601 4602 // Collect all the indices for the new GEP. If any index is 4603 // loop-invariant, we won't broadcast it. 
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
        VPValue *Operand = Operands.getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP =
          GEP->isInBounds()
              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
                                          Indices)
              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
      assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(VPDef, NewGEP, Part);
      addMetadata(NewGEP, GEP);
    }
  }
}

void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                              RecurrenceDescriptor *RdxDesc,
                                              VPValue *StartVPV, VPValue *Def,
                                              VPTransformState &State) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy = (State.VF.isScalar())
                      ? PN->getType()
                      : VectorType::get(PN->getType(), State.VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    State.set(Def, VecPhi, 0);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
    Value *Iden = nullptr;
    bool ScalarPHI =
        (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
    Type *VecTy =
        ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);

    if (RdxDesc) {
      assert(Legal->isReductionVariable(P) && StartV &&
             "RdxDesc should only be set for reduction variables; in that case "
             "a StartV is also required");
      RecurKind RK = RdxDesc->getRecurrenceKind();
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
        // MinMax reductions have the start value as their identity.
4670 if (ScalarPHI) { 4671 Iden = StartV; 4672 } else { 4673 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4674 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4675 StartV = Iden = 4676 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); 4677 } 4678 } else { 4679 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4680 RK, VecTy->getScalarType()); 4681 Iden = IdenC; 4682 4683 if (!ScalarPHI) { 4684 Iden = ConstantVector::getSplat(State.VF, IdenC); 4685 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4686 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4687 Constant *Zero = Builder.getInt32(0); 4688 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4689 } 4690 } 4691 } 4692 4693 for (unsigned Part = 0; Part < State.UF; ++Part) { 4694 // This is phase one of vectorizing PHIs. 4695 Value *EntryPart = PHINode::Create( 4696 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4697 State.set(Def, EntryPart, Part); 4698 if (StartV) { 4699 // Make sure to add the reduction start value only to the 4700 // first unroll part. 4701 Value *StartVal = (Part == 0) ? StartV : Iden; 4702 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4703 } 4704 } 4705 return; 4706 } 4707 4708 assert(!Legal->isReductionVariable(P) && 4709 "reductions should be handled above"); 4710 4711 setDebugLocFromInst(Builder, P); 4712 4713 // This PHINode must be an induction variable. 4714 // Make sure that we know about it. 4715 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4716 4717 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4718 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4719 4720 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4721 // which can be found from the original scalar operations. 4722 switch (II.getKind()) { 4723 case InductionDescriptor::IK_NoInduction: 4724 llvm_unreachable("Unknown induction"); 4725 case InductionDescriptor::IK_IntInduction: 4726 case InductionDescriptor::IK_FpInduction: 4727 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4728 case InductionDescriptor::IK_PtrInduction: { 4729 // Handle the pointer induction variable case. 4730 assert(P->getType()->isPointerTy() && "Unexpected type."); 4731 4732 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4733 // This is the normalized GEP that starts counting at zero. 4734 Value *PtrInd = 4735 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4736 // Determine the number of scalars we need to generate for each unroll 4737 // iteration. If the instruction is uniform, we only need to generate the 4738 // first lane. Otherwise, we generate all VF values. 4739 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF) 4740 ? 
1 4741 : State.VF.getKnownMinValue(); 4742 for (unsigned Part = 0; Part < UF; ++Part) { 4743 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4744 Constant *Idx = ConstantInt::get( 4745 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue()); 4746 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4747 Value *SclrGep = 4748 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4749 SclrGep->setName("next.gep"); 4750 State.set(Def, SclrGep, VPIteration(Part, Lane)); 4751 } 4752 } 4753 return; 4754 } 4755 assert(isa<SCEVConstant>(II.getStep()) && 4756 "Induction step not a SCEV constant!"); 4757 Type *PhiType = II.getStep()->getType(); 4758 4759 // Build a pointer phi 4760 Value *ScalarStartValue = II.getStartValue(); 4761 Type *ScStValueType = ScalarStartValue->getType(); 4762 PHINode *NewPointerPhi = 4763 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4764 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4765 4766 // A pointer induction, performed by using a gep 4767 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4768 Instruction *InductionLoc = LoopLatch->getTerminator(); 4769 const SCEV *ScalarStep = II.getStep(); 4770 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4771 Value *ScalarStepValue = 4772 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4773 Value *InductionGEP = GetElementPtrInst::Create( 4774 ScStValueType->getPointerElementType(), NewPointerPhi, 4775 Builder.CreateMul( 4776 ScalarStepValue, 4777 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)), 4778 "ptr.ind", InductionLoc); 4779 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4780 4781 // Create UF many actual address geps that use the pointer 4782 // phi as base and a vectorized version of the step value 4783 // (<step*0, ..., step*N>) as offset. 4784 for (unsigned Part = 0; Part < State.UF; ++Part) { 4785 SmallVector<Constant *, 8> Indices; 4786 // Create a vector of consecutive numbers from zero to VF. 4787 for (unsigned i = 0; i < State.VF.getKnownMinValue(); ++i) 4788 Indices.push_back( 4789 ConstantInt::get(PhiType, i + Part * State.VF.getKnownMinValue())); 4790 Constant *StartOffset = ConstantVector::get(Indices); 4791 4792 Value *GEP = Builder.CreateGEP( 4793 ScStValueType->getPointerElementType(), NewPointerPhi, 4794 Builder.CreateMul(StartOffset, 4795 Builder.CreateVectorSplat( 4796 State.VF.getKnownMinValue(), ScalarStepValue), 4797 "vector.gep")); 4798 State.set(Def, GEP, Part); 4799 } 4800 } 4801 } 4802 } 4803 4804 /// A helper function for checking whether an integer division-related 4805 /// instruction may divide by zero (in which case it must be predicated if 4806 /// executed conditionally in the scalar code). 4807 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4808 /// Non-zero divisors that are non compile-time constants will not be 4809 /// converted into multiplication, so we will still end up scalarizing 4810 /// the division, but can do so w/o predication. 
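/// For illustration: `udiv i32 %x, 7` has a known non-zero constant divisor
/// and is reported as safe, whereas `udiv i32 %x, %n` (unknown divisor) or a
/// division by a constant zero is reported as possibly dividing by zero.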
4811 static bool mayDivideByZero(Instruction &I) { 4812 assert((I.getOpcode() == Instruction::UDiv || 4813 I.getOpcode() == Instruction::SDiv || 4814 I.getOpcode() == Instruction::URem || 4815 I.getOpcode() == Instruction::SRem) && 4816 "Unexpected instruction"); 4817 Value *Divisor = I.getOperand(1); 4818 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4819 return !CInt || CInt->isZero(); 4820 } 4821 4822 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4823 VPUser &User, 4824 VPTransformState &State) { 4825 switch (I.getOpcode()) { 4826 case Instruction::Call: 4827 case Instruction::Br: 4828 case Instruction::PHI: 4829 case Instruction::GetElementPtr: 4830 case Instruction::Select: 4831 llvm_unreachable("This instruction is handled by a different recipe."); 4832 case Instruction::UDiv: 4833 case Instruction::SDiv: 4834 case Instruction::SRem: 4835 case Instruction::URem: 4836 case Instruction::Add: 4837 case Instruction::FAdd: 4838 case Instruction::Sub: 4839 case Instruction::FSub: 4840 case Instruction::FNeg: 4841 case Instruction::Mul: 4842 case Instruction::FMul: 4843 case Instruction::FDiv: 4844 case Instruction::FRem: 4845 case Instruction::Shl: 4846 case Instruction::LShr: 4847 case Instruction::AShr: 4848 case Instruction::And: 4849 case Instruction::Or: 4850 case Instruction::Xor: { 4851 // Just widen unops and binops. 4852 setDebugLocFromInst(Builder, &I); 4853 4854 for (unsigned Part = 0; Part < UF; ++Part) { 4855 SmallVector<Value *, 2> Ops; 4856 for (VPValue *VPOp : User.operands()) 4857 Ops.push_back(State.get(VPOp, Part)); 4858 4859 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4860 4861 if (auto *VecOp = dyn_cast<Instruction>(V)) 4862 VecOp->copyIRFlags(&I); 4863 4864 // Use this vector value for all users of the original instruction. 4865 State.set(Def, V, Part); 4866 addMetadata(V, &I); 4867 } 4868 4869 break; 4870 } 4871 case Instruction::ICmp: 4872 case Instruction::FCmp: { 4873 // Widen compares. Generate vector compares. 4874 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4875 auto *Cmp = cast<CmpInst>(&I); 4876 setDebugLocFromInst(Builder, Cmp); 4877 for (unsigned Part = 0; Part < UF; ++Part) { 4878 Value *A = State.get(User.getOperand(0), Part); 4879 Value *B = State.get(User.getOperand(1), Part); 4880 Value *C = nullptr; 4881 if (FCmp) { 4882 // Propagate fast math flags. 4883 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4884 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4885 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4886 } else { 4887 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4888 } 4889 State.set(Def, C, Part); 4890 addMetadata(C, &I); 4891 } 4892 4893 break; 4894 } 4895 4896 case Instruction::ZExt: 4897 case Instruction::SExt: 4898 case Instruction::FPToUI: 4899 case Instruction::FPToSI: 4900 case Instruction::FPExt: 4901 case Instruction::PtrToInt: 4902 case Instruction::IntToPtr: 4903 case Instruction::SIToFP: 4904 case Instruction::UIToFP: 4905 case Instruction::Trunc: 4906 case Instruction::FPTrunc: 4907 case Instruction::BitCast: { 4908 auto *CI = cast<CastInst>(&I); 4909 setDebugLocFromInst(Builder, CI); 4910 4911 /// Vectorize casts. 4912 Type *DestTy = 4913 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4914 4915 for (unsigned Part = 0; Part < UF; ++Part) { 4916 Value *A = State.get(User.getOperand(0), Part); 4917 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4918 State.set(Def, Cast, Part); 4919 addMetadata(Cast, &I); 4920 } 4921 break; 4922 } 4923 default: 4924 // This instruction is not vectorized by simple widening. 4925 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4926 llvm_unreachable("Unhandled instruction!"); 4927 } // end of switch. 4928 } 4929 4930 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4931 VPUser &ArgOperands, 4932 VPTransformState &State) { 4933 assert(!isa<DbgInfoIntrinsic>(I) && 4934 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4935 setDebugLocFromInst(Builder, &I); 4936 4937 Module *M = I.getParent()->getParent()->getParent(); 4938 auto *CI = cast<CallInst>(&I); 4939 4940 SmallVector<Type *, 4> Tys; 4941 for (Value *ArgOperand : CI->arg_operands()) 4942 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4943 4944 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4945 4946 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4947 // version of the instruction. 4948 // Is it beneficial to perform intrinsic call compared to lib call? 4949 bool NeedToScalarize = false; 4950 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4951 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4952 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4953 assert((UseVectorIntrinsic || !NeedToScalarize) && 4954 "Instruction should be scalarized elsewhere."); 4955 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4956 "Cannot have invalid costs while widening"); 4957 4958 for (unsigned Part = 0; Part < UF; ++Part) { 4959 SmallVector<Value *, 4> Args; 4960 for (auto &I : enumerate(ArgOperands.operands())) { 4961 // Some intrinsics have a scalar argument - don't replace it with a 4962 // vector. 4963 Value *Arg; 4964 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4965 Arg = State.get(I.value(), Part); 4966 else 4967 Arg = State.get(I.value(), VPIteration(0, 0)); 4968 Args.push_back(Arg); 4969 } 4970 4971 Function *VectorF; 4972 if (UseVectorIntrinsic) { 4973 // Use vector version of the intrinsic. 4974 Type *TysForDecl[] = {CI->getType()}; 4975 if (VF.isVector()) 4976 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4977 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4978 assert(VectorF && "Can't retrieve vector intrinsic."); 4979 } else { 4980 // Use vector version of the function call. 
4981 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4982 #ifndef NDEBUG 4983 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4984 "Can't create vector function."); 4985 #endif 4986 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4987 } 4988 SmallVector<OperandBundleDef, 1> OpBundles; 4989 CI->getOperandBundlesAsDefs(OpBundles); 4990 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4991 4992 if (isa<FPMathOperator>(V)) 4993 V->copyFastMathFlags(CI); 4994 4995 State.set(Def, V, Part); 4996 addMetadata(V, &I); 4997 } 4998 } 4999 5000 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5001 VPUser &Operands, 5002 bool InvariantCond, 5003 VPTransformState &State) { 5004 setDebugLocFromInst(Builder, &I); 5005 5006 // The condition can be loop invariant but still defined inside the 5007 // loop. This means that we can't just use the original 'cond' value. 5008 // We have to take the 'vectorized' value and pick the first lane. 5009 // Instcombine will make this a no-op. 5010 auto *InvarCond = InvariantCond 5011 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5012 : nullptr; 5013 5014 for (unsigned Part = 0; Part < UF; ++Part) { 5015 Value *Cond = 5016 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5017 Value *Op0 = State.get(Operands.getOperand(1), Part); 5018 Value *Op1 = State.get(Operands.getOperand(2), Part); 5019 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5020 State.set(VPDef, Sel, Part); 5021 addMetadata(Sel, &I); 5022 } 5023 } 5024 5025 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5026 // We should not collect Scalars more than once per VF. Right now, this 5027 // function is called from collectUniformsAndScalars(), which already does 5028 // this check. Collecting Scalars for VF=1 does not make any sense. 5029 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5030 "This function should not be visited twice for the same VF"); 5031 5032 SmallSetVector<Instruction *, 8> Worklist; 5033 5034 // These sets are used to seed the analysis with pointers used by memory 5035 // accesses that will remain scalar. 5036 SmallSetVector<Instruction *, 8> ScalarPtrs; 5037 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5038 auto *Latch = TheLoop->getLoopLatch(); 5039 5040 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5041 // The pointer operands of loads and stores will be scalar as long as the 5042 // memory access is not a gather or scatter operation. The value operand of a 5043 // store will remain scalar if the store is scalarized. 5044 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5045 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5046 assert(WideningDecision != CM_Unknown && 5047 "Widening decision should be ready at this moment"); 5048 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5049 if (Ptr == Store->getValueOperand()) 5050 return WideningDecision == CM_Scalarize; 5051 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5052 "Ptr is neither a value or pointer operand"); 5053 return WideningDecision != CM_GatherScatter; 5054 }; 5055 5056 // A helper that returns true if the given value is a bitcast or 5057 // getelementptr instruction contained in the loop. 
5058 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5059 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5060 isa<GetElementPtrInst>(V)) && 5061 !TheLoop->isLoopInvariant(V); 5062 }; 5063 5064 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5065 if (!isa<PHINode>(Ptr) || 5066 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5067 return false; 5068 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5069 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5070 return false; 5071 return isScalarUse(MemAccess, Ptr); 5072 }; 5073 5074 // A helper that evaluates a memory access's use of a pointer. If the 5075 // pointer is actually the pointer induction of a loop, it is being 5076 // inserted into Worklist. If the use will be a scalar use, and the 5077 // pointer is only used by memory accesses, we place the pointer in 5078 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5079 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5080 if (isScalarPtrInduction(MemAccess, Ptr)) { 5081 Worklist.insert(cast<Instruction>(Ptr)); 5082 Instruction *Update = cast<Instruction>( 5083 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5084 Worklist.insert(Update); 5085 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5086 << "\n"); 5087 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5088 << "\n"); 5089 return; 5090 } 5091 // We only care about bitcast and getelementptr instructions contained in 5092 // the loop. 5093 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5094 return; 5095 5096 // If the pointer has already been identified as scalar (e.g., if it was 5097 // also identified as uniform), there's nothing to do. 5098 auto *I = cast<Instruction>(Ptr); 5099 if (Worklist.count(I)) 5100 return; 5101 5102 // If the use of the pointer will be a scalar use, and all users of the 5103 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5104 // place the pointer in PossibleNonScalarPtrs. 5105 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5106 return isa<LoadInst>(U) || isa<StoreInst>(U); 5107 })) 5108 ScalarPtrs.insert(I); 5109 else 5110 PossibleNonScalarPtrs.insert(I); 5111 }; 5112 5113 // We seed the scalars analysis with three classes of instructions: (1) 5114 // instructions marked uniform-after-vectorization and (2) bitcast, 5115 // getelementptr and (pointer) phi instructions used by memory accesses 5116 // requiring a scalar use. 5117 // 5118 // (1) Add to the worklist all instructions that have been identified as 5119 // uniform-after-vectorization. 5120 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5121 5122 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5123 // memory accesses requiring a scalar use. The pointer operands of loads and 5124 // stores will be scalar as long as the memory accesses is not a gather or 5125 // scatter operation. The value operand of a store will remain scalar if the 5126 // store is scalarized. 
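  // For illustration (hypothetical loop): in `for (i) a[i] = b[i] + 1;` the
  // GEPs computing &a[i] and &b[i] are scalar uses of the widened consecutive
  // load and store, and their only users are those memory accesses, so they
  // end up in ScalarPtrs; a pointer that is additionally used by a non-memory
  // instruction would go to PossibleNonScalarPtrs instead.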
5127 for (auto *BB : TheLoop->blocks()) 5128 for (auto &I : *BB) { 5129 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5130 evaluatePtrUse(Load, Load->getPointerOperand()); 5131 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5132 evaluatePtrUse(Store, Store->getPointerOperand()); 5133 evaluatePtrUse(Store, Store->getValueOperand()); 5134 } 5135 } 5136 for (auto *I : ScalarPtrs) 5137 if (!PossibleNonScalarPtrs.count(I)) { 5138 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5139 Worklist.insert(I); 5140 } 5141 5142 // Insert the forced scalars. 5143 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5144 // induction variable when the PHI user is scalarized. 5145 auto ForcedScalar = ForcedScalars.find(VF); 5146 if (ForcedScalar != ForcedScalars.end()) 5147 for (auto *I : ForcedScalar->second) 5148 Worklist.insert(I); 5149 5150 // Expand the worklist by looking through any bitcasts and getelementptr 5151 // instructions we've already identified as scalar. This is similar to the 5152 // expansion step in collectLoopUniforms(); however, here we're only 5153 // expanding to include additional bitcasts and getelementptr instructions. 5154 unsigned Idx = 0; 5155 while (Idx != Worklist.size()) { 5156 Instruction *Dst = Worklist[Idx++]; 5157 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5158 continue; 5159 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5160 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5161 auto *J = cast<Instruction>(U); 5162 return !TheLoop->contains(J) || Worklist.count(J) || 5163 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5164 isScalarUse(J, Src)); 5165 })) { 5166 Worklist.insert(Src); 5167 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5168 } 5169 } 5170 5171 // An induction variable will remain scalar if all users of the induction 5172 // variable and induction variable update remain scalar. 5173 for (auto &Induction : Legal->getInductionVars()) { 5174 auto *Ind = Induction.first; 5175 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5176 5177 // If tail-folding is applied, the primary induction variable will be used 5178 // to feed a vector compare. 5179 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5180 continue; 5181 5182 // Determine if all users of the induction variable are scalar after 5183 // vectorization. 5184 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5185 auto *I = cast<Instruction>(U); 5186 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5187 }); 5188 if (!ScalarInd) 5189 continue; 5190 5191 // Determine if all users of the induction variable update instruction are 5192 // scalar after vectorization. 5193 auto ScalarIndUpdate = 5194 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5195 auto *I = cast<Instruction>(U); 5196 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5197 }); 5198 if (!ScalarIndUpdate) 5199 continue; 5200 5201 // The induction variable and its update instruction will remain scalar. 
5202 Worklist.insert(Ind); 5203 Worklist.insert(IndUpdate); 5204 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5205 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5206 << "\n"); 5207 } 5208 5209 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5210 } 5211 5212 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5213 ElementCount VF) { 5214 if (!blockNeedsPredication(I->getParent())) 5215 return false; 5216 switch(I->getOpcode()) { 5217 default: 5218 break; 5219 case Instruction::Load: 5220 case Instruction::Store: { 5221 if (!Legal->isMaskRequired(I)) 5222 return false; 5223 auto *Ptr = getLoadStorePointerOperand(I); 5224 auto *Ty = getMemInstValueType(I); 5225 // We have already decided how to vectorize this instruction, get that 5226 // result. 5227 if (VF.isVector()) { 5228 InstWidening WideningDecision = getWideningDecision(I, VF); 5229 assert(WideningDecision != CM_Unknown && 5230 "Widening decision should be ready at this moment"); 5231 return WideningDecision == CM_Scalarize; 5232 } 5233 const Align Alignment = getLoadStoreAlignment(I); 5234 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5235 isLegalMaskedGather(Ty, Alignment)) 5236 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5237 isLegalMaskedScatter(Ty, Alignment)); 5238 } 5239 case Instruction::UDiv: 5240 case Instruction::SDiv: 5241 case Instruction::SRem: 5242 case Instruction::URem: 5243 return mayDivideByZero(*I); 5244 } 5245 return false; 5246 } 5247 5248 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5249 Instruction *I, ElementCount VF) { 5250 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5251 assert(getWideningDecision(I, VF) == CM_Unknown && 5252 "Decision should not be set yet."); 5253 auto *Group = getInterleavedAccessGroup(I); 5254 assert(Group && "Must have a group."); 5255 5256 // If the instruction's allocated size doesn't equal it's type size, it 5257 // requires padding and will be scalarized. 5258 auto &DL = I->getModule()->getDataLayout(); 5259 auto *ScalarTy = getMemInstValueType(I); 5260 if (hasIrregularType(ScalarTy, DL, VF)) 5261 return false; 5262 5263 // Check if masking is required. 5264 // A Group may need masking for one of two reasons: it resides in a block that 5265 // needs predication, or it was decided to use masking to deal with gaps. 5266 bool PredicatedAccessRequiresMasking = 5267 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5268 bool AccessWithGapsRequiresMasking = 5269 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5270 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5271 return true; 5272 5273 // If masked interleaving is required, we expect that the user/target had 5274 // enabled it, because otherwise it either wouldn't have been created or 5275 // it should have been invalidated by the CostModel. 5276 assert(useMaskedInterleavedAccesses(TTI) && 5277 "Masked interleave-groups for predicated accesses are not enabled."); 5278 5279 auto *Ty = getMemInstValueType(I); 5280 const Align Alignment = getLoadStoreAlignment(I); 5281 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5282 : TTI.isLegalMaskedStore(Ty, Alignment); 5283 } 5284 5285 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5286 Instruction *I, ElementCount VF) { 5287 // Get and ensure we have a valid memory instruction. 
5288 LoadInst *LI = dyn_cast<LoadInst>(I); 5289 StoreInst *SI = dyn_cast<StoreInst>(I); 5290 assert((LI || SI) && "Invalid memory instruction"); 5291 5292 auto *Ptr = getLoadStorePointerOperand(I); 5293 5294 // In order to be widened, the pointer should be consecutive, first of all. 5295 if (!Legal->isConsecutivePtr(Ptr)) 5296 return false; 5297 5298 // If the instruction is a store located in a predicated block, it will be 5299 // scalarized. 5300 if (isScalarWithPredication(I)) 5301 return false; 5302 5303 // If the instruction's allocated size doesn't equal it's type size, it 5304 // requires padding and will be scalarized. 5305 auto &DL = I->getModule()->getDataLayout(); 5306 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5307 if (hasIrregularType(ScalarTy, DL, VF)) 5308 return false; 5309 5310 return true; 5311 } 5312 5313 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5314 // We should not collect Uniforms more than once per VF. Right now, 5315 // this function is called from collectUniformsAndScalars(), which 5316 // already does this check. Collecting Uniforms for VF=1 does not make any 5317 // sense. 5318 5319 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5320 "This function should not be visited twice for the same VF"); 5321 5322 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5323 // not analyze again. Uniforms.count(VF) will return 1. 5324 Uniforms[VF].clear(); 5325 5326 // We now know that the loop is vectorizable! 5327 // Collect instructions inside the loop that will remain uniform after 5328 // vectorization. 5329 5330 // Global values, params and instructions outside of current loop are out of 5331 // scope. 5332 auto isOutOfScope = [&](Value *V) -> bool { 5333 Instruction *I = dyn_cast<Instruction>(V); 5334 return (!I || !TheLoop->contains(I)); 5335 }; 5336 5337 SetVector<Instruction *> Worklist; 5338 BasicBlock *Latch = TheLoop->getLoopLatch(); 5339 5340 // Instructions that are scalar with predication must not be considered 5341 // uniform after vectorization, because that would create an erroneous 5342 // replicating region where only a single instance out of VF should be formed. 5343 // TODO: optimize such seldom cases if found important, see PR40816. 5344 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5345 if (isOutOfScope(I)) { 5346 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5347 << *I << "\n"); 5348 return; 5349 } 5350 if (isScalarWithPredication(I, VF)) { 5351 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5352 << *I << "\n"); 5353 return; 5354 } 5355 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5356 Worklist.insert(I); 5357 }; 5358 5359 // Start with the conditional branch. If the branch condition is an 5360 // instruction contained in the loop that is only used by the branch, it is 5361 // uniform. 5362 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5363 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5364 addToWorklistIfAllowed(Cmp); 5365 5366 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5367 InstWidening WideningDecision = getWideningDecision(I, VF); 5368 assert(WideningDecision != CM_Unknown && 5369 "Widening decision should be ready at this moment"); 5370 5371 // A uniform memory op is itself uniform. We exclude uniform stores 5372 // here as they demand the last lane, not the first one. 
5373 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5374 assert(WideningDecision == CM_Scalarize); 5375 return true; 5376 } 5377 5378 return (WideningDecision == CM_Widen || 5379 WideningDecision == CM_Widen_Reverse || 5380 WideningDecision == CM_Interleave); 5381 }; 5382 5383 5384 // Returns true if Ptr is the pointer operand of a memory access instruction 5385 // I, and I is known to not require scalarization. 5386 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5387 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5388 }; 5389 5390 // Holds a list of values which are known to have at least one uniform use. 5391 // Note that there may be other uses which aren't uniform. A "uniform use" 5392 // here is something which only demands lane 0 of the unrolled iterations; 5393 // it does not imply that all lanes produce the same value (e.g. this is not 5394 // the usual meaning of uniform) 5395 SmallPtrSet<Value *, 8> HasUniformUse; 5396 5397 // Scan the loop for instructions which are either a) known to have only 5398 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5399 for (auto *BB : TheLoop->blocks()) 5400 for (auto &I : *BB) { 5401 // If there's no pointer operand, there's nothing to do. 5402 auto *Ptr = getLoadStorePointerOperand(&I); 5403 if (!Ptr) 5404 continue; 5405 5406 // A uniform memory op is itself uniform. We exclude uniform stores 5407 // here as they demand the last lane, not the first one. 5408 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5409 addToWorklistIfAllowed(&I); 5410 5411 if (isUniformDecision(&I, VF)) { 5412 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5413 HasUniformUse.insert(Ptr); 5414 } 5415 } 5416 5417 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5418 // demanding) users. Since loops are assumed to be in LCSSA form, this 5419 // disallows uses outside the loop as well. 5420 for (auto *V : HasUniformUse) { 5421 if (isOutOfScope(V)) 5422 continue; 5423 auto *I = cast<Instruction>(V); 5424 auto UsersAreMemAccesses = 5425 llvm::all_of(I->users(), [&](User *U) -> bool { 5426 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5427 }); 5428 if (UsersAreMemAccesses) 5429 addToWorklistIfAllowed(I); 5430 } 5431 5432 // Expand Worklist in topological order: whenever a new instruction 5433 // is added , its users should be already inside Worklist. It ensures 5434 // a uniform instruction will only be used by uniform instructions. 5435 unsigned idx = 0; 5436 while (idx != Worklist.size()) { 5437 Instruction *I = Worklist[idx++]; 5438 5439 for (auto OV : I->operand_values()) { 5440 // isOutOfScope operands cannot be uniform instructions. 5441 if (isOutOfScope(OV)) 5442 continue; 5443 // First order recurrence Phi's should typically be considered 5444 // non-uniform. 5445 auto *OP = dyn_cast<PHINode>(OV); 5446 if (OP && Legal->isFirstOrderRecurrence(OP)) 5447 continue; 5448 // If all the users of the operand are uniform, then add the 5449 // operand into the uniform worklist. 5450 auto *OI = cast<Instruction>(OV); 5451 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5452 auto *J = cast<Instruction>(U); 5453 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5454 })) 5455 addToWorklistIfAllowed(OI); 5456 } 5457 } 5458 5459 // For an instruction to be added into Worklist above, all its users inside 5460 // the loop should also be in Worklist. 
However, this condition cannot be 5461 // true for phi nodes that form a cyclic dependence. We must process phi 5462 // nodes separately. An induction variable will remain uniform if all users 5463 // of the induction variable and induction variable update remain uniform. 5464 // The code below handles both pointer and non-pointer induction variables. 5465 for (auto &Induction : Legal->getInductionVars()) { 5466 auto *Ind = Induction.first; 5467 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5468 5469 // Determine if all users of the induction variable are uniform after 5470 // vectorization. 5471 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5472 auto *I = cast<Instruction>(U); 5473 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5474 isVectorizedMemAccessUse(I, Ind); 5475 }); 5476 if (!UniformInd) 5477 continue; 5478 5479 // Determine if all users of the induction variable update instruction are 5480 // uniform after vectorization. 5481 auto UniformIndUpdate = 5482 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5483 auto *I = cast<Instruction>(U); 5484 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5485 isVectorizedMemAccessUse(I, IndUpdate); 5486 }); 5487 if (!UniformIndUpdate) 5488 continue; 5489 5490 // The induction variable and its update instruction will remain uniform. 5491 addToWorklistIfAllowed(Ind); 5492 addToWorklistIfAllowed(IndUpdate); 5493 } 5494 5495 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5496 } 5497 5498 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5499 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5500 5501 if (Legal->getRuntimePointerChecking()->Need) { 5502 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5503 "runtime pointer checks needed. Enable vectorization of this " 5504 "loop with '#pragma clang loop vectorize(enable)' when " 5505 "compiling with -Os/-Oz", 5506 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5507 return true; 5508 } 5509 5510 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5511 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5512 "runtime SCEV checks needed. Enable vectorization of this " 5513 "loop with '#pragma clang loop vectorize(enable)' when " 5514 "compiling with -Os/-Oz", 5515 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5516 return true; 5517 } 5518 5519 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5520 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5521 reportVectorizationFailure("Runtime stride check for small trip count", 5522 "runtime stride == 1 checks needed. Enable vectorization of " 5523 "this loop without such check by compiling with -Os/-Oz", 5524 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5525 return true; 5526 } 5527 5528 return false; 5529 } 5530 5531 Optional<ElementCount> 5532 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5533 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5534 // TODO: It may by useful to do since it's still likely to be dynamically 5535 // uniform if the target can skip. 5536 reportVectorizationFailure( 5537 "Not inserting runtime ptr check for divergent target", 5538 "runtime pointer checks needed. 
Not enabled for divergent target", 5539 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5540 return None; 5541 } 5542 5543 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5544 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5545 if (TC == 1) { 5546 reportVectorizationFailure("Single iteration (non) loop", 5547 "loop trip count is one, irrelevant for vectorization", 5548 "SingleIterationLoop", ORE, TheLoop); 5549 return None; 5550 } 5551 5552 switch (ScalarEpilogueStatus) { 5553 case CM_ScalarEpilogueAllowed: 5554 return computeFeasibleMaxVF(TC, UserVF); 5555 case CM_ScalarEpilogueNotAllowedUsePredicate: 5556 LLVM_FALLTHROUGH; 5557 case CM_ScalarEpilogueNotNeededUsePredicate: 5558 LLVM_DEBUG( 5559 dbgs() << "LV: vector predicate hint/switch found.\n" 5560 << "LV: Not allowing scalar epilogue, creating predicated " 5561 << "vector loop.\n"); 5562 break; 5563 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5564 // fallthrough as a special case of OptForSize 5565 case CM_ScalarEpilogueNotAllowedOptSize: 5566 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5567 LLVM_DEBUG( 5568 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5569 else 5570 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5571 << "count.\n"); 5572 5573 // Bail if runtime checks are required, which are not good when optimising 5574 // for size. 5575 if (runtimeChecksRequired()) 5576 return None; 5577 5578 break; 5579 } 5580 5581 // The only loops we can vectorize without a scalar epilogue, are loops with 5582 // a bottom-test and a single exiting block. We'd have to handle the fact 5583 // that not every instruction executes on the last iteration. This will 5584 // require a lane mask which varies through the vector loop body. (TODO) 5585 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5586 // If there was a tail-folding hint/switch, but we can't fold the tail by 5587 // masking, fallback to a vectorization with a scalar epilogue. 5588 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5589 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5590 "scalar epilogue instead.\n"); 5591 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5592 return computeFeasibleMaxVF(TC, UserVF); 5593 } 5594 return None; 5595 } 5596 5597 // Now try the tail folding 5598 5599 // Invalidate interleave groups that require an epilogue if we can't mask 5600 // the interleave-group. 5601 if (!useMaskedInterleavedAccesses(TTI)) { 5602 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5603 "No decisions should have been taken at this point"); 5604 // Note: There is no need to invalidate any cost modeling decisions here, as 5605 // non where taken so far. 5606 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5607 } 5608 5609 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); 5610 assert(!MaxVF.isScalable() && 5611 "Scalable vectors do not yet support tail folding"); 5612 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && 5613 "MaxVF must be a power of 2"); 5614 unsigned MaxVFtimesIC = 5615 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); 5616 // Avoid tail folding if the trip count is known to be a multiple of any VF we 5617 // chose. 
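  // For example (illustrative numbers): with a known trip count of 64,
  // MaxVF = 8 and UserIC = 2, MaxVFtimesIC is 16 and 64 % 16 == 0, so no
  // tail remains and MaxVF can be returned without folding the tail.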
5618 ScalarEvolution *SE = PSE.getSE(); 5619 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5620 const SCEV *ExitCount = SE->getAddExpr( 5621 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5622 const SCEV *Rem = SE->getURemExpr( 5623 SE->applyLoopGuards(ExitCount, TheLoop), 5624 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5625 if (Rem->isZero()) { 5626 // Accept MaxVF if we do not have a tail. 5627 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5628 return MaxVF; 5629 } 5630 5631 // If we don't know the precise trip count, or if the trip count that we 5632 // found modulo the vectorization factor is not zero, try to fold the tail 5633 // by masking. 5634 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5635 if (Legal->prepareToFoldTailByMasking()) { 5636 FoldTailByMasking = true; 5637 return MaxVF; 5638 } 5639 5640 // If there was a tail-folding hint/switch, but we can't fold the tail by 5641 // masking, fallback to a vectorization with a scalar epilogue. 5642 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5643 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5644 "scalar epilogue instead.\n"); 5645 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5646 return MaxVF; 5647 } 5648 5649 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5650 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5651 return None; 5652 } 5653 5654 if (TC == 0) { 5655 reportVectorizationFailure( 5656 "Unable to calculate the loop count due to complex control flow", 5657 "unable to calculate the loop count due to complex control flow", 5658 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5659 return None; 5660 } 5661 5662 reportVectorizationFailure( 5663 "Cannot optimize for size and vectorize at the same time.", 5664 "cannot optimize for size and vectorize at the same time. " 5665 "Enable vectorization of this loop with '#pragma clang loop " 5666 "vectorize(enable)' when compiling with -Os/-Oz", 5667 "NoTailLoopWithOptForSize", ORE, TheLoop); 5668 return None; 5669 } 5670 5671 ElementCount 5672 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5673 ElementCount UserVF) { 5674 bool IgnoreScalableUserVF = UserVF.isScalable() && 5675 !TTI.supportsScalableVectors() && 5676 !ForceTargetSupportsScalableVectors; 5677 if (IgnoreScalableUserVF) { 5678 LLVM_DEBUG( 5679 dbgs() << "LV: Ignoring VF=" << UserVF 5680 << " because target does not support scalable vectors.\n"); 5681 ORE->emit([&]() { 5682 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5683 TheLoop->getStartLoc(), 5684 TheLoop->getHeader()) 5685 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5686 << " because target does not support scalable vectors."; 5687 }); 5688 } 5689 5690 // Beyond this point two scenarios are handled. If UserVF isn't specified 5691 // then a suitable VF is chosen. If UserVF is specified and there are 5692 // dependencies, check if it's legal. However, if a UserVF is specified and 5693 // there are no dependencies, then there's nothing to do. 5694 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5695 if (!canVectorizeReductions(UserVF)) { 5696 reportVectorizationFailure( 5697 "LV: Scalable vectorization not supported for the reduction " 5698 "operations found in this loop. 
Using fixed-width " 5699 "vectorization instead.", 5700 "Scalable vectorization not supported for the reduction operations " 5701 "found in this loop. Using fixed-width vectorization instead.", 5702 "ScalableVFUnfeasible", ORE, TheLoop); 5703 return computeFeasibleMaxVF( 5704 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5705 } 5706 5707 if (Legal->isSafeForAnyVectorWidth()) 5708 return UserVF; 5709 } 5710 5711 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5712 unsigned SmallestType, WidestType; 5713 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5714 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5715 5716 // Get the maximum safe dependence distance in bits computed by LAA. 5717 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5718 // the memory accesses that is most restrictive (involved in the smallest 5719 // dependence distance). 5720 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5721 5722 // If the user vectorization factor is legally unsafe, clamp it to a safe 5723 // value. Otherwise, return as is. 5724 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5725 unsigned MaxSafeElements = 5726 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5727 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5728 5729 if (UserVF.isScalable()) { 5730 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5731 5732 // Scale VF by vscale before checking if it's safe. 5733 MaxSafeVF = ElementCount::getScalable( 5734 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5735 5736 if (MaxSafeVF.isZero()) { 5737 // The dependence distance is too small to use scalable vectors, 5738 // fallback on fixed. 5739 LLVM_DEBUG( 5740 dbgs() 5741 << "LV: Max legal vector width too small, scalable vectorization " 5742 "unfeasible. Using fixed-width vectorization instead.\n"); 5743 ORE->emit([&]() { 5744 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5745 TheLoop->getStartLoc(), 5746 TheLoop->getHeader()) 5747 << "Max legal vector width too small, scalable vectorization " 5748 << "unfeasible. Using fixed-width vectorization instead."; 5749 }); 5750 return computeFeasibleMaxVF( 5751 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5752 } 5753 } 5754 5755 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5756 5757 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5758 return UserVF; 5759 5760 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5761 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5762 << ".\n"); 5763 ORE->emit([&]() { 5764 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5765 TheLoop->getStartLoc(), 5766 TheLoop->getHeader()) 5767 << "User-specified vectorization factor " 5768 << ore::NV("UserVectorizationFactor", UserVF) 5769 << " is unsafe, clamping to maximum safe vectorization factor " 5770 << ore::NV("VectorizationFactor", MaxSafeVF); 5771 }); 5772 return MaxSafeVF; 5773 } 5774 5775 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5776 5777 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5778 // Note that both WidestRegister and WidestType may not be a powers of 2. 
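  // For example (illustrative numbers): a 128-bit widest safe register and a
  // widest element type of 32 bits give PowerOf2Floor(128 / 32), i.e. a fixed
  // MaxVectorSize of 4 lanes.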
5779 auto MaxVectorSize = 5780 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); 5781 5782 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5783 << " / " << WidestType << " bits.\n"); 5784 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5785 << WidestRegister << " bits.\n"); 5786 5787 assert(MaxVectorSize.getFixedValue() <= WidestRegister && 5788 "Did not expect to pack so many elements" 5789 " into one vector!"); 5790 if (MaxVectorSize.getFixedValue() == 0) { 5791 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5792 return ElementCount::getFixed(1); 5793 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && 5794 isPowerOf2_32(ConstTripCount)) { 5795 // We need to clamp the VF to be the ConstTripCount. There is no point in 5796 // choosing a higher viable VF as done in the loop below. 5797 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5798 << ConstTripCount << "\n"); 5799 return ElementCount::getFixed(ConstTripCount); 5800 } 5801 5802 ElementCount MaxVF = MaxVectorSize; 5803 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5804 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5805 // Collect all viable vectorization factors larger than the default MaxVF 5806 // (i.e. MaxVectorSize). 5807 SmallVector<ElementCount, 8> VFs; 5808 auto MaxVectorSizeMaxBW = 5809 ElementCount::getFixed(WidestRegister / SmallestType); 5810 for (ElementCount VS = MaxVectorSize * 2; 5811 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) 5812 VFs.push_back(VS); 5813 5814 // For each VF calculate its register usage. 5815 auto RUs = calculateRegisterUsage(VFs); 5816 5817 // Select the largest VF which doesn't require more registers than existing 5818 // ones. 5819 for (int i = RUs.size() - 1; i >= 0; --i) { 5820 bool Selected = true; 5821 for (auto &pair : RUs[i].MaxLocalUsers) { 5822 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5823 if (pair.second > TargetNumRegisters) 5824 Selected = false; 5825 } 5826 if (Selected) { 5827 MaxVF = VFs[i]; 5828 break; 5829 } 5830 } 5831 if (ElementCount MinVF = 5832 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { 5833 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5834 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5835 << ") with target's minimum: " << MinVF << '\n'); 5836 MaxVF = MinVF; 5837 } 5838 } 5839 } 5840 return MaxVF; 5841 } 5842 5843 VectorizationFactor 5844 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5845 // FIXME: This can be fixed for scalable vectors later, because at this stage 5846 // the LoopVectorizer will only consider vectorizing a loop with scalable 5847 // vectors when the loop has a hint to enable vectorization for a given VF. 5848 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5849 5850 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5851 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5852 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5853 5854 auto Width = ElementCount::getFixed(1); 5855 const float ScalarCost = *ExpectedCost.getValue(); 5856 float Cost = ScalarCost; 5857 5858 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5859 if (ForceVectorization && MaxVF.isVector()) { 5860 // Ignore scalar width, because the user explicitly wants vectorization. 
5861 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5862 // evaluation. 5863 Cost = std::numeric_limits<float>::max(); 5864 } 5865 5866 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 5867 i *= 2) { 5868 // Notice that the vector loop needs to be executed less times, so 5869 // we need to divide the cost of the vector loops by the width of 5870 // the vector elements. 5871 VectorizationCostTy C = expectedCost(i); 5872 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5873 float VectorCost = *C.first.getValue() / (float)i.getFixedValue(); 5874 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5875 << " costs: " << (int)VectorCost << ".\n"); 5876 if (!C.second && !ForceVectorization) { 5877 LLVM_DEBUG( 5878 dbgs() << "LV: Not considering vector loop of width " << i 5879 << " because it will not generate any vector instructions.\n"); 5880 continue; 5881 } 5882 5883 // If profitable add it to ProfitableVF list. 5884 if (VectorCost < ScalarCost) { 5885 ProfitableVFs.push_back(VectorizationFactor( 5886 {i, (unsigned)VectorCost})); 5887 } 5888 5889 if (VectorCost < Cost) { 5890 Cost = VectorCost; 5891 Width = i; 5892 } 5893 } 5894 5895 if (!EnableCondStoresVectorization && NumPredStores) { 5896 reportVectorizationFailure("There are conditional stores.", 5897 "store that is conditionally executed prevents vectorization", 5898 "ConditionalStore", ORE, TheLoop); 5899 Width = ElementCount::getFixed(1); 5900 Cost = ScalarCost; 5901 } 5902 5903 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs() 5904 << "LV: Vectorization seems to be not beneficial, " 5905 << "but was forced by a user.\n"); 5906 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5907 VectorizationFactor Factor = {Width, 5908 (unsigned)(Width.getKnownMinValue() * Cost)}; 5909 return Factor; 5910 } 5911 5912 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5913 const Loop &L, ElementCount VF) const { 5914 // Cross iteration phis such as reductions need special handling and are 5915 // currently unsupported. 5916 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5917 return Legal->isFirstOrderRecurrence(&Phi) || 5918 Legal->isReductionVariable(&Phi); 5919 })) 5920 return false; 5921 5922 // Phis with uses outside of the loop require special handling and are 5923 // currently unsupported. 5924 for (auto &Entry : Legal->getInductionVars()) { 5925 // Look for uses of the value of the induction at the last iteration. 5926 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5927 for (User *U : PostInc->users()) 5928 if (!L.contains(cast<Instruction>(U))) 5929 return false; 5930 // Look for uses of penultimate value of the induction. 5931 for (User *U : Entry.first->users()) 5932 if (!L.contains(cast<Instruction>(U))) 5933 return false; 5934 } 5935 5936 // Induction variables that are widened require special handling that is 5937 // currently not supported. 
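  // That is, each induction variable must either remain scalar after
  // vectorization or be profitable to scalarize for this VF; otherwise it
  // would be widened, which the epilogue path does not support yet.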
5938 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5939 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5940 this->isProfitableToScalarize(Entry.first, VF)); 5941 })) 5942 return false; 5943 5944 return true; 5945 } 5946 5947 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5948 const ElementCount VF) const { 5949 // FIXME: We need a much better cost-model to take different parameters such 5950 // as register pressure, code size increase and cost of extra branches into 5951 // account. For now we apply a very crude heuristic and only consider loops 5952 // with vectorization factors larger than a certain value. 5953 // We also consider epilogue vectorization unprofitable for targets that don't 5954 // consider interleaving beneficial (eg. MVE). 5955 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5956 return false; 5957 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5958 return true; 5959 return false; 5960 } 5961 5962 VectorizationFactor 5963 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5964 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5965 VectorizationFactor Result = VectorizationFactor::Disabled(); 5966 if (!EnableEpilogueVectorization) { 5967 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5968 return Result; 5969 } 5970 5971 if (!isScalarEpilogueAllowed()) { 5972 LLVM_DEBUG( 5973 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5974 "allowed.\n";); 5975 return Result; 5976 } 5977 5978 // FIXME: This can be fixed for scalable vectors later, because at this stage 5979 // the LoopVectorizer will only consider vectorizing a loop with scalable 5980 // vectors when the loop has a hint to enable vectorization for a given VF. 5981 if (MainLoopVF.isScalable()) { 5982 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5983 "yet supported.\n"); 5984 return Result; 5985 } 5986 5987 // Not really a cost consideration, but check for unsupported cases here to 5988 // simplify the logic. 
5989 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5990 LLVM_DEBUG( 5991 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5992 "not a supported candidate.\n";); 5993 return Result; 5994 } 5995 5996 if (EpilogueVectorizationForceVF > 1) { 5997 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5998 if (LVP.hasPlanWithVFs( 5999 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6000 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6001 else { 6002 LLVM_DEBUG( 6003 dbgs() 6004 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6005 return Result; 6006 } 6007 } 6008 6009 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6010 TheLoop->getHeader()->getParent()->hasMinSize()) { 6011 LLVM_DEBUG( 6012 dbgs() 6013 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6014 return Result; 6015 } 6016 6017 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6018 return Result; 6019 6020 for (auto &NextVF : ProfitableVFs) 6021 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6022 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 6023 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6024 Result = NextVF; 6025 6026 if (Result != VectorizationFactor::Disabled()) 6027 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6028 << Result.Width.getFixedValue() << "\n";); 6029 return Result; 6030 } 6031 6032 std::pair<unsigned, unsigned> 6033 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6034 unsigned MinWidth = -1U; 6035 unsigned MaxWidth = 8; 6036 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6037 6038 // For each block. 6039 for (BasicBlock *BB : TheLoop->blocks()) { 6040 // For each instruction in the loop. 6041 for (Instruction &I : BB->instructionsWithoutDebug()) { 6042 Type *T = I.getType(); 6043 6044 // Skip ignored values. 6045 if (ValuesToIgnore.count(&I)) 6046 continue; 6047 6048 // Only examine Loads, Stores and PHINodes. 6049 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6050 continue; 6051 6052 // Examine PHI nodes that are reduction variables. Update the type to 6053 // account for the recurrence type. 6054 if (auto *PN = dyn_cast<PHINode>(&I)) { 6055 if (!Legal->isReductionVariable(PN)) 6056 continue; 6057 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6058 if (PreferInLoopReductions || 6059 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6060 RdxDesc.getRecurrenceType(), 6061 TargetTransformInfo::ReductionFlags())) 6062 continue; 6063 T = RdxDesc.getRecurrenceType(); 6064 } 6065 6066 // Examine the stored values. 6067 if (auto *ST = dyn_cast<StoreInst>(&I)) 6068 T = ST->getValueOperand()->getType(); 6069 6070 // Ignore loaded pointer types and stored pointer types that are not 6071 // vectorizable. 6072 // 6073 // FIXME: The check here attempts to predict whether a load or store will 6074 // be vectorized. We only know this for certain after a VF has 6075 // been selected. Here, we assume that if an access can be 6076 // vectorized, it will be. We should also look at extending this 6077 // optimization to non-pointer types. 
6078 // 6079 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6080 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6081 continue; 6082 6083 MinWidth = std::min(MinWidth, 6084 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6085 MaxWidth = std::max(MaxWidth, 6086 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6087 } 6088 } 6089 6090 return {MinWidth, MaxWidth}; 6091 } 6092 6093 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6094 unsigned LoopCost) { 6095 // -- The interleave heuristics -- 6096 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6097 // There are many micro-architectural considerations that we can't predict 6098 // at this level. For example, frontend pressure (on decode or fetch) due to 6099 // code size, or the number and capabilities of the execution ports. 6100 // 6101 // We use the following heuristics to select the interleave count: 6102 // 1. If the code has reductions, then we interleave to break the cross 6103 // iteration dependency. 6104 // 2. If the loop is really small, then we interleave to reduce the loop 6105 // overhead. 6106 // 3. We don't interleave if we think that we will spill registers to memory 6107 // due to the increased register pressure. 6108 6109 if (!isScalarEpilogueAllowed()) 6110 return 1; 6111 6112 // We used the distance for the interleave count. 6113 if (Legal->getMaxSafeDepDistBytes() != -1U) 6114 return 1; 6115 6116 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6117 const bool HasReductions = !Legal->getReductionVars().empty(); 6118 // Do not interleave loops with a relatively small known or estimated trip 6119 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6120 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6121 // because with the above conditions interleaving can expose ILP and break 6122 // cross iteration dependences for reductions. 6123 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6124 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6125 return 1; 6126 6127 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6128 // We divide by these constants so assume that we have at least one 6129 // instruction that uses at least one register. 6130 for (auto& pair : R.MaxLocalUsers) { 6131 pair.second = std::max(pair.second, 1U); 6132 } 6133 6134 // We calculate the interleave count using the following formula. 6135 // Subtract the number of loop invariants from the number of available 6136 // registers. These registers are used by all of the interleaved instances. 6137 // Next, divide the remaining registers by the number of registers that is 6138 // required by the loop, in order to estimate how many parallel instances 6139 // fit without causing spills. All of this is rounded down if necessary to be 6140 // a power of two. We want power of two interleave count to simplify any 6141 // addressing operations or alignment considerations. 6142 // We also want power of two interleave counts to ensure that the induction 6143 // variable of the vector loop wraps to zero, when tail is folded by masking; 6144 // this currently happens when OptForSize, in which case IC is set to 1 above. 
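  // For example (illustrative numbers): with 32 registers in a class, 2
  // loop-invariant values and at most 6 values live at once, the estimate is
  // PowerOf2Floor((32 - 2) / 6) = 4 interleaved instances.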
6145 unsigned IC = UINT_MAX; 6146 6147 for (auto& pair : R.MaxLocalUsers) { 6148 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6149 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6150 << " registers of " 6151 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6152 if (VF.isScalar()) { 6153 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6154 TargetNumRegisters = ForceTargetNumScalarRegs; 6155 } else { 6156 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6157 TargetNumRegisters = ForceTargetNumVectorRegs; 6158 } 6159 unsigned MaxLocalUsers = pair.second; 6160 unsigned LoopInvariantRegs = 0; 6161 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6162 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6163 6164 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6165 // Don't count the induction variable as interleaved. 6166 if (EnableIndVarRegisterHeur) { 6167 TmpIC = 6168 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6169 std::max(1U, (MaxLocalUsers - 1))); 6170 } 6171 6172 IC = std::min(IC, TmpIC); 6173 } 6174 6175 // Clamp the interleave ranges to reasonable counts. 6176 unsigned MaxInterleaveCount = 6177 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6178 6179 // Check if the user has overridden the max. 6180 if (VF.isScalar()) { 6181 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6182 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6183 } else { 6184 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6185 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6186 } 6187 6188 // If trip count is known or estimated compile time constant, limit the 6189 // interleave count to be less than the trip count divided by VF, provided it 6190 // is at least 1. 6191 // 6192 // For scalable vectors we can't know if interleaving is beneficial. It may 6193 // not be beneficial for small loops if none of the lanes in the second vector 6194 // iterations is enabled. However, for larger loops, there is likely to be a 6195 // similar benefit as for fixed-width vectors. For now, we choose to leave 6196 // the InterleaveCount as if vscale is '1', although if some information about 6197 // the vector is known (e.g. min vector size), we can make a better decision. 6198 if (BestKnownTC) { 6199 MaxInterleaveCount = 6200 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6201 // Make sure MaxInterleaveCount is greater than 0. 6202 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6203 } 6204 6205 assert(MaxInterleaveCount > 0 && 6206 "Maximum interleave count must be greater than 0"); 6207 6208 // Clamp the calculated IC to be between the 1 and the max interleave count 6209 // that the target and trip count allows. 6210 if (IC > MaxInterleaveCount) 6211 IC = MaxInterleaveCount; 6212 else 6213 // Make sure IC is greater than 0. 6214 IC = std::max(1u, IC); 6215 6216 assert(IC > 0 && "Interleave count must be greater than 0."); 6217 6218 // If we did not calculate the cost for VF (because the user selected the VF) 6219 // then we calculate the cost of VF here. 6220 if (LoopCost == 0) { 6221 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6222 LoopCost = *expectedCost(VF).first.getValue(); 6223 } 6224 6225 assert(LoopCost && "Non-zero loop cost expected"); 6226 6227 // Interleave if we vectorized this loop and there is a reduction that could 6228 // benefit from interleaving. 
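  // The decisions below are tried in order: vector loops with reductions
  // interleave up to IC; small loops interleave to reduce loop overhead, to
  // saturate load/store ports, or to expose ILP for scalar reductions; large
  // loops interleave only if the target reports that aggressive interleaving
  // of reductions is beneficial.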
6229 if (VF.isVector() && HasReductions) { 6230 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6231 return IC; 6232 } 6233 6234 // Note that if we've already vectorized the loop we will have done the 6235 // runtime check and so interleaving won't require further checks. 6236 bool InterleavingRequiresRuntimePointerCheck = 6237 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6238 6239 // We want to interleave small loops in order to reduce the loop overhead and 6240 // potentially expose ILP opportunities. 6241 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6242 << "LV: IC is " << IC << '\n' 6243 << "LV: VF is " << VF << '\n'); 6244 const bool AggressivelyInterleaveReductions = 6245 TTI.enableAggressiveInterleaving(HasReductions); 6246 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6247 // We assume that the cost overhead is 1 and we use the cost model 6248 // to estimate the cost of the loop and interleave until the cost of the 6249 // loop overhead is about 5% of the cost of the loop. 6250 unsigned SmallIC = 6251 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6252 6253 // Interleave until store/load ports (estimated by max interleave count) are 6254 // saturated. 6255 unsigned NumStores = Legal->getNumStores(); 6256 unsigned NumLoads = Legal->getNumLoads(); 6257 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6258 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6259 6260 // If we have a scalar reduction (vector reductions are already dealt with 6261 // by this point), we can increase the critical path length if the loop 6262 // we're interleaving is inside another loop. Limit, by default to 2, so the 6263 // critical path only gets increased by one reduction operation. 6264 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6265 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6266 SmallIC = std::min(SmallIC, F); 6267 StoresIC = std::min(StoresIC, F); 6268 LoadsIC = std::min(LoadsIC, F); 6269 } 6270 6271 if (EnableLoadStoreRuntimeInterleave && 6272 std::max(StoresIC, LoadsIC) > SmallIC) { 6273 LLVM_DEBUG( 6274 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6275 return std::max(StoresIC, LoadsIC); 6276 } 6277 6278 // If there are scalar reductions and TTI has enabled aggressive 6279 // interleaving for reductions, we will interleave to expose ILP. 6280 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6281 AggressivelyInterleaveReductions) { 6282 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6283 // Interleave no less than SmallIC but not as aggressive as the normal IC 6284 // to satisfy the rare situation when resources are too limited. 6285 return std::max(IC / 2, SmallIC); 6286 } else { 6287 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6288 return SmallIC; 6289 } 6290 } 6291 6292 // Interleave if this is a large loop (small loops are already dealt with by 6293 // this point) that could benefit from interleaving. 6294 if (AggressivelyInterleaveReductions) { 6295 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6296 return IC; 6297 } 6298 6299 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6300 return 1; 6301 } 6302 6303 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6304 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6305 // This function calculates the register usage by measuring the highest number 6306 // of values that are alive at a single location. 
Obviously, this is a very 6307 // rough estimation. We scan the loop in a topological order in order and 6308 // assign a number to each instruction. We use RPO to ensure that defs are 6309 // met before their users. We assume that each instruction that has in-loop 6310 // users starts an interval. We record every time that an in-loop value is 6311 // used, so we have a list of the first and last occurrences of each 6312 // instruction. Next, we transpose this data structure into a multi map that 6313 // holds the list of intervals that *end* at a specific location. This multi 6314 // map allows us to perform a linear search. We scan the instructions linearly 6315 // and record each time that a new interval starts, by placing it in a set. 6316 // If we find this value in the multi-map then we remove it from the set. 6317 // The max register usage is the maximum size of the set. 6318 // We also search for instructions that are defined outside the loop, but are 6319 // used inside the loop. We need this number separately from the max-interval 6320 // usage number because when we unroll, loop-invariant values do not take 6321 // more register. 6322 LoopBlocksDFS DFS(TheLoop); 6323 DFS.perform(LI); 6324 6325 RegisterUsage RU; 6326 6327 // Each 'key' in the map opens a new interval. The values 6328 // of the map are the index of the 'last seen' usage of the 6329 // instruction that is the key. 6330 using IntervalMap = DenseMap<Instruction *, unsigned>; 6331 6332 // Maps instruction to its index. 6333 SmallVector<Instruction *, 64> IdxToInstr; 6334 // Marks the end of each interval. 6335 IntervalMap EndPoint; 6336 // Saves the list of instruction indices that are used in the loop. 6337 SmallPtrSet<Instruction *, 8> Ends; 6338 // Saves the list of values that are used in the loop but are 6339 // defined outside the loop, such as arguments and constants. 6340 SmallPtrSet<Value *, 8> LoopInvariants; 6341 6342 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6343 for (Instruction &I : BB->instructionsWithoutDebug()) { 6344 IdxToInstr.push_back(&I); 6345 6346 // Save the end location of each USE. 6347 for (Value *U : I.operands()) { 6348 auto *Instr = dyn_cast<Instruction>(U); 6349 6350 // Ignore non-instruction values such as arguments, constants, etc. 6351 if (!Instr) 6352 continue; 6353 6354 // If this instruction is outside the loop then record it and continue. 6355 if (!TheLoop->contains(Instr)) { 6356 LoopInvariants.insert(Instr); 6357 continue; 6358 } 6359 6360 // Overwrite previous end points. 6361 EndPoint[Instr] = IdxToInstr.size(); 6362 Ends.insert(Instr); 6363 } 6364 } 6365 } 6366 6367 // Saves the list of intervals that end with the index in 'key'. 6368 using InstrList = SmallVector<Instruction *, 2>; 6369 DenseMap<unsigned, InstrList> TransposeEnds; 6370 6371 // Transpose the EndPoints to a list of values that end at each index. 6372 for (auto &Interval : EndPoint) 6373 TransposeEnds[Interval.second].push_back(Interval.first); 6374 6375 SmallPtrSet<Instruction *, 8> OpenIntervals; 6376 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6377 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6378 6379 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6380 6381 // A lambda that gets the register usage for the given type and VF. 
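  // Types that cannot legally be made into vectors (e.g. token types) count as
  // zero registers; any other type is costed as a vector of the element type,
  // e.g. an i32 value at VF = 4 is queried as <4 x i32>.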
6382 const auto &TTICapture = TTI; 6383 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6384 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6385 return 0U; 6386 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6387 }; 6388 6389 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6390 Instruction *I = IdxToInstr[i]; 6391 6392 // Remove all of the instructions that end at this location. 6393 InstrList &List = TransposeEnds[i]; 6394 for (Instruction *ToRemove : List) 6395 OpenIntervals.erase(ToRemove); 6396 6397 // Ignore instructions that are never used within the loop. 6398 if (!Ends.count(I)) 6399 continue; 6400 6401 // Skip ignored values. 6402 if (ValuesToIgnore.count(I)) 6403 continue; 6404 6405 // For each VF find the maximum usage of registers. 6406 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6407 // Count the number of live intervals. 6408 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6409 6410 if (VFs[j].isScalar()) { 6411 for (auto Inst : OpenIntervals) { 6412 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6413 if (RegUsage.find(ClassID) == RegUsage.end()) 6414 RegUsage[ClassID] = 1; 6415 else 6416 RegUsage[ClassID] += 1; 6417 } 6418 } else { 6419 collectUniformsAndScalars(VFs[j]); 6420 for (auto Inst : OpenIntervals) { 6421 // Skip ignored values for VF > 1. 6422 if (VecValuesToIgnore.count(Inst)) 6423 continue; 6424 if (isScalarAfterVectorization(Inst, VFs[j])) { 6425 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6426 if (RegUsage.find(ClassID) == RegUsage.end()) 6427 RegUsage[ClassID] = 1; 6428 else 6429 RegUsage[ClassID] += 1; 6430 } else { 6431 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6432 if (RegUsage.find(ClassID) == RegUsage.end()) 6433 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6434 else 6435 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6436 } 6437 } 6438 } 6439 6440 for (auto& pair : RegUsage) { 6441 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6442 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6443 else 6444 MaxUsages[j][pair.first] = pair.second; 6445 } 6446 } 6447 6448 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6449 << OpenIntervals.size() << '\n'); 6450 6451 // Add the current instruction to the list of open intervals. 6452 OpenIntervals.insert(I); 6453 } 6454 6455 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6456 SmallMapVector<unsigned, unsigned, 4> Invariant; 6457 6458 for (auto Inst : LoopInvariants) { 6459 unsigned Usage = 6460 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6461 unsigned ClassID = 6462 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6463 if (Invariant.find(ClassID) == Invariant.end()) 6464 Invariant[ClassID] = Usage; 6465 else 6466 Invariant[ClassID] += Usage; 6467 } 6468 6469 LLVM_DEBUG({ 6470 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6471 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6472 << " item\n"; 6473 for (const auto &pair : MaxUsages[i]) { 6474 dbgs() << "LV(REG): RegisterClass: " 6475 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6476 << " registers\n"; 6477 } 6478 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6479 << " item\n"; 6480 for (const auto &pair : Invariant) { 6481 dbgs() << "LV(REG): RegisterClass: " 6482 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6483 << " registers\n"; 6484 } 6485 }); 6486 6487 RU.LoopInvariantRegs = Invariant; 6488 RU.MaxLocalUsers = MaxUsages[i]; 6489 RUs[i] = RU; 6490 } 6491 6492 return RUs; 6493 } 6494 6495 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6496 // TODO: Cost model for emulated masked load/store is completely 6497 // broken. This hack guides the cost model to use an artificially 6498 // high enough value to practically disable vectorization with such 6499 // operations, except where previously deployed legality hack allowed 6500 // using very low cost values. This is to avoid regressions coming simply 6501 // from moving "masked load/store" check from legality to cost model. 6502 // Masked Load/Gather emulation was previously never allowed. 6503 // Limited number of Masked Store/Scatter emulation was allowed. 6504 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6505 return isa<LoadInst>(I) || 6506 (isa<StoreInst>(I) && 6507 NumPredStores > NumberOfStoresToPredicate); 6508 } 6509 6510 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6511 // If we aren't vectorizing the loop, or if we've already collected the 6512 // instructions to scalarize, there's nothing to do. Collection may already 6513 // have occurred if we have a user-selected VF and are now computing the 6514 // expected cost for interleaving. 6515 if (VF.isScalar() || VF.isZero() || 6516 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6517 return; 6518 6519 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6520 // not profitable to scalarize any instructions, the presence of VF in the 6521 // map will indicate that we've analyzed it already. 6522 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6523 6524 // Find all the instructions that are scalar with predication in the loop and 6525 // determine if it would be better to not if-convert the blocks they are in. 6526 // If so, we also record the instructions to scalarize. 6527 for (BasicBlock *BB : TheLoop->blocks()) { 6528 if (!blockNeedsPredication(BB)) 6529 continue; 6530 for (Instruction &I : *BB) 6531 if (isScalarWithPredication(&I)) { 6532 ScalarCostsTy ScalarCosts; 6533 // Do not apply discount logic if hacked cost is needed 6534 // for emulated masked memrefs. 6535 if (!useEmulatedMaskMemRefHack(&I) && 6536 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6537 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6538 // Remember that BB will remain after vectorization. 
6539 PredicatedBBsAfterVectorization.insert(BB); 6540 } 6541 } 6542 } 6543 6544 int LoopVectorizationCostModel::computePredInstDiscount( 6545 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6546 assert(!isUniformAfterVectorization(PredInst, VF) && 6547 "Instruction marked uniform-after-vectorization will be predicated"); 6548 6549 // Initialize the discount to zero, meaning that the scalar version and the 6550 // vector version cost the same. 6551 InstructionCost Discount = 0; 6552 6553 // Holds instructions to analyze. The instructions we visit are mapped in 6554 // ScalarCosts. Those instructions are the ones that would be scalarized if 6555 // we find that the scalar version costs less. 6556 SmallVector<Instruction *, 8> Worklist; 6557 6558 // Returns true if the given instruction can be scalarized. 6559 auto canBeScalarized = [&](Instruction *I) -> bool { 6560 // We only attempt to scalarize instructions forming a single-use chain 6561 // from the original predicated block that would otherwise be vectorized. 6562 // Although not strictly necessary, we give up on instructions we know will 6563 // already be scalar to avoid traversing chains that are unlikely to be 6564 // beneficial. 6565 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6566 isScalarAfterVectorization(I, VF)) 6567 return false; 6568 6569 // If the instruction is scalar with predication, it will be analyzed 6570 // separately. We ignore it within the context of PredInst. 6571 if (isScalarWithPredication(I)) 6572 return false; 6573 6574 // If any of the instruction's operands are uniform after vectorization, 6575 // the instruction cannot be scalarized. This prevents, for example, a 6576 // masked load from being scalarized. 6577 // 6578 // We assume we will only emit a value for lane zero of an instruction 6579 // marked uniform after vectorization, rather than VF identical values. 6580 // Thus, if we scalarize an instruction that uses a uniform, we would 6581 // create uses of values corresponding to the lanes we aren't emitting code 6582 // for. This behavior can be changed by allowing getScalarValue to clone 6583 // the lane zero values for uniforms rather than asserting. 6584 for (Use &U : I->operands()) 6585 if (auto *J = dyn_cast<Instruction>(U.get())) 6586 if (isUniformAfterVectorization(J, VF)) 6587 return false; 6588 6589 // Otherwise, we can scalarize the instruction. 6590 return true; 6591 }; 6592 6593 // Compute the expected cost discount from scalarizing the entire expression 6594 // feeding the predicated instruction. We currently only consider expressions 6595 // that are single-use instruction chains. 6596 Worklist.push_back(PredInst); 6597 while (!Worklist.empty()) { 6598 Instruction *I = Worklist.pop_back_val(); 6599 6600 // If we've already analyzed the instruction, there's nothing to do. 6601 if (ScalarCosts.find(I) != ScalarCosts.end()) 6602 continue; 6603 6604 // Compute the cost of the vector instruction. Note that this cost already 6605 // includes the scalarization overhead of the predicated instruction. 6606 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6607 6608 // Compute the cost of the scalarized instruction. This cost is the cost of 6609 // the instruction as if it wasn't if-converted and instead remained in the 6610 // predicated block. We will scale this cost by block probability after 6611 // computing the scalarization overhead. 
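    // For example (illustrative numbers, assuming a 50% block probability):
    // with VF = 4, a vector cost of 12 and a per-lane scalar cost of 2, the
    // scalarized estimate starts at 4 * 2 = 8, is halved to 4 by the block
    // probability scaling below, and yields a discount of 12 - 4 = 8 in
    // favour of scalarization.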
6612 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6613 InstructionCost ScalarCost = 6614 VF.getKnownMinValue() * 6615 getInstructionCost(I, ElementCount::getFixed(1)).first; 6616 6617 // Compute the scalarization overhead of needed insertelement instructions 6618 // and phi nodes. 6619 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6620 ScalarCost += TTI.getScalarizationOverhead( 6621 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6622 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6623 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6624 ScalarCost += 6625 VF.getKnownMinValue() * 6626 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6627 } 6628 6629 // Compute the scalarization overhead of needed extractelement 6630 // instructions. For each of the instruction's operands, if the operand can 6631 // be scalarized, add it to the worklist; otherwise, account for the 6632 // overhead. 6633 for (Use &U : I->operands()) 6634 if (auto *J = dyn_cast<Instruction>(U.get())) { 6635 assert(VectorType::isValidElementType(J->getType()) && 6636 "Instruction has non-scalar type"); 6637 if (canBeScalarized(J)) 6638 Worklist.push_back(J); 6639 else if (needsExtract(J, VF)) { 6640 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6641 ScalarCost += TTI.getScalarizationOverhead( 6642 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6643 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6644 } 6645 } 6646 6647 // Scale the total scalar cost by block probability. 6648 ScalarCost /= getReciprocalPredBlockProb(); 6649 6650 // Compute the discount. A non-negative discount means the vector version 6651 // of the instruction costs more, and scalarizing would be beneficial. 6652 Discount += VectorCost - ScalarCost; 6653 ScalarCosts[I] = ScalarCost; 6654 } 6655 6656 return *Discount.getValue(); 6657 } 6658 6659 LoopVectorizationCostModel::VectorizationCostTy 6660 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6661 VectorizationCostTy Cost; 6662 6663 // For each block. 6664 for (BasicBlock *BB : TheLoop->blocks()) { 6665 VectorizationCostTy BlockCost; 6666 6667 // For each instruction in the old loop. 6668 for (Instruction &I : BB->instructionsWithoutDebug()) { 6669 // Skip ignored values. 6670 if (ValuesToIgnore.count(&I) || 6671 (VF.isVector() && VecValuesToIgnore.count(&I))) 6672 continue; 6673 6674 VectorizationCostTy C = getInstructionCost(&I, VF); 6675 6676 // Check if we should override the cost. 6677 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6678 C.first = InstructionCost(ForceTargetInstructionCost); 6679 6680 BlockCost.first += C.first; 6681 BlockCost.second |= C.second; 6682 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6683 << " for VF " << VF << " For instruction: " << I 6684 << '\n'); 6685 } 6686 6687 // If we are vectorizing a predicated block, it will have been 6688 // if-converted. This means that the block's instructions (aside from 6689 // stores and instructions that may divide by zero) will now be 6690 // unconditionally executed. For the scalar case, we may not always execute 6691 // the predicated block, if it is an if-else block. Thus, scale the block's 6692 // cost by the probability of executing it. blockNeedsPredication from 6693 // Legal is used so as to not include all blocks in tail folded loops. 
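    // Assuming the usual 50% estimate for the probability of executing a
    // predicated block, this halves the scalar cost contribution of such
    // blocks.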
6694 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6695 BlockCost.first /= getReciprocalPredBlockProb(); 6696 6697 Cost.first += BlockCost.first; 6698 Cost.second |= BlockCost.second; 6699 } 6700 6701 return Cost; 6702 } 6703 6704 /// Gets Address Access SCEV after verifying that the access pattern 6705 /// is loop invariant except the induction variable dependence. 6706 /// 6707 /// This SCEV can be sent to the Target in order to estimate the address 6708 /// calculation cost. 6709 static const SCEV *getAddressAccessSCEV( 6710 Value *Ptr, 6711 LoopVectorizationLegality *Legal, 6712 PredicatedScalarEvolution &PSE, 6713 const Loop *TheLoop) { 6714 6715 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6716 if (!Gep) 6717 return nullptr; 6718 6719 // We are looking for a gep with all loop invariant indices except for one 6720 // which should be an induction variable. 6721 auto SE = PSE.getSE(); 6722 unsigned NumOperands = Gep->getNumOperands(); 6723 for (unsigned i = 1; i < NumOperands; ++i) { 6724 Value *Opd = Gep->getOperand(i); 6725 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6726 !Legal->isInductionVariable(Opd)) 6727 return nullptr; 6728 } 6729 6730 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6731 return PSE.getSCEV(Ptr); 6732 } 6733 6734 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6735 return Legal->hasStride(I->getOperand(0)) || 6736 Legal->hasStride(I->getOperand(1)); 6737 } 6738 6739 InstructionCost 6740 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6741 ElementCount VF) { 6742 assert(VF.isVector() && 6743 "Scalarization cost of instruction implies vectorization."); 6744 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6745 Type *ValTy = getMemInstValueType(I); 6746 auto SE = PSE.getSE(); 6747 6748 unsigned AS = getLoadStoreAddressSpace(I); 6749 Value *Ptr = getLoadStorePointerOperand(I); 6750 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6751 6752 // Figure out whether the access is strided and get the stride value 6753 // if it's known in compile time 6754 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6755 6756 // Get the cost of the scalar memory instruction and address computation. 6757 InstructionCost Cost = 6758 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6759 6760 // Don't pass *I here, since it is scalar but will actually be part of a 6761 // vectorized loop where the user of it is a vectorized instruction. 6762 const Align Alignment = getLoadStoreAlignment(I); 6763 Cost += VF.getKnownMinValue() * 6764 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6765 AS, TTI::TCK_RecipThroughput); 6766 6767 // Get the overhead of the extractelement and insertelement instructions 6768 // we might create due to scalarization. 6769 Cost += getScalarizationOverhead(I, VF); 6770 6771 // If we have a predicated store, it may not be executed for each vector 6772 // lane. Scale the cost by the probability of executing the predicated 6773 // block. 6774 if (isPredicatedInst(I)) { 6775 Cost /= getReciprocalPredBlockProb(); 6776 6777 if (useEmulatedMaskMemRefHack(I)) 6778 // Artificially setting to a high enough value to practically disable 6779 // vectorization with such operations. 
6780 Cost = 3000000; 6781 } 6782 6783 return Cost; 6784 } 6785 6786 InstructionCost 6787 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6788 ElementCount VF) { 6789 Type *ValTy = getMemInstValueType(I); 6790 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6791 Value *Ptr = getLoadStorePointerOperand(I); 6792 unsigned AS = getLoadStoreAddressSpace(I); 6793 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6794 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6795 6796 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6797 "Stride should be 1 or -1 for consecutive memory access"); 6798 const Align Alignment = getLoadStoreAlignment(I); 6799 InstructionCost Cost = 0; 6800 if (Legal->isMaskRequired(I)) 6801 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6802 CostKind); 6803 else 6804 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6805 CostKind, I); 6806 6807 bool Reverse = ConsecutiveStride < 0; 6808 if (Reverse) 6809 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6810 return Cost; 6811 } 6812 6813 InstructionCost 6814 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6815 ElementCount VF) { 6816 assert(Legal->isUniformMemOp(*I)); 6817 6818 Type *ValTy = getMemInstValueType(I); 6819 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6820 const Align Alignment = getLoadStoreAlignment(I); 6821 unsigned AS = getLoadStoreAddressSpace(I); 6822 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6823 if (isa<LoadInst>(I)) { 6824 return TTI.getAddressComputationCost(ValTy) + 6825 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6826 CostKind) + 6827 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6828 } 6829 StoreInst *SI = cast<StoreInst>(I); 6830 6831 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6832 return TTI.getAddressComputationCost(ValTy) + 6833 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6834 CostKind) + 6835 (isLoopInvariantStoreValue 6836 ? 0 6837 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6838 VF.getKnownMinValue() - 1)); 6839 } 6840 6841 InstructionCost 6842 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6843 ElementCount VF) { 6844 Type *ValTy = getMemInstValueType(I); 6845 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6846 const Align Alignment = getLoadStoreAlignment(I); 6847 const Value *Ptr = getLoadStorePointerOperand(I); 6848 6849 return TTI.getAddressComputationCost(VectorTy) + 6850 TTI.getGatherScatterOpCost( 6851 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6852 TargetTransformInfo::TCK_RecipThroughput, I); 6853 } 6854 6855 InstructionCost 6856 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6857 ElementCount VF) { 6858 // TODO: Once we have support for interleaving with scalable vectors 6859 // we can calculate the cost properly here. 
6860 if (VF.isScalable()) 6861 return InstructionCost::getInvalid(); 6862 6863 Type *ValTy = getMemInstValueType(I); 6864 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6865 unsigned AS = getLoadStoreAddressSpace(I); 6866 6867 auto Group = getInterleavedAccessGroup(I); 6868 assert(Group && "Fail to get an interleaved access group."); 6869 6870 unsigned InterleaveFactor = Group->getFactor(); 6871 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6872 6873 // Holds the indices of existing members in an interleaved load group. 6874 // An interleaved store group doesn't need this as it doesn't allow gaps. 6875 SmallVector<unsigned, 4> Indices; 6876 if (isa<LoadInst>(I)) { 6877 for (unsigned i = 0; i < InterleaveFactor; i++) 6878 if (Group->getMember(i)) 6879 Indices.push_back(i); 6880 } 6881 6882 // Calculate the cost of the whole interleaved group. 6883 bool UseMaskForGaps = 6884 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6885 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6886 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6887 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6888 6889 if (Group->isReverse()) { 6890 // TODO: Add support for reversed masked interleaved access. 6891 assert(!Legal->isMaskRequired(I) && 6892 "Reverse masked interleaved access not supported."); 6893 Cost += Group->getNumMembers() * 6894 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6895 } 6896 return Cost; 6897 } 6898 6899 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 6900 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6901 // Early exit for no inloop reductions 6902 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6903 return InstructionCost::getInvalid(); 6904 auto *VectorTy = cast<VectorType>(Ty); 6905 6906 // We are looking for a pattern of, and finding the minimal acceptable cost: 6907 // reduce(mul(ext(A), ext(B))) or 6908 // reduce(mul(A, B)) or 6909 // reduce(ext(A)) or 6910 // reduce(A). 6911 // The basic idea is that we walk down the tree to do that, finding the root 6912 // reduction instruction in InLoopReductionImmediateChains. From there we find 6913 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6914 // of the components. If the reduction cost is lower then we return it for the 6915 // reduction instruction and 0 for the other instructions in the pattern. If 6916 // it is not we return an invalid cost specifying the orignal cost method 6917 // should be used. 6918 Instruction *RetI = I; 6919 if ((RetI->getOpcode() == Instruction::SExt || 6920 RetI->getOpcode() == Instruction::ZExt)) { 6921 if (!RetI->hasOneUser()) 6922 return InstructionCost::getInvalid(); 6923 RetI = RetI->user_back(); 6924 } 6925 if (RetI->getOpcode() == Instruction::Mul && 6926 RetI->user_back()->getOpcode() == Instruction::Add) { 6927 if (!RetI->hasOneUser()) 6928 return InstructionCost::getInvalid(); 6929 RetI = RetI->user_back(); 6930 } 6931 6932 // Test if the found instruction is a reduction, and if not return an invalid 6933 // cost specifying the parent to use the original cost modelling. 6934 if (!InLoopReductionImmediateChains.count(RetI)) 6935 return InstructionCost::getInvalid(); 6936 6937 // Find the reduction this chain is a part of and calculate the basic cost of 6938 // the reduction on its own. 
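  // For instance, given the illustrative in-loop chain
  //   %ea  = sext i8 %a to i32
  //   %eb  = sext i8 %b to i32
  //   %mul = mul i32 %ea, %eb
  //   %add = add i32 %mul, %rdx.phi
  // the code below walks from %add back to the reduction phi and then compares
  // the target's extended multiply-accumulate reduction cost against the sum
  // of the individual ext/mul/reduction costs.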
6939 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6940 Instruction *ReductionPhi = LastChain; 6941 while (!isa<PHINode>(ReductionPhi)) 6942 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6943 6944 RecurrenceDescriptor RdxDesc = 6945 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6946 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6947 VectorTy, false, CostKind); 6948 6949 // Get the operand that was not the reduction chain and match it to one of the 6950 // patterns, returning the better cost if it is found. 6951 Instruction *RedOp = RetI->getOperand(1) == LastChain 6952 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6953 : dyn_cast<Instruction>(RetI->getOperand(1)); 6954 6955 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6956 6957 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6958 !TheLoop->isLoopInvariant(RedOp)) { 6959 bool IsUnsigned = isa<ZExtInst>(RedOp); 6960 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6961 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6962 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6963 CostKind); 6964 6965 unsigned ExtCost = 6966 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6967 TTI::CastContextHint::None, CostKind, RedOp); 6968 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6969 return I == RetI ? *RedCost.getValue() : 0; 6970 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 6971 Instruction *Mul = RedOp; 6972 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 6973 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 6974 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 6975 Op0->getOpcode() == Op1->getOpcode() && 6976 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6977 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6978 bool IsUnsigned = isa<ZExtInst>(Op0); 6979 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6980 // reduce(mul(ext, ext)) 6981 unsigned ExtCost = 6982 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 6983 TTI::CastContextHint::None, CostKind, Op0); 6984 InstructionCost MulCost = 6985 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6986 6987 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6988 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6989 CostKind); 6990 6991 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 6992 return I == RetI ? *RedCost.getValue() : 0; 6993 } else { 6994 InstructionCost MulCost = 6995 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6996 6997 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6998 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6999 CostKind); 7000 7001 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7002 return I == RetI ? *RedCost.getValue() : 0; 7003 } 7004 } 7005 7006 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7007 } 7008 7009 InstructionCost 7010 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7011 ElementCount VF) { 7012 // Calculate scalar cost only. Vectorization cost should be ready at this 7013 // moment. 
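  // For VF > 1 the widening decision (widen, interleave, gather/scatter or
  // scalarize) and its cost were already recorded by
  // setCostBasedWideningDecision, so only the plain scalar cost is computed
  // here, e.g. one address computation plus one scalar load/store.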
7014 if (VF.isScalar()) { 7015 Type *ValTy = getMemInstValueType(I); 7016 const Align Alignment = getLoadStoreAlignment(I); 7017 unsigned AS = getLoadStoreAddressSpace(I); 7018 7019 return TTI.getAddressComputationCost(ValTy) + 7020 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7021 TTI::TCK_RecipThroughput, I); 7022 } 7023 return getWideningCost(I, VF); 7024 } 7025 7026 LoopVectorizationCostModel::VectorizationCostTy 7027 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7028 ElementCount VF) { 7029 // If we know that this instruction will remain uniform, check the cost of 7030 // the scalar version. 7031 if (isUniformAfterVectorization(I, VF)) 7032 VF = ElementCount::getFixed(1); 7033 7034 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7035 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7036 7037 // Forced scalars do not have any scalarization overhead. 7038 auto ForcedScalar = ForcedScalars.find(VF); 7039 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7040 auto InstSet = ForcedScalar->second; 7041 if (InstSet.count(I)) 7042 return VectorizationCostTy( 7043 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7044 VF.getKnownMinValue()), 7045 false); 7046 } 7047 7048 Type *VectorTy; 7049 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7050 7051 bool TypeNotScalarized = 7052 VF.isVector() && VectorTy->isVectorTy() && 7053 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7054 return VectorizationCostTy(C, TypeNotScalarized); 7055 } 7056 7057 InstructionCost 7058 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7059 ElementCount VF) { 7060 7061 if (VF.isScalable()) 7062 return InstructionCost::getInvalid(); 7063 7064 if (VF.isScalar()) 7065 return 0; 7066 7067 InstructionCost Cost = 0; 7068 Type *RetTy = ToVectorTy(I->getType(), VF); 7069 if (!RetTy->isVoidTy() && 7070 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7071 Cost += TTI.getScalarizationOverhead( 7072 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7073 true, false); 7074 7075 // Some targets keep addresses scalar. 7076 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7077 return Cost; 7078 7079 // Some targets support efficient element stores. 7080 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7081 return Cost; 7082 7083 // Collect operands to consider. 7084 CallInst *CI = dyn_cast<CallInst>(I); 7085 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7086 7087 // Skip operands that do not require extraction/scalarization and do not incur 7088 // any overhead. 7089 SmallVector<Type *> Tys; 7090 for (auto *V : filterExtractingOperands(Ops, VF)) 7091 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7092 return Cost + TTI.getOperandsScalarizationOverhead( 7093 filterExtractingOperands(Ops, VF), Tys); 7094 } 7095 7096 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7097 if (VF.isScalar()) 7098 return; 7099 NumPredStores = 0; 7100 for (BasicBlock *BB : TheLoop->blocks()) { 7101 // For each instruction in the old loop. 7102 for (Instruction &I : *BB) { 7103 Value *Ptr = getLoadStorePointerOperand(&I); 7104 if (!Ptr) 7105 continue; 7106 7107 // TODO: We should generate better code and update the cost model for 7108 // predicated uniform stores. Today they are treated as any other 7109 // predicated store (see added test cases in 7110 // invariant-store-vectorization.ll). 
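      // Such a store typically comes from source like
      //   if (cond[i]) *p = s;   // p does not depend on i
      // i.e. the address is loop-invariant but the store itself is guarded.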
7111 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7112 NumPredStores++; 7113 7114 if (Legal->isUniformMemOp(I)) { 7115 // TODO: Avoid replicating loads and stores instead of 7116 // relying on instcombine to remove them. 7117 // Load: Scalar load + broadcast 7118 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7119 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7120 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7121 continue; 7122 } 7123 7124 // We assume that widening is the best solution when possible. 7125 if (memoryInstructionCanBeWidened(&I, VF)) { 7126 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7127 int ConsecutiveStride = 7128 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7129 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7130 "Expected consecutive stride."); 7131 InstWidening Decision = 7132 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7133 setWideningDecision(&I, VF, Decision, Cost); 7134 continue; 7135 } 7136 7137 // Choose between Interleaving, Gather/Scatter or Scalarization. 7138 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7139 unsigned NumAccesses = 1; 7140 if (isAccessInterleaved(&I)) { 7141 auto Group = getInterleavedAccessGroup(&I); 7142 assert(Group && "Fail to get an interleaved access group."); 7143 7144 // Make one decision for the whole group. 7145 if (getWideningDecision(&I, VF) != CM_Unknown) 7146 continue; 7147 7148 NumAccesses = Group->getNumMembers(); 7149 if (interleavedAccessCanBeWidened(&I, VF)) 7150 InterleaveCost = getInterleaveGroupCost(&I, VF); 7151 } 7152 7153 InstructionCost GatherScatterCost = 7154 isLegalGatherOrScatter(&I) 7155 ? getGatherScatterCost(&I, VF) * NumAccesses 7156 : InstructionCost::getInvalid(); 7157 7158 InstructionCost ScalarizationCost = 7159 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses 7160 : InstructionCost::getInvalid(); 7161 7162 // Choose better solution for the current VF, 7163 // write down this decision and use it during vectorization. 7164 InstructionCost Cost; 7165 InstWidening Decision; 7166 if (InterleaveCost <= GatherScatterCost && 7167 InterleaveCost < ScalarizationCost) { 7168 Decision = CM_Interleave; 7169 Cost = InterleaveCost; 7170 } else if (GatherScatterCost < ScalarizationCost) { 7171 Decision = CM_GatherScatter; 7172 Cost = GatherScatterCost; 7173 } else { 7174 assert(!VF.isScalable() && 7175 "We cannot yet scalarise for scalable vectors"); 7176 Decision = CM_Scalarize; 7177 Cost = ScalarizationCost; 7178 } 7179 // If the instructions belongs to an interleave group, the whole group 7180 // receives the same decision. The whole group receives the cost, but 7181 // the cost will actually be assigned to one instruction. 7182 if (auto Group = getInterleavedAccessGroup(&I)) 7183 setWideningDecision(Group, VF, Decision, Cost); 7184 else 7185 setWideningDecision(&I, VF, Decision, Cost); 7186 } 7187 } 7188 7189 // Make sure that any load of address and any other address computation 7190 // remains scalar unless there is gather/scatter support. This avoids 7191 // inevitable extracts into address registers, and also has the benefit of 7192 // activating LSR more, since that pass can't optimize vectorized 7193 // addresses. 7194 if (TTI.prefersVectorizedAddressing()) 7195 return; 7196 7197 // Start with all scalar pointer uses. 
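  // For example, if one load produces the address of another:
  //   %p = load i32*, i32** %q
  //   %v = load i32, i32* %p
  // the first load is forced to stay scalar below so that its result can feed
  // the scalar address of the second load without per-lane extracts.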
7198 SmallPtrSet<Instruction *, 8> AddrDefs; 7199 for (BasicBlock *BB : TheLoop->blocks()) 7200 for (Instruction &I : *BB) { 7201 Instruction *PtrDef = 7202 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7203 if (PtrDef && TheLoop->contains(PtrDef) && 7204 getWideningDecision(&I, VF) != CM_GatherScatter) 7205 AddrDefs.insert(PtrDef); 7206 } 7207 7208 // Add all instructions used to generate the addresses. 7209 SmallVector<Instruction *, 4> Worklist; 7210 append_range(Worklist, AddrDefs); 7211 while (!Worklist.empty()) { 7212 Instruction *I = Worklist.pop_back_val(); 7213 for (auto &Op : I->operands()) 7214 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7215 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7216 AddrDefs.insert(InstOp).second) 7217 Worklist.push_back(InstOp); 7218 } 7219 7220 for (auto *I : AddrDefs) { 7221 if (isa<LoadInst>(I)) { 7222 // Setting the desired widening decision should ideally be handled in 7223 // by cost functions, but since this involves the task of finding out 7224 // if the loaded register is involved in an address computation, it is 7225 // instead changed here when we know this is the case. 7226 InstWidening Decision = getWideningDecision(I, VF); 7227 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7228 // Scalarize a widened load of address. 7229 setWideningDecision( 7230 I, VF, CM_Scalarize, 7231 (VF.getKnownMinValue() * 7232 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7233 else if (auto Group = getInterleavedAccessGroup(I)) { 7234 // Scalarize an interleave group of address loads. 7235 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7236 if (Instruction *Member = Group->getMember(I)) 7237 setWideningDecision( 7238 Member, VF, CM_Scalarize, 7239 (VF.getKnownMinValue() * 7240 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7241 } 7242 } 7243 } else 7244 // Make sure I gets scalarized and a cost estimate without 7245 // scalarization overhead. 7246 ForcedScalars[VF].insert(I); 7247 } 7248 } 7249 7250 InstructionCost 7251 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7252 Type *&VectorTy) { 7253 Type *RetTy = I->getType(); 7254 if (canTruncateToMinimalBitwidth(I, VF)) 7255 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7256 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7257 auto SE = PSE.getSE(); 7258 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7259 7260 // TODO: We need to estimate the cost of intrinsic calls. 7261 switch (I->getOpcode()) { 7262 case Instruction::GetElementPtr: 7263 // We mark this instruction as zero-cost because the cost of GEPs in 7264 // vectorized code depends on whether the corresponding memory instruction 7265 // is scalarized or not. Therefore, we handle GEPs with the memory 7266 // instruction cost. 7267 return 0; 7268 case Instruction::Br: { 7269 // In cases of scalarized and predicated instructions, there will be VF 7270 // predicated blocks in the vectorized loop. Each branch around these 7271 // blocks requires also an extract of its vector compare i1 element. 
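    // With VF = 4, for example, this models four extracts from the <4 x i1>
    // compare plus four scalar branches, which is what the scalarization
    // overhead and getCFInstrCost terms below add up to.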
7272 bool ScalarPredicatedBB = false; 7273 BranchInst *BI = cast<BranchInst>(I); 7274 if (VF.isVector() && BI->isConditional() && 7275 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7276 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7277 ScalarPredicatedBB = true; 7278 7279 if (ScalarPredicatedBB) { 7280 // Return cost for branches around scalarized and predicated blocks. 7281 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7282 auto *Vec_i1Ty = 7283 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7284 return (TTI.getScalarizationOverhead( 7285 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7286 false, true) + 7287 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7288 VF.getKnownMinValue())); 7289 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7290 // The back-edge branch will remain, as will all scalar branches. 7291 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7292 else 7293 // This branch will be eliminated by if-conversion. 7294 return 0; 7295 // Note: We currently assume zero cost for an unconditional branch inside 7296 // a predicated block since it will become a fall-through, although we 7297 // may decide in the future to call TTI for all branches. 7298 } 7299 case Instruction::PHI: { 7300 auto *Phi = cast<PHINode>(I); 7301 7302 // First-order recurrences are replaced by vector shuffles inside the loop. 7303 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7304 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7305 return TTI.getShuffleCost( 7306 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7307 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7308 7309 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7310 // converted into select instructions. We require N - 1 selects per phi 7311 // node, where N is the number of incoming values. 7312 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7313 return (Phi->getNumIncomingValues() - 1) * 7314 TTI.getCmpSelInstrCost( 7315 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7316 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7317 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7318 7319 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7320 } 7321 case Instruction::UDiv: 7322 case Instruction::SDiv: 7323 case Instruction::URem: 7324 case Instruction::SRem: 7325 // If we have a predicated instruction, it may not be executed for each 7326 // vector lane. Get the scalarization cost and scale this amount by the 7327 // probability of executing the predicated block. If the instruction is not 7328 // predicated, we fall through to the next case. 7329 if (VF.isVector() && isScalarWithPredication(I)) { 7330 InstructionCost Cost = 0; 7331 7332 // These instructions have a non-void type, so account for the phi nodes 7333 // that we will create. This cost is likely to be zero. The phi node 7334 // cost, if any, should be scaled by the block probability because it 7335 // models a copy at the end of each predicated block. 7336 Cost += VF.getKnownMinValue() * 7337 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7338 7339 // The cost of the non-predicated instruction. 7340 Cost += VF.getKnownMinValue() * 7341 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7342 7343 // The cost of insertelement and extractelement instructions needed for 7344 // scalarization. 
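      // (With VF = 4, for example, the total comes to roughly 4 phi copies +
      // 4 scalar divides + this insert/extract overhead, all scaled down by
      // the division below, assuming the default 50% block-execution
      // probability.)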
7345 Cost += getScalarizationOverhead(I, VF); 7346 7347 // Scale the cost by the probability of executing the predicated blocks. 7348 // This assumes the predicated block for each vector lane is equally 7349 // likely. 7350 return Cost / getReciprocalPredBlockProb(); 7351 } 7352 LLVM_FALLTHROUGH; 7353 case Instruction::Add: 7354 case Instruction::FAdd: 7355 case Instruction::Sub: 7356 case Instruction::FSub: 7357 case Instruction::Mul: 7358 case Instruction::FMul: 7359 case Instruction::FDiv: 7360 case Instruction::FRem: 7361 case Instruction::Shl: 7362 case Instruction::LShr: 7363 case Instruction::AShr: 7364 case Instruction::And: 7365 case Instruction::Or: 7366 case Instruction::Xor: { 7367 // Since we will replace the stride by 1 the multiplication should go away. 7368 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7369 return 0; 7370 7371 // Detect reduction patterns 7372 InstructionCost RedCost; 7373 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7374 .isValid()) 7375 return RedCost; 7376 7377 // Certain instructions can be cheaper to vectorize if they have a constant 7378 // second vector operand. One example of this are shifts on x86. 7379 Value *Op2 = I->getOperand(1); 7380 TargetTransformInfo::OperandValueProperties Op2VP; 7381 TargetTransformInfo::OperandValueKind Op2VK = 7382 TTI.getOperandInfo(Op2, Op2VP); 7383 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7384 Op2VK = TargetTransformInfo::OK_UniformValue; 7385 7386 SmallVector<const Value *, 4> Operands(I->operand_values()); 7387 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7388 return N * TTI.getArithmeticInstrCost( 7389 I->getOpcode(), VectorTy, CostKind, 7390 TargetTransformInfo::OK_AnyValue, 7391 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7392 } 7393 case Instruction::FNeg: { 7394 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7395 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7396 return N * TTI.getArithmeticInstrCost( 7397 I->getOpcode(), VectorTy, CostKind, 7398 TargetTransformInfo::OK_AnyValue, 7399 TargetTransformInfo::OK_AnyValue, 7400 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7401 I->getOperand(0), I); 7402 } 7403 case Instruction::Select: { 7404 SelectInst *SI = cast<SelectInst>(I); 7405 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7406 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7407 Type *CondTy = SI->getCondition()->getType(); 7408 if (!ScalarCond) 7409 CondTy = VectorType::get(CondTy, VF); 7410 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7411 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7412 } 7413 case Instruction::ICmp: 7414 case Instruction::FCmp: { 7415 Type *ValTy = I->getOperand(0)->getType(); 7416 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7417 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7418 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7419 VectorTy = ToVectorTy(ValTy, VF); 7420 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7421 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7422 } 7423 case Instruction::Store: 7424 case Instruction::Load: { 7425 ElementCount Width = VF; 7426 if (Width.isVector()) { 7427 InstWidening Decision = getWideningDecision(I, Width); 7428 assert(Decision != CM_Unknown && 7429 "CM decision should be taken at this point"); 7430 if (Decision == CM_Scalarize) 7431 Width = ElementCount::getFixed(1); 7432 } 7433 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7434 return getMemoryInstructionCost(I, VF); 7435 } 7436 case Instruction::ZExt: 7437 case Instruction::SExt: 7438 case Instruction::FPToUI: 7439 case Instruction::FPToSI: 7440 case Instruction::FPExt: 7441 case Instruction::PtrToInt: 7442 case Instruction::IntToPtr: 7443 case Instruction::SIToFP: 7444 case Instruction::UIToFP: 7445 case Instruction::Trunc: 7446 case Instruction::FPTrunc: 7447 case Instruction::BitCast: { 7448 // Computes the CastContextHint from a Load/Store instruction. 7449 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7450 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7451 "Expected a load or a store!"); 7452 7453 if (VF.isScalar() || !TheLoop->contains(I)) 7454 return TTI::CastContextHint::Normal; 7455 7456 switch (getWideningDecision(I, VF)) { 7457 case LoopVectorizationCostModel::CM_GatherScatter: 7458 return TTI::CastContextHint::GatherScatter; 7459 case LoopVectorizationCostModel::CM_Interleave: 7460 return TTI::CastContextHint::Interleave; 7461 case LoopVectorizationCostModel::CM_Scalarize: 7462 case LoopVectorizationCostModel::CM_Widen: 7463 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7464 : TTI::CastContextHint::Normal; 7465 case LoopVectorizationCostModel::CM_Widen_Reverse: 7466 return TTI::CastContextHint::Reversed; 7467 case LoopVectorizationCostModel::CM_Unknown: 7468 llvm_unreachable("Instr did not go through cost modelling?"); 7469 } 7470 7471 llvm_unreachable("Unhandled case!"); 7472 }; 7473 7474 unsigned Opcode = I->getOpcode(); 7475 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7476 // For Trunc, the context is the only user, which must be a StoreInst. 
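    // For example, an illustrative pair
    //   %t = trunc i32 %x to i16
    //   store i16 %t, i16* %p
    // is costed with the hint derived from the store's widening decision
    // (Normal, Masked, Reversed, ...), letting targets price it as a
    // truncating store.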
7477 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7478 if (I->hasOneUse()) 7479 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7480 CCH = ComputeCCH(Store); 7481 } 7482 // For Z/Sext, the context is the operand, which must be a LoadInst. 7483 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7484 Opcode == Instruction::FPExt) { 7485 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7486 CCH = ComputeCCH(Load); 7487 } 7488 7489 // We optimize the truncation of induction variables having constant 7490 // integer steps. The cost of these truncations is the same as the scalar 7491 // operation. 7492 if (isOptimizableIVTruncate(I, VF)) { 7493 auto *Trunc = cast<TruncInst>(I); 7494 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7495 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7496 } 7497 7498 // Detect reduction patterns 7499 InstructionCost RedCost; 7500 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7501 .isValid()) 7502 return RedCost; 7503 7504 Type *SrcScalarTy = I->getOperand(0)->getType(); 7505 Type *SrcVecTy = 7506 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7507 if (canTruncateToMinimalBitwidth(I, VF)) { 7508 // This cast is going to be shrunk. This may remove the cast or it might 7509 // turn it into slightly different cast. For example, if MinBW == 16, 7510 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7511 // 7512 // Calculate the modified src and dest types. 7513 Type *MinVecTy = VectorTy; 7514 if (Opcode == Instruction::Trunc) { 7515 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7516 VectorTy = 7517 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7518 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7519 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7520 VectorTy = 7521 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7522 } 7523 } 7524 7525 unsigned N; 7526 if (isScalarAfterVectorization(I, VF)) { 7527 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7528 N = VF.getKnownMinValue(); 7529 } else 7530 N = 1; 7531 return N * 7532 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7533 } 7534 case Instruction::Call: { 7535 bool NeedToScalarize; 7536 CallInst *CI = cast<CallInst>(I); 7537 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7538 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7539 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7540 return std::min(CallCost, IntrinsicCost); 7541 } 7542 return CallCost; 7543 } 7544 case Instruction::ExtractValue: 7545 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7546 default: 7547 // The cost of executing VF copies of the scalar instruction. This opcode 7548 // is unknown. Assume that it is the same as 'mul'. 7549 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7550 Instruction::Mul, VectorTy, CostKind) + 7551 getScalarizationOverhead(I, VF); 7552 } // end of switch. 
7553 } 7554 7555 char LoopVectorize::ID = 0; 7556 7557 static const char lv_name[] = "Loop Vectorization"; 7558 7559 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7560 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7561 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7562 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7563 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7564 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7565 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7566 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7567 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7568 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7569 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7570 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7571 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7572 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7573 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7574 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7575 7576 namespace llvm { 7577 7578 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7579 7580 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7581 bool VectorizeOnlyWhenForced) { 7582 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7583 } 7584 7585 } // end namespace llvm 7586 7587 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7588 // Check if the pointer operand of a load or store instruction is 7589 // consecutive. 7590 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7591 return Legal->isConsecutivePtr(Ptr); 7592 return false; 7593 } 7594 7595 void LoopVectorizationCostModel::collectValuesToIgnore() { 7596 // Ignore ephemeral values. 7597 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7598 7599 // Ignore type-promoting instructions we identified during reduction 7600 // detection. 7601 for (auto &Reduction : Legal->getReductionVars()) { 7602 RecurrenceDescriptor &RedDes = Reduction.second; 7603 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7604 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7605 } 7606 // Ignore type-casting instructions we identified during induction 7607 // detection. 7608 for (auto &Induction : Legal->getInductionVars()) { 7609 InductionDescriptor &IndDes = Induction.second; 7610 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7611 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7612 } 7613 } 7614 7615 void LoopVectorizationCostModel::collectInLoopReductions() { 7616 for (auto &Reduction : Legal->getReductionVars()) { 7617 PHINode *Phi = Reduction.first; 7618 RecurrenceDescriptor &RdxDesc = Reduction.second; 7619 7620 // We don't collect reductions that are type promoted (yet). 7621 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7622 continue; 7623 7624 // If the target would prefer this reduction to happen "in-loop", then we 7625 // want to record it as such. 7626 unsigned Opcode = RdxDesc.getOpcode(); 7627 if (!PreferInLoopReductions && 7628 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7629 TargetTransformInfo::ReductionFlags())) 7630 continue; 7631 7632 // Check that we can correctly put the reductions into the loop, by 7633 // finding the chain of operations that leads from the phi to the loop 7634 // exit value. 
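    // For an integer add reduction this chain is typically just
    //   %rdx.phi = phi i32 [ 0, %preheader ], [ %sum, %latch ]
    //   ...
    //   %sum     = add i32 %rdx.phi, %val
    // i.e. a single add per iteration that can be kept in the loop as an
    // in-loop reduction.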
7635 SmallVector<Instruction *, 4> ReductionOperations = 7636 RdxDesc.getReductionOpChain(Phi, TheLoop); 7637 bool InLoop = !ReductionOperations.empty(); 7638 if (InLoop) { 7639 InLoopReductionChains[Phi] = ReductionOperations; 7640 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7641 Instruction *LastChain = Phi; 7642 for (auto *I : ReductionOperations) { 7643 InLoopReductionImmediateChains[I] = LastChain; 7644 LastChain = I; 7645 } 7646 } 7647 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7648 << " reduction for phi: " << *Phi << "\n"); 7649 } 7650 } 7651 7652 // TODO: we could return a pair of values that specify the max VF and 7653 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7654 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7655 // doesn't have a cost model that can choose which plan to execute if 7656 // more than one is generated. 7657 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7658 LoopVectorizationCostModel &CM) { 7659 unsigned WidestType; 7660 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7661 return WidestVectorRegBits / WidestType; 7662 } 7663 7664 VectorizationFactor 7665 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7666 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7667 ElementCount VF = UserVF; 7668 // Outer loop handling: They may require CFG and instruction level 7669 // transformations before even evaluating whether vectorization is profitable. 7670 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7671 // the vectorization pipeline. 7672 if (!OrigLoop->isInnermost()) { 7673 // If the user doesn't provide a vectorization factor, determine a 7674 // reasonable one. 7675 if (UserVF.isZero()) { 7676 VF = ElementCount::getFixed( 7677 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 7678 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7679 7680 // Make sure we have a VF > 1 for stress testing. 7681 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7682 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7683 << "overriding computed VF.\n"); 7684 VF = ElementCount::getFixed(4); 7685 } 7686 } 7687 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7688 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7689 "VF needs to be a power of two"); 7690 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7691 << "VF " << VF << " to build VPlans.\n"); 7692 buildVPlans(VF, VF); 7693 7694 // For VPlan build stress testing, we bail out after VPlan construction. 7695 if (VPlanBuildStressTest) 7696 return VectorizationFactor::Disabled(); 7697 7698 return {VF, 0 /*Cost*/}; 7699 } 7700 7701 LLVM_DEBUG( 7702 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7703 "VPlan-native path.\n"); 7704 return VectorizationFactor::Disabled(); 7705 } 7706 7707 Optional<VectorizationFactor> 7708 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7709 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7710 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7711 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7712 return None; 7713 7714 // Invalidate interleave groups if all blocks of loop will be predicated. 
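  // Folding the tail by masking predicates every block of the loop, and an
  // interleaved access executed under a mask needs masked-interleaved support
  // from the target (for example to handle gaps in a load group safely);
  // without that support the groups are dropped here.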
7715 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7716 !useMaskedInterleavedAccesses(*TTI)) { 7717 LLVM_DEBUG( 7718 dbgs() 7719 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7720 "which requires masked-interleaved support.\n"); 7721 if (CM.InterleaveInfo.invalidateGroups()) 7722 // Invalidating interleave groups also requires invalidating all decisions 7723 // based on them, which includes widening decisions and uniform and scalar 7724 // values. 7725 CM.invalidateCostModelingDecisions(); 7726 } 7727 7728 ElementCount MaxVF = MaybeMaxVF.getValue(); 7729 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7730 7731 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7732 if (!UserVF.isZero() && 7733 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7734 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7735 // VFs here, this should be reverted to only use legal UserVFs once the 7736 // loop below supports scalable VFs. 7737 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7738 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7739 << " VF " << VF << ".\n"); 7740 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7741 "VF needs to be a power of two"); 7742 // Collect the instructions (and their associated costs) that will be more 7743 // profitable to scalarize. 7744 CM.selectUserVectorizationFactor(VF); 7745 CM.collectInLoopReductions(); 7746 buildVPlansWithVPRecipes(VF, VF); 7747 LLVM_DEBUG(printPlans(dbgs())); 7748 return {{VF, 0}}; 7749 } 7750 7751 assert(!MaxVF.isScalable() && 7752 "Scalable vectors not yet supported beyond this point"); 7753 7754 for (ElementCount VF = ElementCount::getFixed(1); 7755 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7756 // Collect Uniform and Scalar instructions after vectorization with VF. 7757 CM.collectUniformsAndScalars(VF); 7758 7759 // Collect the instructions (and their associated costs) that will be more 7760 // profitable to scalarize. 7761 if (VF.isVector()) 7762 CM.collectInstsToScalarize(VF); 7763 } 7764 7765 CM.collectInLoopReductions(); 7766 7767 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7768 LLVM_DEBUG(printPlans(dbgs())); 7769 if (MaxVF.isScalar()) 7770 return VectorizationFactor::Disabled(); 7771 7772 // Select the optimal vectorization factor. 7773 return CM.selectVectorizationFactor(MaxVF); 7774 } 7775 7776 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7777 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7778 << '\n'); 7779 BestVF = VF; 7780 BestUF = UF; 7781 7782 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7783 return !Plan->hasVF(VF); 7784 }); 7785 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7786 } 7787 7788 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7789 DominatorTree *DT) { 7790 // Perform the actual loop transformation. 7791 7792 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
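  //    The skeleton created in this step is roughly:
  //      iteration-count check -> SCEV/memory runtime checks -> vector.ph ->
  //      (empty) vector body -> middle block -> scalar preheader -> scalar
  //      loop, with the bypass edges of the checks branching directly to the
  //      scalar preheader.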
7793 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7794 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7795 7796 VPTransformState State{ 7797 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 7798 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7799 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7800 State.CanonicalIV = ILV.Induction; 7801 7802 ILV.printDebugTracesAtStart(); 7803 7804 //===------------------------------------------------===// 7805 // 7806 // Notice: any optimization or new instruction that go 7807 // into the code below should also be implemented in 7808 // the cost-model. 7809 // 7810 //===------------------------------------------------===// 7811 7812 // 2. Copy and widen instructions from the old loop into the new loop. 7813 VPlans.front()->execute(&State); 7814 7815 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7816 // predication, updating analyses. 7817 ILV.fixVectorizedLoop(State); 7818 7819 ILV.printDebugTracesAtEnd(); 7820 } 7821 7822 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7823 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7824 7825 // We create new control-flow for the vectorized loop, so the original exit 7826 // conditions will be dead after vectorization if it's only used by the 7827 // terminator 7828 SmallVector<BasicBlock*> ExitingBlocks; 7829 OrigLoop->getExitingBlocks(ExitingBlocks); 7830 for (auto *BB : ExitingBlocks) { 7831 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7832 if (!Cmp || !Cmp->hasOneUse()) 7833 continue; 7834 7835 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7836 if (!DeadInstructions.insert(Cmp).second) 7837 continue; 7838 7839 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7840 // TODO: can recurse through operands in general 7841 for (Value *Op : Cmp->operands()) { 7842 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7843 DeadInstructions.insert(cast<Instruction>(Op)); 7844 } 7845 } 7846 7847 // We create new "steps" for induction variable updates to which the original 7848 // induction variables map. An original update instruction will be dead if 7849 // all its users except the induction variable are dead. 7850 auto *Latch = OrigLoop->getLoopLatch(); 7851 for (auto &Induction : Legal->getInductionVars()) { 7852 PHINode *Ind = Induction.first; 7853 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7854 7855 // If the tail is to be folded by masking, the primary induction variable, 7856 // if exists, isn't dead: it will be used for masking. Don't kill it. 7857 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7858 continue; 7859 7860 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7861 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7862 })) 7863 DeadInstructions.insert(IndUpdate); 7864 7865 // We record as "Dead" also the type-casting instructions we had identified 7866 // during induction analysis. We don't need any handling for them in the 7867 // vectorized loop because we have proven that, under a proper runtime 7868 // test guarding the vectorized loop, the value of the phi, and the casted 7869 // value of the phi, are the same. The last instruction in this casting chain 7870 // will get its scalar/vector/widened def from the scalar/vector/widened def 7871 // of the respective phi node. 
Any other casts in the induction def-use chain 7872 // have no other uses outside the phi update chain, and will be ignored. 7873 InductionDescriptor &IndDes = Induction.second; 7874 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7875 DeadInstructions.insert(Casts.begin(), Casts.end()); 7876 } 7877 } 7878 7879 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7880 7881 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7882 7883 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7884 Instruction::BinaryOps BinOp) { 7885 // When unrolling and the VF is 1, we only need to add a simple scalar. 7886 Type *Ty = Val->getType(); 7887 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7888 7889 if (Ty->isFloatingPointTy()) { 7890 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7891 7892 // Floating point operations had to be 'fast' to enable the unrolling. 7893 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7894 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7895 } 7896 Constant *C = ConstantInt::get(Ty, StartIdx); 7897 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7898 } 7899 7900 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7901 SmallVector<Metadata *, 4> MDs; 7902 // Reserve first location for self reference to the LoopID metadata node. 7903 MDs.push_back(nullptr); 7904 bool IsUnrollMetadata = false; 7905 MDNode *LoopID = L->getLoopID(); 7906 if (LoopID) { 7907 // First find existing loop unrolling disable metadata. 7908 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7909 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7910 if (MD) { 7911 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7912 IsUnrollMetadata = 7913 S && S->getString().startswith("llvm.loop.unroll.disable"); 7914 } 7915 MDs.push_back(LoopID->getOperand(i)); 7916 } 7917 } 7918 7919 if (!IsUnrollMetadata) { 7920 // Add runtime unroll disable metadata. 7921 LLVMContext &Context = L->getHeader()->getContext(); 7922 SmallVector<Metadata *, 1> DisableOperands; 7923 DisableOperands.push_back( 7924 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7925 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7926 MDs.push_back(DisableNode); 7927 MDNode *NewLoopID = MDNode::get(Context, MDs); 7928 // Set operand 0 to refer to the loop id itself. 7929 NewLoopID->replaceOperandWith(0, NewLoopID); 7930 L->setLoopID(NewLoopID); 7931 } 7932 } 7933 7934 //===--------------------------------------------------------------------===// 7935 // EpilogueVectorizerMainLoop 7936 //===--------------------------------------------------------------------===// 7937 7938 /// This function is partially responsible for generating the control flow 7939 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7940 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7941 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7942 Loop *Lp = createVectorLoopSkeleton(""); 7943 7944 // Generate the code to check the minimum iteration count of the vector 7945 // epilogue (see below). 7946 EPI.EpilogueIterationCountCheck = 7947 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7948 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7949 7950 // Generate the code to check any assumptions that we've made for SCEV 7951 // expressions. 
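  // (A typical assumption is that a narrow induction variable does not wrap
  // when extended to the wider type used for the trip count; the emitted
  // check branches to the scalar loop if it would.)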
7952 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 7953 7954 // Generate the code that checks at runtime if arrays overlap. We put the 7955 // checks into a separate block to make the more common case of few elements 7956 // faster. 7957 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7958 7959 // Generate the iteration count check for the main loop, *after* the check 7960 // for the epilogue loop, so that the path-length is shorter for the case 7961 // that goes directly through the vector epilogue. The longer-path length for 7962 // the main loop is compensated for, by the gain from vectorizing the larger 7963 // trip count. Note: the branch will get updated later on when we vectorize 7964 // the epilogue. 7965 EPI.MainLoopIterationCountCheck = 7966 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7967 7968 // Generate the induction variable. 7969 OldInduction = Legal->getPrimaryInduction(); 7970 Type *IdxTy = Legal->getWidestInductionType(); 7971 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7972 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7973 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7974 EPI.VectorTripCount = CountRoundDown; 7975 Induction = 7976 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7977 getDebugLocFromInstOrOperands(OldInduction)); 7978 7979 // Skip induction resume value creation here because they will be created in 7980 // the second pass. If we created them here, they wouldn't be used anyway, 7981 // because the vplan in the second pass still contains the inductions from the 7982 // original loop. 7983 7984 return completeLoopSkeleton(Lp, OrigLoopID); 7985 } 7986 7987 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7988 LLVM_DEBUG({ 7989 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7990 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7991 << ", Main Loop UF:" << EPI.MainLoopUF 7992 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7993 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7994 }); 7995 } 7996 7997 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7998 DEBUG_WITH_TYPE(VerboseDebug, { 7999 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8000 }); 8001 } 8002 8003 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8004 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8005 assert(L && "Expected valid Loop."); 8006 assert(Bypass && "Expected valid bypass basic block."); 8007 unsigned VFactor = 8008 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8009 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8010 Value *Count = getOrCreateTripCount(L); 8011 // Reuse existing vector loop preheader for TC checks. 8012 // Note that new preheader block is generated for vector loop. 8013 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8014 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8015 8016 // Generate code to check if the loop's trip count is less than VF * UF of the 8017 // main vector loop. 8018 auto P = 8019 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8020 8021 Value *CheckMinIters = Builder.CreateICmp( 8022 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8023 "min.iters.check"); 8024 8025 if (!ForEpilogue) 8026 TCCheckBlock->setName("vector.main.loop.iter.check"); 8027 8028 // Create new preheader for vector loop. 
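  // After the split below and the branch created at the end of this function,
  // the check block ends with (illustratively, for VF * UF = 16):
  //   %min.iters.check = icmp ult i64 %count, 16
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  // with ule instead of ult when a scalar epilogue must be left to run.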
8029 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8030 DT, LI, nullptr, "vector.ph"); 8031 8032 if (ForEpilogue) { 8033 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8034 DT->getNode(Bypass)->getIDom()) && 8035 "TC check is expected to dominate Bypass"); 8036 8037 // Update dominator for Bypass & LoopExit. 8038 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8039 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8040 8041 LoopBypassBlocks.push_back(TCCheckBlock); 8042 8043 // Save the trip count so we don't have to regenerate it in the 8044 // vec.epilog.iter.check. This is safe to do because the trip count 8045 // generated here dominates the vector epilog iter check. 8046 EPI.TripCount = Count; 8047 } 8048 8049 ReplaceInstWithInst( 8050 TCCheckBlock->getTerminator(), 8051 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8052 8053 return TCCheckBlock; 8054 } 8055 8056 //===--------------------------------------------------------------------===// 8057 // EpilogueVectorizerEpilogueLoop 8058 //===--------------------------------------------------------------------===// 8059 8060 /// This function is partially responsible for generating the control flow 8061 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8062 BasicBlock * 8063 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8064 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8065 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8066 8067 // Now, compare the remaining count and if there aren't enough iterations to 8068 // execute the vectorized epilogue skip to the scalar part. 8069 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8070 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8071 LoopVectorPreHeader = 8072 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8073 LI, nullptr, "vec.epilog.ph"); 8074 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8075 VecEpilogueIterationCountCheck); 8076 8077 // Adjust the control flow taking the state info from the main loop 8078 // vectorization into account. 8079 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8080 "expected this to be saved from the previous pass."); 8081 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8082 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8083 8084 DT->changeImmediateDominator(LoopVectorPreHeader, 8085 EPI.MainLoopIterationCountCheck); 8086 8087 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8088 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8089 8090 if (EPI.SCEVSafetyCheck) 8091 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8092 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8093 if (EPI.MemSafetyCheck) 8094 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8095 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8096 8097 DT->changeImmediateDominator( 8098 VecEpilogueIterationCountCheck, 8099 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8100 8101 DT->changeImmediateDominator(LoopScalarPreHeader, 8102 EPI.EpilogueIterationCountCheck); 8103 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8104 8105 // Keep track of bypass blocks, as they feed start values to the induction 8106 // phis in the scalar loop preheader. 
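  // (For instance, when the epilogue iteration-count check sends execution to
  // the scalar loop, the inductions resume from the main loop's vector trip
  // count rather than from the epilogue's.)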
8107 if (EPI.SCEVSafetyCheck) 8108 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8109 if (EPI.MemSafetyCheck) 8110 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8111 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8112 8113 // Generate a resume induction for the vector epilogue and put it in the 8114 // vector epilogue preheader 8115 Type *IdxTy = Legal->getWidestInductionType(); 8116 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8117 LoopVectorPreHeader->getFirstNonPHI()); 8118 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8119 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8120 EPI.MainLoopIterationCountCheck); 8121 8122 // Generate the induction variable. 8123 OldInduction = Legal->getPrimaryInduction(); 8124 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8125 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8126 Value *StartIdx = EPResumeVal; 8127 Induction = 8128 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8129 getDebugLocFromInstOrOperands(OldInduction)); 8130 8131 // Generate induction resume values. These variables save the new starting 8132 // indexes for the scalar loop. They are used to test if there are any tail 8133 // iterations left once the vector loop has completed. 8134 // Note that when the vectorized epilogue is skipped due to iteration count 8135 // check, then the resume value for the induction variable comes from 8136 // the trip count of the main vector loop, hence passing the AdditionalBypass 8137 // argument. 8138 createInductionResumeValues(Lp, CountRoundDown, 8139 {VecEpilogueIterationCountCheck, 8140 EPI.VectorTripCount} /* AdditionalBypass */); 8141 8142 AddRuntimeUnrollDisableMetaData(Lp); 8143 return completeLoopSkeleton(Lp, OrigLoopID); 8144 } 8145 8146 BasicBlock * 8147 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8148 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8149 8150 assert(EPI.TripCount && 8151 "Expected trip count to have been safed in the first pass."); 8152 assert( 8153 (!isa<Instruction>(EPI.TripCount) || 8154 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8155 "saved trip count does not dominate insertion point."); 8156 Value *TC = EPI.TripCount; 8157 IRBuilder<> Builder(Insert->getTerminator()); 8158 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8159 8160 // Generate code to check if the loop's trip count is less than VF * UF of the 8161 // vector epilogue loop. 8162 auto P = 8163 Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8164 8165 Value *CheckMinIters = Builder.CreateICmp( 8166 P, Count, 8167 ConstantInt::get(Count->getType(), 8168 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8169 "min.epilog.iters.check"); 8170 8171 ReplaceInstWithInst( 8172 Insert->getTerminator(), 8173 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8174 8175 LoopBypassBlocks.push_back(Insert); 8176 return Insert; 8177 } 8178 8179 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8180 LLVM_DEBUG({ 8181 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8182 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8183 << ", Main Loop UF:" << EPI.MainLoopUF 8184 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8185 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8186 }); 8187 } 8188 8189 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8190 DEBUG_WITH_TYPE(VerboseDebug, { 8191 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8192 }); 8193 } 8194 8195 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8196 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8197 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8198 bool PredicateAtRangeStart = Predicate(Range.Start); 8199 8200 for (ElementCount TmpVF = Range.Start * 2; 8201 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8202 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8203 Range.End = TmpVF; 8204 break; 8205 } 8206 8207 return PredicateAtRangeStart; 8208 } 8209 8210 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8211 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8212 /// of VF's starting at a given VF and extending it as much as possible. Each 8213 /// vectorization decision can potentially shorten this sub-range during 8214 /// buildVPlan(). 8215 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8216 ElementCount MaxVF) { 8217 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8218 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8219 VFRange SubRange = {VF, MaxVFPlusOne}; 8220 VPlans.push_back(buildVPlan(SubRange)); 8221 VF = SubRange.End; 8222 } 8223 } 8224 8225 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8226 VPlanPtr &Plan) { 8227 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8228 8229 // Look for cached value. 8230 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8231 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8232 if (ECEntryIt != EdgeMaskCache.end()) 8233 return ECEntryIt->second; 8234 8235 VPValue *SrcMask = createBlockInMask(Src, Plan); 8236 8237 // The terminator has to be a branch inst! 8238 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8239 assert(BI && "Unexpected terminator found"); 8240 8241 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8242 return EdgeMaskCache[Edge] = SrcMask; 8243 8244 // If source is an exiting block, we know the exit edge is dynamically dead 8245 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8246 // adding uses of an otherwise potentially dead instruction. 
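  // For a non-exiting source block ending in
  //   br i1 %c, label %dst, label %other
  // the mask computed below for the src->dst edge is
  // select(SrcMask, %c, false) and for src->other it is
  // select(SrcMask, !%c, false), unless the source's block mask is all-one.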
8247 if (OrigLoop->isLoopExiting(Src)) 8248 return EdgeMaskCache[Edge] = SrcMask; 8249 8250 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8251 assert(EdgeMask && "No Edge Mask found for condition"); 8252 8253 if (BI->getSuccessor(0) != Dst) 8254 EdgeMask = Builder.createNot(EdgeMask); 8255 8256 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8257 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8258 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8259 // The select version does not introduce new UB if SrcMask is false and 8260 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8261 VPValue *False = Plan->getOrAddVPValue( 8262 ConstantInt::getFalse(BI->getCondition()->getType())); 8263 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8264 } 8265 8266 return EdgeMaskCache[Edge] = EdgeMask; 8267 } 8268 8269 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8270 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8271 8272 // Look for cached value. 8273 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8274 if (BCEntryIt != BlockMaskCache.end()) 8275 return BCEntryIt->second; 8276 8277 // All-one mask is modelled as no-mask following the convention for masked 8278 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8279 VPValue *BlockMask = nullptr; 8280 8281 if (OrigLoop->getHeader() == BB) { 8282 if (!CM.blockNeedsPredication(BB)) 8283 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8284 8285 // Create the block in mask as the first non-phi instruction in the block. 8286 VPBuilder::InsertPointGuard Guard(Builder); 8287 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8288 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8289 8290 // Introduce the early-exit compare IV <= BTC to form header block mask. 8291 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8292 // Start by constructing the desired canonical IV. 8293 VPValue *IV = nullptr; 8294 if (Legal->getPrimaryInduction()) 8295 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8296 else { 8297 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8298 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8299 IV = IVRecipe->getVPValue(); 8300 } 8301 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8302 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8303 8304 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8305 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8306 // as a second argument, we only pass the IV here and extract the 8307 // tripcount from the transform state where codegen of the VP instructions 8308 // happen. 8309 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8310 } else { 8311 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8312 } 8313 return BlockMaskCache[BB] = BlockMask; 8314 } 8315 8316 // This is the block mask. We OR all incoming edges. 8317 for (auto *Predecessor : predecessors(BB)) { 8318 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8319 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8320 return BlockMaskCache[BB] = EdgeMask; 8321 8322 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8323 BlockMask = EdgeMask; 8324 continue; 8325 } 8326 8327 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8328 } 8329 8330 return BlockMaskCache[BB] = BlockMask; 8331 } 8332 8333 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8334 VPlanPtr &Plan) { 8335 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8336 "Must be called with either a load or store"); 8337 8338 auto willWiden = [&](ElementCount VF) -> bool { 8339 if (VF.isScalar()) 8340 return false; 8341 LoopVectorizationCostModel::InstWidening Decision = 8342 CM.getWideningDecision(I, VF); 8343 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8344 "CM decision should be taken at this point."); 8345 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8346 return true; 8347 if (CM.isScalarAfterVectorization(I, VF) || 8348 CM.isProfitableToScalarize(I, VF)) 8349 return false; 8350 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8351 }; 8352 8353 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8354 return nullptr; 8355 8356 VPValue *Mask = nullptr; 8357 if (Legal->isMaskRequired(I)) 8358 Mask = createBlockInMask(I->getParent(), Plan); 8359 8360 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8361 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8362 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8363 8364 StoreInst *Store = cast<StoreInst>(I); 8365 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8366 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8367 } 8368 8369 VPWidenIntOrFpInductionRecipe * 8370 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8371 // Check if this is an integer or fp induction. If so, build the recipe that 8372 // produces its scalar and vector values. 8373 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8374 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8375 II.getKind() == InductionDescriptor::IK_FpInduction) { 8376 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8377 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8378 return new VPWidenIntOrFpInductionRecipe( 8379 Phi, Start, Casts.empty() ? nullptr : Casts.front()); 8380 } 8381 8382 return nullptr; 8383 } 8384 8385 VPWidenIntOrFpInductionRecipe * 8386 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8387 VPlan &Plan) const { 8388 // Optimize the special case where the source is a constant integer 8389 // induction variable. Notice that we can only optimize the 'trunc' case 8390 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8391 // (c) other casts depend on pointer size. 8392 8393 // Determine whether \p K is a truncation based on an induction variable that 8394 // can be optimized. 
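  // For example (illustrative IR), with a primary i64 induction %iv, a cast
  //   %t = trunc i64 %iv to i32
  // can be modelled directly as a widened i32 induction, avoiding a wide i64
  // induction followed by a vector truncate.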
8395 auto isOptimizableIVTruncate = 8396 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8397 return [=](ElementCount VF) -> bool { 8398 return CM.isOptimizableIVTruncate(K, VF); 8399 }; 8400 }; 8401 8402 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8403 isOptimizableIVTruncate(I), Range)) { 8404 8405 InductionDescriptor II = 8406 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8407 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8408 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8409 Start, nullptr, I); 8410 } 8411 return nullptr; 8412 } 8413 8414 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8415 // If all incoming values are equal, the incoming VPValue can be used directly 8416 // instead of creating a new VPBlendRecipe. 8417 Value *FirstIncoming = Phi->getIncomingValue(0); 8418 if (all_of(Phi->incoming_values(), [FirstIncoming](const Value *Inc) { 8419 return FirstIncoming == Inc; 8420 })) { 8421 return Plan->getOrAddVPValue(Phi->getIncomingValue(0)); 8422 } 8423 8424 // We know that all PHIs in non-header blocks are converted into selects, so 8425 // we don't have to worry about the insertion order and we can just use the 8426 // builder. At this point we generate the predication tree. There may be 8427 // duplications since this is a simple recursive scan, but future 8428 // optimizations will clean it up. 8429 SmallVector<VPValue *, 2> Operands; 8430 unsigned NumIncoming = Phi->getNumIncomingValues(); 8431 8432 for (unsigned In = 0; In < NumIncoming; In++) { 8433 VPValue *EdgeMask = 8434 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8435 assert((EdgeMask || NumIncoming == 1) && 8436 "Multiple predecessors with one having a full mask"); 8437 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8438 if (EdgeMask) 8439 Operands.push_back(EdgeMask); 8440 } 8441 return toVPRecipeResult(new VPBlendRecipe(Phi, Operands)); 8442 } 8443 8444 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8445 VPlan &Plan) const { 8446 8447 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8448 [this, CI](ElementCount VF) { 8449 return CM.isScalarWithPredication(CI, VF); 8450 }, 8451 Range); 8452 8453 if (IsPredicated) 8454 return nullptr; 8455 8456 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8457 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8458 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8459 ID == Intrinsic::pseudoprobe || 8460 ID == Intrinsic::experimental_noalias_scope_decl)) 8461 return nullptr; 8462 8463 auto willWiden = [&](ElementCount VF) -> bool { 8464 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8465 // The following case may be scalarized depending on the VF. 8466 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8467 // version of the instruction. 8468 // Is it beneficial to perform intrinsic call compared to lib call? 8469 bool NeedToScalarize = false; 8470 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8471 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8472 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8473 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8474 "Cannot have invalid costs while widening"); 8475 return UseVectorIntrinsic || !NeedToScalarize; 8476 }; 8477 8478 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8479 return nullptr; 8480 8481 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8482 } 8483 8484 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8485 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8486 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8487 // Instruction should be widened, unless it is scalar after vectorization, 8488 // scalarization is profitable or it is predicated. 8489 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8490 return CM.isScalarAfterVectorization(I, VF) || 8491 CM.isProfitableToScalarize(I, VF) || 8492 CM.isScalarWithPredication(I, VF); 8493 }; 8494 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8495 Range); 8496 } 8497 8498 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8499 auto IsVectorizableOpcode = [](unsigned Opcode) { 8500 switch (Opcode) { 8501 case Instruction::Add: 8502 case Instruction::And: 8503 case Instruction::AShr: 8504 case Instruction::BitCast: 8505 case Instruction::FAdd: 8506 case Instruction::FCmp: 8507 case Instruction::FDiv: 8508 case Instruction::FMul: 8509 case Instruction::FNeg: 8510 case Instruction::FPExt: 8511 case Instruction::FPToSI: 8512 case Instruction::FPToUI: 8513 case Instruction::FPTrunc: 8514 case Instruction::FRem: 8515 case Instruction::FSub: 8516 case Instruction::ICmp: 8517 case Instruction::IntToPtr: 8518 case Instruction::LShr: 8519 case Instruction::Mul: 8520 case Instruction::Or: 8521 case Instruction::PtrToInt: 8522 case Instruction::SDiv: 8523 case Instruction::Select: 8524 case Instruction::SExt: 8525 case Instruction::Shl: 8526 case Instruction::SIToFP: 8527 case Instruction::SRem: 8528 case Instruction::Sub: 8529 case Instruction::Trunc: 8530 case Instruction::UDiv: 8531 case Instruction::UIToFP: 8532 case Instruction::URem: 8533 case Instruction::Xor: 8534 case Instruction::ZExt: 8535 return true; 8536 } 8537 return false; 8538 }; 8539 8540 if (!IsVectorizableOpcode(I->getOpcode())) 8541 return nullptr; 8542 8543 // Success: widen this instruction. 8544 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8545 } 8546 8547 VPBasicBlock *VPRecipeBuilder::handleReplication( 8548 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8549 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8550 VPlanPtr &Plan) { 8551 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8552 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8553 Range); 8554 8555 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8556 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8557 Range); 8558 8559 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8560 IsUniform, IsPredicated); 8561 setRecipe(I, Recipe); 8562 Plan->addVPValue(I, Recipe); 8563 8564 // Find if I uses a predicated instruction. If so, it will use its scalar 8565 // value. Avoid hoisting the insert-element which packs the scalar value into 8566 // a vector value, as that happens iff all users use the vector value. 
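  // E.g. (sketch): if a predicated %div is only used as the scalar operand of
  // another replicated instruction, there is no need to keep the
  // insert-element sequence that would assemble a vector of %div's lanes.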
8567 for (auto &Op : I->operands()) 8568 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8569 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8570 PredInst2Recipe[PredInst]->setAlsoPack(false); 8571 8572 // Finalize the recipe for Instr, first if it is not predicated. 8573 if (!IsPredicated) { 8574 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8575 VPBB->appendRecipe(Recipe); 8576 return VPBB; 8577 } 8578 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8579 assert(VPBB->getSuccessors().empty() && 8580 "VPBB has successors when handling predicated replication."); 8581 // Record predicated instructions for above packing optimizations. 8582 PredInst2Recipe[I] = Recipe; 8583 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8584 VPBlockUtils::insertBlockAfter(Region, VPBB); 8585 auto *RegSucc = new VPBasicBlock(); 8586 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8587 return RegSucc; 8588 } 8589 8590 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8591 VPRecipeBase *PredRecipe, 8592 VPlanPtr &Plan) { 8593 // Instructions marked for predication are replicated and placed under an 8594 // if-then construct to prevent side-effects. 8595 8596 // Generate recipes to compute the block mask for this region. 8597 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8598 8599 // Build the triangular if-then region. 8600 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8601 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8602 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8603 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8604 auto *PHIRecipe = Instr->getType()->isVoidTy() 8605 ? nullptr 8606 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8607 if (PHIRecipe) { 8608 Plan->removeVPValueFor(Instr); 8609 Plan->addVPValue(Instr, PHIRecipe); 8610 } 8611 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8612 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8613 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8614 8615 // Note: first set Entry as region entry and then connect successors starting 8616 // from it in order, to propagate the "parent" of each VPBasicBlock. 8617 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8618 VPBlockUtils::connectBlocks(Pred, Exit); 8619 8620 return Region; 8621 } 8622 8623 VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8624 VFRange &Range, 8625 VPlanPtr &Plan) { 8626 // First, check for specific widening recipes that deal with calls, memory 8627 // operations, inductions and Phi nodes. 
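  // The checks below are ordered roughly from most to least specialised:
  // calls, then loads/stores, then phis (blends, inductions, reductions), then
  // induction truncates, and finally GEP/select/generic widening. If no recipe
  // is created here, the caller falls back to replication.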
8628 if (auto *CI = dyn_cast<CallInst>(Instr)) 8629 return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan)); 8630 8631 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8632 return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan)); 8633 8634 VPRecipeBase *Recipe; 8635 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8636 if (Phi->getParent() != OrigLoop->getHeader()) 8637 return tryToBlend(Phi, Plan); 8638 if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) 8639 return toVPRecipeResult(Recipe); 8640 8641 if (Legal->isReductionVariable(Phi)) { 8642 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8643 VPValue *StartV = 8644 Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); 8645 return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV)); 8646 } 8647 8648 return toVPRecipeResult(new VPWidenPHIRecipe(Phi)); 8649 } 8650 8651 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( 8652 cast<TruncInst>(Instr), Range, *Plan))) 8653 return toVPRecipeResult(Recipe); 8654 8655 if (!shouldWiden(Instr, Range)) 8656 return nullptr; 8657 8658 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8659 return toVPRecipeResult(new VPWidenGEPRecipe( 8660 GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop)); 8661 8662 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8663 bool InvariantCond = 8664 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8665 return toVPRecipeResult(new VPWidenSelectRecipe( 8666 *SI, Plan->mapToVPValues(SI->operands()), InvariantCond)); 8667 } 8668 8669 return toVPRecipeResult(tryToWiden(Instr, *Plan)); 8670 } 8671 8672 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8673 ElementCount MaxVF) { 8674 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8675 8676 // Collect instructions from the original loop that will become trivially dead 8677 // in the vectorized loop. We don't need to vectorize these instructions. For 8678 // example, original induction update instructions can become dead because we 8679 // separately emit induction "steps" when generating code for the new loop. 8680 // Similarly, we create a new latch condition when setting up the structure 8681 // of the new loop, so the old one can become dead. 8682 SmallPtrSet<Instruction *, 4> DeadInstructions; 8683 collectTriviallyDeadInstructions(DeadInstructions); 8684 8685 // Add assume instructions we need to drop to DeadInstructions, to prevent 8686 // them from being added to the VPlan. 8687 // TODO: We only need to drop assumes in blocks that get flattend. If the 8688 // control flow is preserved, we should keep them. 8689 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8690 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8691 8692 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8693 // Dead instructions do not need sinking. Remove them from SinkAfter. 
8694 for (Instruction *I : DeadInstructions) 8695 SinkAfter.erase(I); 8696 8697 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8698 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8699 VFRange SubRange = {VF, MaxVFPlusOne}; 8700 VPlans.push_back( 8701 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8702 VF = SubRange.End; 8703 } 8704 } 8705 8706 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8707 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8708 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 8709 8710 // Hold a mapping from predicated instructions to their recipes, in order to 8711 // fix their AlsoPack behavior if a user is determined to replicate and use a 8712 // scalar instead of vector value. 8713 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 8714 8715 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8716 8717 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8718 8719 // --------------------------------------------------------------------------- 8720 // Pre-construction: record ingredients whose recipes we'll need to further 8721 // process after constructing the initial VPlan. 8722 // --------------------------------------------------------------------------- 8723 8724 // Mark instructions we'll need to sink later and their targets as 8725 // ingredients whose recipe we'll need to record. 8726 for (auto &Entry : SinkAfter) { 8727 RecipeBuilder.recordRecipeOf(Entry.first); 8728 RecipeBuilder.recordRecipeOf(Entry.second); 8729 } 8730 for (auto &Reduction : CM.getInLoopReductionChains()) { 8731 PHINode *Phi = Reduction.first; 8732 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 8733 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8734 8735 RecipeBuilder.recordRecipeOf(Phi); 8736 for (auto &R : ReductionOperations) { 8737 RecipeBuilder.recordRecipeOf(R); 8738 // For min/max reducitons, where we have a pair of icmp/select, we also 8739 // need to record the ICmp recipe, so it can be removed later. 8740 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8741 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8742 } 8743 } 8744 8745 // For each interleave group which is relevant for this (possibly trimmed) 8746 // Range, add it to the set of groups to be later applied to the VPlan and add 8747 // placeholders for its members' Recipes which we'll be replacing with a 8748 // single VPInterleaveRecipe. 8749 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8750 auto applyIG = [IG, this](ElementCount VF) -> bool { 8751 return (VF.isVector() && // Query is illegal for VF == 1 8752 CM.getWideningDecision(IG->getInsertPos(), VF) == 8753 LoopVectorizationCostModel::CM_Interleave); 8754 }; 8755 if (!getDecisionAndClampRange(applyIG, Range)) 8756 continue; 8757 InterleaveGroups.insert(IG); 8758 for (unsigned i = 0; i < IG->getFactor(); i++) 8759 if (Instruction *Member = IG->getMember(i)) 8760 RecipeBuilder.recordRecipeOf(Member); 8761 }; 8762 8763 // --------------------------------------------------------------------------- 8764 // Build initial VPlan: Scan the body of the loop in a topological order to 8765 // visit each basic block after having visited its predecessor basic blocks. 8766 // --------------------------------------------------------------------------- 8767 8768 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 
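  // The plain CFG built below is roughly (illustrative)
  //   Pre-Entry -> VPBB(header) -> VPBB(...) -> ... -> VPBB(latch)
  // with one or more VPBasicBlocks per original block (replication may split
  // them); the dummy Pre-Entry block is disconnected and deleted once the
  // scan is complete.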
8769 auto Plan = std::make_unique<VPlan>(); 8770 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 8771 Plan->setEntry(VPBB); 8772 8773 // Scan the body of the loop in a topological order to visit each basic block 8774 // after having visited its predecessor basic blocks. 8775 LoopBlocksDFS DFS(OrigLoop); 8776 DFS.perform(LI); 8777 8778 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8779 // Relevant instructions from basic block BB will be grouped into VPRecipe 8780 // ingredients and fill a new VPBasicBlock. 8781 unsigned VPBBsForBB = 0; 8782 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 8783 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 8784 VPBB = FirstVPBBForBB; 8785 Builder.setInsertPoint(VPBB); 8786 8787 // Introduce each ingredient into VPlan. 8788 // TODO: Model and preserve debug instrinsics in VPlan. 8789 for (Instruction &I : BB->instructionsWithoutDebug()) { 8790 Instruction *Instr = &I; 8791 8792 // First filter out irrelevant instructions, to ensure no recipes are 8793 // built for them. 8794 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8795 continue; 8796 8797 if (auto RecipeOrValue = 8798 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 8799 // If Instr can be simplified to an existing VPValue, use it. 8800 if (RecipeOrValue.is<VPValue *>()) { 8801 Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>()); 8802 continue; 8803 } 8804 // Otherwise, add the new recipe. 8805 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8806 for (auto *Def : Recipe->definedValues()) { 8807 auto *UV = Def->getUnderlyingValue(); 8808 Plan->addVPValue(UV, Def); 8809 } 8810 8811 RecipeBuilder.setRecipe(Instr, Recipe); 8812 VPBB->appendRecipe(Recipe); 8813 continue; 8814 } 8815 8816 // Otherwise, if all widening options failed, Instruction is to be 8817 // replicated. This may create a successor for VPBB. 8818 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 8819 Instr, Range, VPBB, PredInst2Recipe, Plan); 8820 if (NextVPBB != VPBB) { 8821 VPBB = NextVPBB; 8822 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8823 : ""); 8824 } 8825 } 8826 } 8827 8828 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 8829 // may also be empty, such as the last one VPBB, reflecting original 8830 // basic-blocks with no recipes. 8831 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 8832 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 8833 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 8834 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 8835 delete PreEntry; 8836 8837 // --------------------------------------------------------------------------- 8838 // Transform initial VPlan: Apply previously taken decisions, in order, to 8839 // bring the VPlan to its final state. 8840 // --------------------------------------------------------------------------- 8841 8842 // Apply Sink-After legal constraints. 8843 for (auto &Entry : SinkAfter) { 8844 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8845 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8846 // If the target is in a replication region, make sure to move Sink to the 8847 // block after it, not into the replication region itself. 
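  // E.g. (sketch) for a replicate region "pred.udiv" with the usual
  //   entry -> if -> continue
  // triangle, a Sink whose Target sits inside the region is placed at the
  // beginning of the region's single successor block rather than inside the
  // region itself.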
8848 if (auto *Region = 8849 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8850 if (Region->isReplicator()) { 8851 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8852 VPBasicBlock *NextBlock = 8853 cast<VPBasicBlock>(Region->getSuccessors().front()); 8854 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8855 continue; 8856 } 8857 } 8858 Sink->moveAfter(Target); 8859 } 8860 8861 // Interleave memory: for each Interleave Group we marked earlier as relevant 8862 // for this VPlan, replace the Recipes widening its memory instructions with a 8863 // single VPInterleaveRecipe at its insertion point. 8864 for (auto IG : InterleaveGroups) { 8865 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8866 RecipeBuilder.getRecipe(IG->getInsertPos())); 8867 SmallVector<VPValue *, 4> StoredValues; 8868 for (unsigned i = 0; i < IG->getFactor(); ++i) 8869 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8870 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8871 8872 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8873 Recipe->getMask()); 8874 VPIG->insertBefore(Recipe); 8875 unsigned J = 0; 8876 for (unsigned i = 0; i < IG->getFactor(); ++i) 8877 if (Instruction *Member = IG->getMember(i)) { 8878 if (!Member->getType()->isVoidTy()) { 8879 VPValue *OriginalV = Plan->getVPValue(Member); 8880 Plan->removeVPValueFor(Member); 8881 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8882 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8883 J++; 8884 } 8885 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8886 } 8887 } 8888 8889 // Adjust the recipes for any inloop reductions. 8890 if (Range.Start.isVector()) 8891 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8892 8893 // Finally, if tail is folded by masking, introduce selects between the phi 8894 // and the live-out instruction of each reduction, at the end of the latch. 8895 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8896 Builder.setInsertPoint(VPBB); 8897 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8898 for (auto &Reduction : Legal->getReductionVars()) { 8899 if (CM.isInLoopReduction(Reduction.first)) 8900 continue; 8901 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8902 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8903 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8904 } 8905 } 8906 8907 std::string PlanName; 8908 raw_string_ostream RSO(PlanName); 8909 ElementCount VF = Range.Start; 8910 Plan->addVF(VF); 8911 RSO << "Initial VPlan for VF={" << VF; 8912 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8913 Plan->addVF(VF); 8914 RSO << "," << VF; 8915 } 8916 RSO << "},UF>=1"; 8917 RSO.flush(); 8918 Plan->setName(PlanName); 8919 8920 return Plan; 8921 } 8922 8923 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8924 // Outer loop handling: They may require CFG and instruction level 8925 // transformations before even evaluating whether vectorization is profitable. 8926 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8927 // the vectorization pipeline. 
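  // Every candidate VF in [Range.Start, Range.End), obtained by repeatedly
  // doubling Range.Start, is attached to this single VPlan.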
8928 assert(!OrigLoop->isInnermost()); 8929 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8930 8931 // Create new empty VPlan 8932 auto Plan = std::make_unique<VPlan>(); 8933 8934 // Build hierarchical CFG 8935 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8936 HCFGBuilder.buildHierarchicalCFG(); 8937 8938 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 8939 VF *= 2) 8940 Plan->addVF(VF); 8941 8942 if (EnableVPlanPredication) { 8943 VPlanPredicator VPP(*Plan); 8944 VPP.predicate(); 8945 8946 // Avoid running transformation to recipes until masked code generation in 8947 // VPlan-native path is in place. 8948 return Plan; 8949 } 8950 8951 SmallPtrSet<Instruction *, 1> DeadInstructions; 8952 VPlanTransforms::VPInstructionsToVPRecipes( 8953 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 8954 return Plan; 8955 } 8956 8957 // Adjust the recipes for any inloop reductions. The chain of instructions 8958 // leading from the loop exit instr to the phi need to be converted to 8959 // reductions, with one operand being vector and the other being the scalar 8960 // reduction chain. 8961 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 8962 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 8963 for (auto &Reduction : CM.getInLoopReductionChains()) { 8964 PHINode *Phi = Reduction.first; 8965 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8966 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8967 8968 // ReductionOperations are orders top-down from the phi's use to the 8969 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 8970 // which of the two operands will remain scalar and which will be reduced. 8971 // For minmax the chain will be the select instructions. 8972 Instruction *Chain = Phi; 8973 for (Instruction *R : ReductionOperations) { 8974 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 8975 RecurKind Kind = RdxDesc.getRecurrenceKind(); 8976 8977 VPValue *ChainOp = Plan->getVPValue(Chain); 8978 unsigned FirstOpId; 8979 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8980 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 8981 "Expected to replace a VPWidenSelectSC"); 8982 FirstOpId = 1; 8983 } else { 8984 assert(isa<VPWidenRecipe>(WidenRecipe) && 8985 "Expected to replace a VPWidenSC"); 8986 FirstOpId = 0; 8987 } 8988 unsigned VecOpId = 8989 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 8990 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 8991 8992 auto *CondOp = CM.foldTailByMasking() 8993 ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8994 : nullptr; 8995 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8996 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 8997 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8998 Plan->removeVPValueFor(R); 8999 Plan->addVPValue(R, RedRecipe); 9000 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9001 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9002 WidenRecipe->eraseFromParent(); 9003 9004 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9005 VPRecipeBase *CompareRecipe = 9006 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9007 assert(isa<VPWidenRecipe>(CompareRecipe) && 9008 "Expected to replace a VPWidenSC"); 9009 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9010 "Expected no remaining users"); 9011 CompareRecipe->eraseFromParent(); 9012 } 9013 Chain = R; 9014 } 9015 } 9016 } 9017 9018 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9019 VPSlotTracker &SlotTracker) const { 9020 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9021 IG->getInsertPos()->printAsOperand(O, false); 9022 O << ", "; 9023 getAddr()->printAsOperand(O, SlotTracker); 9024 VPValue *Mask = getMask(); 9025 if (Mask) { 9026 O << ", "; 9027 Mask->printAsOperand(O, SlotTracker); 9028 } 9029 for (unsigned i = 0; i < IG->getFactor(); ++i) 9030 if (Instruction *I = IG->getMember(i)) 9031 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 9032 } 9033 9034 void VPWidenCallRecipe::execute(VPTransformState &State) { 9035 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9036 *this, State); 9037 } 9038 9039 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9040 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9041 this, *this, InvariantCond, State); 9042 } 9043 9044 void VPWidenRecipe::execute(VPTransformState &State) { 9045 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9046 } 9047 9048 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9049 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9050 *this, State.UF, State.VF, IsPtrLoopInvariant, 9051 IsIndexLoopInvariant, State); 9052 } 9053 9054 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9055 assert(!State.Instance && "Int or FP induction being replicated."); 9056 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9057 getTruncInst(), getVPValue(0), 9058 getCastValue(), State); 9059 } 9060 9061 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9062 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9063 getStartValue(), this, State); 9064 } 9065 9066 void VPBlendRecipe::execute(VPTransformState &State) { 9067 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9068 // We know that all PHIs in non-header blocks are converted into 9069 // selects, so we don't have to worry about the insertion order and we 9070 // can just use the builder. 9071 // At this point we generate the predication tree. There may be 9072 // duplications since this is a simple recursive scan, but future 9073 // optimizations will clean it up. 
9074 9075 unsigned NumIncoming = getNumIncomingValues(); 9076 9077 // Generate a sequence of selects of the form: 9078 // SELECT(Mask3, In3, 9079 // SELECT(Mask2, In2, 9080 // SELECT(Mask1, In1, 9081 // In0))) 9082 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9083 // are essentially undef are taken from In0. 9084 InnerLoopVectorizer::VectorParts Entry(State.UF); 9085 for (unsigned In = 0; In < NumIncoming; ++In) { 9086 for (unsigned Part = 0; Part < State.UF; ++Part) { 9087 // We might have single edge PHIs (blocks) - use an identity 9088 // 'select' for the first PHI operand. 9089 Value *In0 = State.get(getIncomingValue(In), Part); 9090 if (In == 0) 9091 Entry[Part] = In0; // Initialize with the first incoming value. 9092 else { 9093 // Select between the current value and the previous incoming edge 9094 // based on the incoming mask. 9095 Value *Cond = State.get(getMask(In), Part); 9096 Entry[Part] = 9097 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9098 } 9099 } 9100 } 9101 for (unsigned Part = 0; Part < State.UF; ++Part) 9102 State.set(this, Entry[Part], Part); 9103 } 9104 9105 void VPInterleaveRecipe::execute(VPTransformState &State) { 9106 assert(!State.Instance && "Interleave group being replicated."); 9107 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9108 getStoredValues(), getMask()); 9109 } 9110 9111 void VPReductionRecipe::execute(VPTransformState &State) { 9112 assert(!State.Instance && "Reduction being replicated."); 9113 for (unsigned Part = 0; Part < State.UF; ++Part) { 9114 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9115 Value *NewVecOp = State.get(getVecOp(), Part); 9116 if (VPValue *Cond = getCondOp()) { 9117 Value *NewCond = State.get(Cond, Part); 9118 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9119 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9120 Kind, VecTy->getElementType()); 9121 Constant *IdenVec = 9122 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9123 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9124 NewVecOp = Select; 9125 } 9126 Value *NewRed = 9127 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9128 Value *PrevInChain = State.get(getChainOp(), Part); 9129 Value *NextInChain; 9130 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9131 NextInChain = 9132 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9133 NewRed, PrevInChain); 9134 } else { 9135 NextInChain = State.Builder.CreateBinOp( 9136 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9137 PrevInChain); 9138 } 9139 State.set(this, NextInChain, Part); 9140 } 9141 } 9142 9143 void VPReplicateRecipe::execute(VPTransformState &State) { 9144 if (State.Instance) { // Generate a single instance. 9145 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9146 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9147 *State.Instance, IsPredicated, State); 9148 // Insert scalar instance packing it into a vector. 9149 if (AlsoPack && State.VF.isVector()) { 9150 // If we're constructing lane 0, initialize to start from poison. 
9151 if (State.Instance->Lane == 0) { 9152 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9153 Value *Poison = PoisonValue::get( 9154 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9155 State.set(this, Poison, State.Instance->Part); 9156 } 9157 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9158 } 9159 return; 9160 } 9161 9162 // Generate scalar instances for all VF lanes of all UF parts, unless the 9163 // instruction is uniform inwhich case generate only the first lane for each 9164 // of the UF parts. 9165 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); 9166 assert((!State.VF.isScalable() || IsUniform) && 9167 "Can't scalarize a scalable vector"); 9168 for (unsigned Part = 0; Part < State.UF; ++Part) 9169 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9170 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9171 VPIteration(Part, Lane), IsPredicated, 9172 State); 9173 } 9174 9175 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9176 assert(State.Instance && "Branch on Mask works only on single instance."); 9177 9178 unsigned Part = State.Instance->Part; 9179 unsigned Lane = State.Instance->Lane; 9180 9181 Value *ConditionBit = nullptr; 9182 VPValue *BlockInMask = getMask(); 9183 if (BlockInMask) { 9184 ConditionBit = State.get(BlockInMask, Part); 9185 if (ConditionBit->getType()->isVectorTy()) 9186 ConditionBit = State.Builder.CreateExtractElement( 9187 ConditionBit, State.Builder.getInt32(Lane)); 9188 } else // Block in mask is all-one. 9189 ConditionBit = State.Builder.getTrue(); 9190 9191 // Replace the temporary unreachable terminator with a new conditional branch, 9192 // whose two destinations will be set later when they are created. 9193 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9194 assert(isa<UnreachableInst>(CurrentTerminator) && 9195 "Expected to replace unreachable terminator with conditional branch."); 9196 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9197 CondBr->setSuccessor(0, nullptr); 9198 ReplaceInstWithInst(CurrentTerminator, CondBr); 9199 } 9200 9201 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9202 assert(State.Instance && "Predicated instruction PHI works per instance."); 9203 Instruction *ScalarPredInst = 9204 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9205 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9206 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9207 assert(PredicatingBB && "Predicated block has no single predecessor."); 9208 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9209 "operand must be VPReplicateRecipe"); 9210 9211 // By current pack/unpack logic we need to generate only a single phi node: if 9212 // a vector value for the predicated instruction exists at this point it means 9213 // the instruction has vector users only, and a phi for the vector value is 9214 // needed. In this case the recipe of the predicated instruction is marked to 9215 // also do that packing, thereby "hoisting" the insert-element sequence. 9216 // Otherwise, a phi node for the scalar value is needed. 
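  // Roughly, the vector case below produces
  //   %vphi = phi <VF x T> [ %vec.before.insert, %predicating.bb ],
  //                        [ %vec.with.new.lane, %predicated.bb ]
  // (names illustrative), while the scalar case produces a phi of the scalar
  // type with a poison incoming value on the predicating edge.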
9217 unsigned Part = State.Instance->Part; 9218 if (State.hasVectorValue(getOperand(0), Part)) { 9219 Value *VectorValue = State.get(getOperand(0), Part); 9220 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9221 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9222 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9223 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9224 if (State.hasVectorValue(this, Part)) 9225 State.reset(this, VPhi, Part); 9226 else 9227 State.set(this, VPhi, Part); 9228 // NOTE: Currently we need to update the value of the operand, so the next 9229 // predicated iteration inserts its generated value in the correct vector. 9230 State.reset(getOperand(0), VPhi, Part); 9231 } else { 9232 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9233 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9234 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9235 PredicatingBB); 9236 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9237 if (State.hasScalarValue(this, *State.Instance)) 9238 State.reset(this, Phi, *State.Instance); 9239 else 9240 State.set(this, Phi, *State.Instance); 9241 // NOTE: Currently we need to update the value of the operand, so the next 9242 // predicated iteration inserts its generated value in the correct vector. 9243 State.reset(getOperand(0), Phi, *State.Instance); 9244 } 9245 } 9246 9247 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9248 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9249 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9250 StoredValue ? nullptr : getVPValue(), 9251 getAddr(), StoredValue, getMask()); 9252 } 9253 9254 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9255 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9256 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9257 // for predication. 9258 static ScalarEpilogueLowering getScalarEpilogueLowering( 9259 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9260 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9261 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9262 LoopVectorizationLegality &LVL) { 9263 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9264 // don't look at hints or options, and don't request a scalar epilogue. 9265 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9266 // LoopAccessInfo (due to code dependency and not being able to reliably get 9267 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9268 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9269 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9270 // back to the old way and vectorize with versioning when forced. See D81345.) 
9271 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9272 PGSOQueryType::IRPass) && 9273 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9274 return CM_ScalarEpilogueNotAllowedOptSize; 9275 9276 // 2) If set, obey the directives 9277 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9278 switch (PreferPredicateOverEpilogue) { 9279 case PreferPredicateTy::ScalarEpilogue: 9280 return CM_ScalarEpilogueAllowed; 9281 case PreferPredicateTy::PredicateElseScalarEpilogue: 9282 return CM_ScalarEpilogueNotNeededUsePredicate; 9283 case PreferPredicateTy::PredicateOrDontVectorize: 9284 return CM_ScalarEpilogueNotAllowedUsePredicate; 9285 }; 9286 } 9287 9288 // 3) If set, obey the hints 9289 switch (Hints.getPredicate()) { 9290 case LoopVectorizeHints::FK_Enabled: 9291 return CM_ScalarEpilogueNotNeededUsePredicate; 9292 case LoopVectorizeHints::FK_Disabled: 9293 return CM_ScalarEpilogueAllowed; 9294 }; 9295 9296 // 4) if the TTI hook indicates this is profitable, request predication. 9297 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9298 LVL.getLAI())) 9299 return CM_ScalarEpilogueNotNeededUsePredicate; 9300 9301 return CM_ScalarEpilogueAllowed; 9302 } 9303 9304 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9305 // If Values have been set for this Def return the one relevant for \p Part. 9306 if (hasVectorValue(Def, Part)) 9307 return Data.PerPartOutput[Def][Part]; 9308 9309 if (!hasScalarValue(Def, {Part, 0})) { 9310 Value *IRV = Def->getLiveInIRValue(); 9311 Value *B = ILV->getBroadcastInstrs(IRV); 9312 set(Def, B, Part); 9313 return B; 9314 } 9315 9316 Value *ScalarValue = get(Def, {Part, 0}); 9317 // If we aren't vectorizing, we can just copy the scalar map values over 9318 // to the vector map. 9319 if (VF.isScalar()) { 9320 set(Def, ScalarValue, Part); 9321 return ScalarValue; 9322 } 9323 9324 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9325 bool IsUniform = RepR && RepR->isUniform(); 9326 9327 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9328 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9329 9330 // Set the insert point after the last scalarized instruction. This 9331 // ensures the insertelement sequence will directly follow the scalar 9332 // definitions. 9333 auto OldIP = Builder.saveIP(); 9334 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9335 Builder.SetInsertPoint(&*NewIP); 9336 9337 // However, if we are vectorizing, we need to construct the vector values. 9338 // If the value is known to be uniform after vectorization, we can just 9339 // broadcast the scalar value corresponding to lane zero for each unroll 9340 // iteration. Otherwise, we construct the vector values using 9341 // insertelement instructions. Since the resulting vectors are stored in 9342 // State, we will only generate the insertelements once. 9343 Value *VectorValue = nullptr; 9344 if (IsUniform) { 9345 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9346 set(Def, VectorValue, Part); 9347 } else { 9348 // Initialize packing with insertelements to start from undef. 
9349 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9350 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9351 set(Def, Undef, Part); 9352 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9353 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9354 VectorValue = get(Def, Part); 9355 } 9356 Builder.restoreIP(OldIP); 9357 return VectorValue; 9358 } 9359 9360 // Process the loop in the VPlan-native vectorization path. This path builds 9361 // VPlan upfront in the vectorization pipeline, which allows to apply 9362 // VPlan-to-VPlan transformations from the very beginning without modifying the 9363 // input LLVM IR. 9364 static bool processLoopInVPlanNativePath( 9365 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9366 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9367 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9368 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9369 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9370 9371 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9372 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9373 return false; 9374 } 9375 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9376 Function *F = L->getHeader()->getParent(); 9377 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9378 9379 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9380 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9381 9382 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9383 &Hints, IAI); 9384 // Use the planner for outer loop vectorization. 9385 // TODO: CM is not used at this point inside the planner. Turn CM into an 9386 // optional argument if we don't need it in the future. 9387 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9388 9389 // Get user vectorization factor. 9390 ElementCount UserVF = Hints.getWidth(); 9391 9392 // Plan how to best vectorize, return the best VF and its cost. 9393 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9394 9395 // If we are stress testing VPlan builds, do not attempt to generate vector 9396 // code. Masked vector code generation support will follow soon. 9397 // Also, do not attempt to vectorize if no vector code will be produced. 9398 if (VPlanBuildStressTest || EnableVPlanPredication || 9399 VectorizationFactor::Disabled() == VF) 9400 return false; 9401 9402 LVP.setBestPlan(VF.Width, 1); 9403 9404 { 9405 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9406 F->getParent()->getDataLayout()); 9407 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9408 &CM, BFI, PSI, Checks); 9409 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9410 << L->getHeader()->getParent()->getName() << "\"\n"); 9411 LVP.executePlan(LB, DT); 9412 } 9413 9414 // Mark the loop as already vectorized to avoid vectorizing again. 9415 Hints.setAlreadyVectorized(); 9416 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9417 return true; 9418 } 9419 9420 // Emit a remark if there are stores to floats that required a floating point 9421 // extension. If the vectorized loop was generated with floating point there 9422 // will be a performance penalty from the conversion overhead and the change in 9423 // the vector width. 
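// For example (illustrative C), in
//   float A[N]; double D;
//   for (int i = 0; i < N; ++i) A[i] = A[i] + D;
// each float element is extended to double and truncated back before the
// store, so the vectorized body mixes <VF x float> and <VF x double> values.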
9424 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9425 SmallVector<Instruction *, 4> Worklist; 9426 for (BasicBlock *BB : L->getBlocks()) { 9427 for (Instruction &Inst : *BB) { 9428 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9429 if (S->getValueOperand()->getType()->isFloatTy()) 9430 Worklist.push_back(S); 9431 } 9432 } 9433 } 9434 9435 // Traverse the floating point stores upwards searching, for floating point 9436 // conversions. 9437 SmallPtrSet<const Instruction *, 4> Visited; 9438 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9439 while (!Worklist.empty()) { 9440 auto *I = Worklist.pop_back_val(); 9441 if (!L->contains(I)) 9442 continue; 9443 if (!Visited.insert(I).second) 9444 continue; 9445 9446 // Emit a remark if the floating point store required a floating 9447 // point conversion. 9448 // TODO: More work could be done to identify the root cause such as a 9449 // constant or a function return type and point the user to it. 9450 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9451 ORE->emit([&]() { 9452 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9453 I->getDebugLoc(), L->getHeader()) 9454 << "floating point conversion changes vector width. " 9455 << "Mixed floating point precision requires an up/down " 9456 << "cast that will negatively impact performance."; 9457 }); 9458 9459 for (Use &Op : I->operands()) 9460 if (auto *OpI = dyn_cast<Instruction>(Op)) 9461 Worklist.push_back(OpI); 9462 } 9463 } 9464 9465 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9466 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9467 !EnableLoopInterleaving), 9468 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9469 !EnableLoopVectorization) {} 9470 9471 bool LoopVectorizePass::processLoop(Loop *L) { 9472 assert((EnableVPlanNativePath || L->isInnermost()) && 9473 "VPlan-native path is not enabled. Only process inner loops."); 9474 9475 #ifndef NDEBUG 9476 const std::string DebugLocStr = getDebugLocString(L); 9477 #endif /* NDEBUG */ 9478 9479 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9480 << L->getHeader()->getParent()->getName() << "\" from " 9481 << DebugLocStr << "\n"); 9482 9483 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9484 9485 LLVM_DEBUG( 9486 dbgs() << "LV: Loop hints:" 9487 << " force=" 9488 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9489 ? "disabled" 9490 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9491 ? "enabled" 9492 : "?")) 9493 << " width=" << Hints.getWidth() 9494 << " unroll=" << Hints.getInterleave() << "\n"); 9495 9496 // Function containing loop 9497 Function *F = L->getHeader()->getParent(); 9498 9499 // Looking at the diagnostic output is the only way to determine if a loop 9500 // was vectorized (other than looking at the IR or machine code), so it 9501 // is important to generate an optimization remark for each loop. Most of 9502 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9503 // generated as OptimizationRemark and OptimizationRemarkMissed are 9504 // less verbose reporting vectorized loops and unvectorized loops that may 9505 // benefit from vectorization, respectively. 9506 9507 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9508 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9509 return false; 9510 } 9511 9512 PredicatedScalarEvolution PSE(*SE, *L); 9513 9514 // Check if it is legal to vectorize the loop. 
9515 LoopVectorizationRequirements Requirements(*ORE); 9516 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9517 &Requirements, &Hints, DB, AC, BFI, PSI); 9518 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9519 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9520 Hints.emitRemarkWithHints(); 9521 return false; 9522 } 9523 9524 // Check the function attributes and profiles to find out if this function 9525 // should be optimized for size. 9526 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9527 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9528 9529 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9530 // here. They may require CFG and instruction level transformations before 9531 // even evaluating whether vectorization is profitable. Since we cannot modify 9532 // the incoming IR, we need to build VPlan upfront in the vectorization 9533 // pipeline. 9534 if (!L->isInnermost()) 9535 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9536 ORE, BFI, PSI, Hints); 9537 9538 assert(L->isInnermost() && "Inner loop expected."); 9539 9540 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9541 // count by optimizing for size, to minimize overheads. 9542 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9543 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9544 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9545 << "This loop is worth vectorizing only if no scalar " 9546 << "iteration overheads are incurred."); 9547 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9548 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9549 else { 9550 LLVM_DEBUG(dbgs() << "\n"); 9551 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9552 } 9553 } 9554 9555 // Check the function attributes to see if implicit floats are allowed. 9556 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9557 // an integer loop and the vector instructions selected are purely integer 9558 // vector instructions? 9559 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9560 reportVectorizationFailure( 9561 "Can't vectorize when the NoImplicitFloat attribute is used", 9562 "loop not vectorized due to NoImplicitFloat attribute", 9563 "NoImplicitFloat", ORE, L); 9564 Hints.emitRemarkWithHints(); 9565 return false; 9566 } 9567 9568 // Check if the target supports potentially unsafe FP vectorization. 9569 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9570 // for the target we're vectorizing for, to make sure none of the 9571 // additional fp-math flags can help. 9572 if (Hints.isPotentiallyUnsafe() && 9573 TTI->isFPVectorizationPotentiallyUnsafe()) { 9574 reportVectorizationFailure( 9575 "Potentially unsafe FP op prevents vectorization", 9576 "loop not vectorized due to unsafe FP support.", 9577 "UnsafeFP", ORE, L); 9578 Hints.emitRemarkWithHints(); 9579 return false; 9580 } 9581 9582 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9583 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9584 9585 // If an override option has been passed in for interleaved accesses, use it. 9586 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9587 UseInterleaved = EnableInterleavedMemAccesses; 9588 9589 // Analyze interleaved memory accesses. 
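  // E.g. (illustrative), the strided accesses in
  //   for (i = 0; i < N; i += 2) { sum += A[i] + A[i + 1]; }
  // form one interleave group of factor 2, which can be vectorized as a
  // single wide load plus shuffles.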
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get the user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
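    // Report both decisions as missed-optimization remarks and leave the loop
    // untouched.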
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to
    // not be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not beneficial to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar
        // epilogue to be vectorized by executing the plan (potentially with a
        // different factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
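        // Once both passes have run, the original loop has effectively become
        // a main vector loop, a vectorized epilogue loop running at the
        // smaller EpilogueVF, and a scalar remainder for any leftover
        // iterations.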
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that
        // is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize.
  // This is necessary as the act of vectorizing or partially unrolling a loop
  // creates new loops and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}