1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/Proposal/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include "llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/MemorySSA.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/Type.h" 120 #include "llvm/IR/Use.h" 121 #include "llvm/IR/User.h" 122 #include "llvm/IR/Value.h" 123 #include "llvm/IR/ValueHandle.h" 124 #include "llvm/IR/Verifier.h" 125 #include "llvm/InitializePasses.h" 126 #include "llvm/Pass.h" 127 #include "llvm/Support/Casting.h" 128 #include "llvm/Support/CommandLine.h" 129 #include "llvm/Support/Compiler.h" 130 #include "llvm/Support/Debug.h" 131 #include "llvm/Support/ErrorHandling.h" 132 #include "llvm/Support/InstructionCost.h" 133 #include "llvm/Support/MathExtras.h" 134 #include "llvm/Support/raw_ostream.h" 135 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 136 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 137 #include "llvm/Transforms/Utils/LoopSimplify.h" 138 #include "llvm/Transforms/Utils/LoopUtils.h" 139 #include "llvm/Transforms/Utils/LoopVersioning.h" 140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 141 #include "llvm/Transforms/Utils/SizeOpts.h" 142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 
143 #include <algorithm> 144 #include <cassert> 145 #include <cstdint> 146 #include <cstdlib> 147 #include <functional> 148 #include <iterator> 149 #include <limits> 150 #include <memory> 151 #include <string> 152 #include <tuple> 153 #include <utility> 154 155 using namespace llvm; 156 157 #define LV_NAME "loop-vectorize" 158 #define DEBUG_TYPE LV_NAME 159 160 #ifndef NDEBUG 161 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 162 #endif 163 164 /// @{ 165 /// Metadata attribute names 166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 167 const char LLVMLoopVectorizeFollowupVectorized[] = 168 "llvm.loop.vectorize.followup_vectorized"; 169 const char LLVMLoopVectorizeFollowupEpilogue[] = 170 "llvm.loop.vectorize.followup_epilogue"; 171 /// @} 172 173 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 176 177 static cl::opt<bool> EnableEpilogueVectorization( 178 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 179 cl::desc("Enable vectorization of epilogue loops.")); 180 181 static cl::opt<unsigned> EpilogueVectorizationForceVF( 182 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 183 cl::desc("When epilogue vectorization is enabled, and a value greater than " 184 "1 is specified, forces the given VF for all applicable epilogue " 185 "loops.")); 186 187 static cl::opt<unsigned> EpilogueVectorizationMinVF( 188 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 189 cl::desc("Only loops with vectorization factor equal to or larger than " 190 "the specified value are considered for epilogue vectorization.")); 191 192 /// Loops with a known constant trip count below this number are vectorized only 193 /// if no scalar iteration overheads are incurred. 194 static cl::opt<unsigned> TinyTripCountVectorThreshold( 195 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 196 cl::desc("Loops with a constant trip count that is smaller than this " 197 "value are vectorized only if no scalar iteration overheads " 198 "are incurred.")); 199 200 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 201 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 202 cl::desc("The maximum allowed number of runtime memory checks with a " 203 "vectorize(enable) pragma.")); 204 205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 206 // that predication is preferred, and this lists all options. I.e., the 207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 208 // and predicate the instructions accordingly. 
If tail-folding fails, there are 209 // different fallback strategies depending on these values: 210 namespace PreferPredicateTy { 211 enum Option { 212 ScalarEpilogue = 0, 213 PredicateElseScalarEpilogue, 214 PredicateOrDontVectorize 215 }; 216 } // namespace PreferPredicateTy 217 218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 219 "prefer-predicate-over-epilogue", 220 cl::init(PreferPredicateTy::ScalarEpilogue), 221 cl::Hidden, 222 cl::desc("Tail-folding and predication preferences over creating a scalar " 223 "epilogue loop."), 224 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 225 "scalar-epilogue", 226 "Don't tail-predicate loops, create scalar epilogue"), 227 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 228 "predicate-else-scalar-epilogue", 229 "prefer tail-folding, create scalar epilogue if tail " 230 "folding fails."), 231 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 232 "predicate-dont-vectorize", 233 "prefers tail-folding, don't attempt vectorization if " 234 "tail-folding fails."))); 235 236 static cl::opt<bool> MaximizeBandwidth( 237 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 238 cl::desc("Maximize bandwidth when selecting vectorization factor which " 239 "will be determined by the smallest type in loop.")); 240 241 static cl::opt<bool> EnableInterleavedMemAccesses( 242 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 243 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 244 245 /// An interleave-group may need masking if it resides in a block that needs 246 /// predication, or in order to mask away gaps. 247 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 248 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 249 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 250 251 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 252 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 253 cl::desc("We don't interleave loops with a estimated constant trip count " 254 "below this number")); 255 256 static cl::opt<unsigned> ForceTargetNumScalarRegs( 257 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 258 cl::desc("A flag that overrides the target's number of scalar registers.")); 259 260 static cl::opt<unsigned> ForceTargetNumVectorRegs( 261 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 262 cl::desc("A flag that overrides the target's number of vector registers.")); 263 264 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 265 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 266 cl::desc("A flag that overrides the target's max interleave factor for " 267 "scalar loops.")); 268 269 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 270 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 271 cl::desc("A flag that overrides the target's max interleave factor for " 272 "vectorized loops.")); 273 274 static cl::opt<unsigned> ForceTargetInstructionCost( 275 "force-target-instruction-cost", cl::init(0), cl::Hidden, 276 cl::desc("A flag that overrides the target's expected cost for " 277 "an instruction to a single constant value. 
Mostly " 278 "useful for getting consistent testing.")); 279 280 static cl::opt<bool> ForceTargetSupportsScalableVectors( 281 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 282 cl::desc( 283 "Pretend that scalable vectors are supported, even if the target does " 284 "not support them. This flag should only be used for testing.")); 285 286 static cl::opt<unsigned> SmallLoopCost( 287 "small-loop-cost", cl::init(20), cl::Hidden, 288 cl::desc( 289 "The cost of a loop that is considered 'small' by the interleaver.")); 290 291 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 292 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 293 cl::desc("Enable the use of the block frequency analysis to access PGO " 294 "heuristics minimizing code growth in cold regions and being more " 295 "aggressive in hot regions.")); 296 297 // Runtime interleave loops for load/store throughput. 298 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 299 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 300 cl::desc( 301 "Enable runtime interleaving until load/store ports are saturated")); 302 303 /// Interleave small loops with scalar reductions. 304 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 305 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 306 cl::desc("Enable interleaving for loops with small iteration counts that " 307 "contain scalar reductions to expose ILP.")); 308 309 /// The number of stores in a loop that are allowed to need predication. 310 static cl::opt<unsigned> NumberOfStoresToPredicate( 311 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 312 cl::desc("Max number of stores to be predicated behind an if.")); 313 314 static cl::opt<bool> EnableIndVarRegisterHeur( 315 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 316 cl::desc("Count the induction variable only once when interleaving")); 317 318 static cl::opt<bool> EnableCondStoresVectorization( 319 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 320 cl::desc("Enable if predication of stores during vectorization.")); 321 322 static cl::opt<unsigned> MaxNestedScalarReductionIC( 323 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 324 cl::desc("The maximum interleave count to use when interleaving a scalar " 325 "reduction in a nested loop.")); 326 327 static cl::opt<bool> 328 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 329 cl::Hidden, 330 cl::desc("Prefer in-loop vector reductions, " 331 "overriding the targets preference.")); 332 333 static cl::opt<bool> PreferPredicatedReductionSelect( 334 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 335 cl::desc( 336 "Prefer predicating a reduction operation over an after loop select.")); 337 338 cl::opt<bool> EnableVPlanNativePath( 339 "enable-vplan-native-path", cl::init(false), cl::Hidden, 340 cl::desc("Enable VPlan-native vectorization path with " 341 "support for outer loop vectorization.")); 342 343 // FIXME: Remove this switch once we have divergence analysis. Currently we 344 // assume divergent non-backedge branches when this switch is true. 345 cl::opt<bool> EnableVPlanPredication( 346 "enable-vplan-predication", cl::init(false), cl::Hidden, 347 cl::desc("Enable VPlan-native vectorization path predicator with " 348 "support for outer loop vectorization.")); 349 350 // This flag enables the stress testing of the VPlan H-CFG construction in the 351 // VPlan-native vectorization path. 
It must be used in conjuction with 352 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 353 // verification of the H-CFGs built. 354 static cl::opt<bool> VPlanBuildStressTest( 355 "vplan-build-stress-test", cl::init(false), cl::Hidden, 356 cl::desc( 357 "Build VPlan for every supported loop nest in the function and bail " 358 "out right after the build (stress test the VPlan H-CFG construction " 359 "in the VPlan-native vectorization path).")); 360 361 cl::opt<bool> llvm::EnableLoopInterleaving( 362 "interleave-loops", cl::init(true), cl::Hidden, 363 cl::desc("Enable loop interleaving in Loop vectorization passes")); 364 cl::opt<bool> llvm::EnableLoopVectorization( 365 "vectorize-loops", cl::init(true), cl::Hidden, 366 cl::desc("Run the Loop vectorization passes")); 367 368 cl::opt<bool> PrintVPlansInDotFormat( 369 "vplan-print-in-dot-format", cl::init(false), cl::Hidden, 370 cl::desc("Use dot format instead of plain text when dumping VPlans")); 371 372 /// A helper function that returns the type of loaded or stored value. 373 static Type *getMemInstValueType(Value *I) { 374 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 375 "Expected Load or Store instruction"); 376 if (auto *LI = dyn_cast<LoadInst>(I)) 377 return LI->getType(); 378 return cast<StoreInst>(I)->getValueOperand()->getType(); 379 } 380 381 /// A helper function that returns true if the given type is irregular. The 382 /// type is irregular if its allocated size doesn't equal the store size of an 383 /// element of the corresponding vector type. 384 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 385 // Determine if an array of N elements of type Ty is "bitcast compatible" 386 // with a <N x Ty> vector. 387 // This is only true if there is no padding between the array elements. 388 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 389 } 390 391 /// A helper function that returns the reciprocal of the block probability of 392 /// predicated blocks. If we return X, we are assuming the predicated block 393 /// will execute once for every X iterations of the loop header. 394 /// 395 /// TODO: We should use actual block probability here, if available. Currently, 396 /// we always assume predicated blocks have a 50% chance of executing. 397 static unsigned getReciprocalPredBlockProb() { return 2; } 398 399 /// A helper function that returns an integer or floating-point constant with 400 /// value C. 401 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 402 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 403 : ConstantFP::get(Ty, C); 404 } 405 406 /// Returns "best known" trip count for the specified loop \p L as defined by 407 /// the following procedure: 408 /// 1) Returns exact trip count if it is known. 409 /// 2) Returns expected trip count according to profile data if any. 410 /// 3) Returns upper bound estimate if it is known. 411 /// 4) Returns None if all of the above failed. 412 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 413 // Check if exact trip count is known. 414 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 415 return ExpectedTC; 416 417 // Check if there is an expected trip count available from profile data. 418 if (LoopVectorizeWithBlockFrequency) 419 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 420 return EstimatedTC; 421 422 // Check if upper bound estimate is known. 
423 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 424 return ExpectedTC; 425 426 return None; 427 } 428 429 // Forward declare GeneratedRTChecks. 430 class GeneratedRTChecks; 431 432 namespace llvm { 433 434 /// InnerLoopVectorizer vectorizes loops which contain only one basic 435 /// block to a specified vectorization factor (VF). 436 /// This class performs the widening of scalars into vectors, or multiple 437 /// scalars. This class also implements the following features: 438 /// * It inserts an epilogue loop for handling loops that don't have iteration 439 /// counts that are known to be a multiple of the vectorization factor. 440 /// * It handles the code generation for reduction variables. 441 /// * Scalarization (implementation using scalars) of un-vectorizable 442 /// instructions. 443 /// InnerLoopVectorizer does not perform any vectorization-legality 444 /// checks, and relies on the caller to check for the different legality 445 /// aspects. The InnerLoopVectorizer relies on the 446 /// LoopVectorizationLegality class to provide information about the induction 447 /// and reduction variables that were found to a given vectorization factor. 448 class InnerLoopVectorizer { 449 public: 450 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 451 LoopInfo *LI, DominatorTree *DT, 452 const TargetLibraryInfo *TLI, 453 const TargetTransformInfo *TTI, AssumptionCache *AC, 454 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 455 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 456 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 457 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 458 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 459 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 460 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 461 PSI(PSI), RTChecks(RTChecks) { 462 // Query this against the original loop and save it here because the profile 463 // of the original loop header may change as the transformation happens. 464 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 465 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 466 } 467 468 virtual ~InnerLoopVectorizer() = default; 469 470 /// Create a new empty loop that will contain vectorized instructions later 471 /// on, while the old loop will be used as the scalar remainder. Control flow 472 /// is generated around the vectorized (and scalar epilogue) loops consisting 473 /// of various checks and bypasses. Return the pre-header block of the new 474 /// loop. 475 /// In the case of epilogue vectorization, this function is overriden to 476 /// handle the more complex control flow around the loops. 477 virtual BasicBlock *createVectorizedLoopSkeleton(); 478 479 /// Widen a single instruction within the innermost loop. 480 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, 481 VPTransformState &State); 482 483 /// Widen a single call instruction within the innermost loop. 484 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, 485 VPTransformState &State); 486 487 /// Widen a single select instruction within the innermost loop. 488 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 489 bool InvariantCond, VPTransformState &State); 490 491 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 492 void fixVectorizedLoop(VPTransformState &State); 493 494 // Return true if any runtime check is added. 
495 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 496 497 /// A type for vectorized values in the new loop. Each value from the 498 /// original loop, when vectorized, is represented by UF vector values in the 499 /// new unrolled loop, where UF is the unroll factor. 500 using VectorParts = SmallVector<Value *, 2>; 501 502 /// Vectorize a single GetElementPtrInst based on information gathered and 503 /// decisions taken during planning. 504 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 505 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 506 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 507 508 /// Vectorize a single PHINode in a block. This method handles the induction 509 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 510 /// arbitrary length vectors. 511 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, 512 VPValue *StartV, VPValue *Def, 513 VPTransformState &State); 514 515 /// A helper function to scalarize a single Instruction in the innermost loop. 516 /// Generates a sequence of scalar instances for each lane between \p MinLane 517 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 518 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 519 /// Instr's operands. 520 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, 521 const VPIteration &Instance, bool IfPredicateInstr, 522 VPTransformState &State); 523 524 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 525 /// is provided, the integer induction variable will first be truncated to 526 /// the corresponding type. 527 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, 528 VPValue *Def, VPValue *CastDef, 529 VPTransformState &State); 530 531 /// Construct the vector value of a scalarized value \p V one lane at a time. 532 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 533 VPTransformState &State); 534 535 /// Try to vectorize interleaved access group \p Group with the base address 536 /// given in \p Addr, optionally masking the vector operations if \p 537 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 538 /// values in the vectorized loop. 539 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 540 ArrayRef<VPValue *> VPDefs, 541 VPTransformState &State, VPValue *Addr, 542 ArrayRef<VPValue *> StoredValues, 543 VPValue *BlockInMask = nullptr); 544 545 /// Vectorize Load and Store instructions with the base address given in \p 546 /// Addr, optionally masking the vector operations if \p BlockInMask is 547 /// non-null. Use \p State to translate given VPValues to IR values in the 548 /// vectorized loop. 549 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 550 VPValue *Def, VPValue *Addr, 551 VPValue *StoredValue, VPValue *BlockInMask); 552 553 /// Set the debug location in the builder using the debug location in 554 /// the instruction. 555 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 556 557 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 558 void fixNonInductionPHIs(VPTransformState &State); 559 560 /// Create a broadcast instruction. This method generates a broadcast 561 /// instruction (shuffle) for loop invariant values and for the induction 562 /// value. If this is the induction variable then we extend it to N, N+1, ... 
563 /// this is needed because each iteration in the loop corresponds to a SIMD 564 /// element. 565 virtual Value *getBroadcastInstrs(Value *V); 566 567 protected: 568 friend class LoopVectorizationPlanner; 569 570 /// A small list of PHINodes. 571 using PhiVector = SmallVector<PHINode *, 4>; 572 573 /// A type for scalarized values in the new loop. Each value from the 574 /// original loop, when scalarized, is represented by UF x VF scalar values 575 /// in the new unrolled loop, where UF is the unroll factor and VF is the 576 /// vectorization factor. 577 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 578 579 /// Set up the values of the IVs correctly when exiting the vector loop. 580 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 581 Value *CountRoundDown, Value *EndValue, 582 BasicBlock *MiddleBlock); 583 584 /// Create a new induction variable inside L. 585 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 586 Value *Step, Instruction *DL); 587 588 /// Handle all cross-iteration phis in the header. 589 void fixCrossIterationPHIs(VPTransformState &State); 590 591 /// Fix a first-order recurrence. This is the second phase of vectorizing 592 /// this phi node. 593 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State); 594 595 /// Fix a reduction cross-iteration phi. This is the second phase of 596 /// vectorizing this phi node. 597 void fixReduction(PHINode *Phi, VPTransformState &State); 598 599 /// Clear NSW/NUW flags from reduction instructions if necessary. 600 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 601 VPTransformState &State); 602 603 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 604 /// means we need to add the appropriate incoming value from the middle 605 /// block as exiting edges from the scalar epilogue loop (if present) are 606 /// already in place, and we exit the vector loop exclusively to the middle 607 /// block. 608 void fixLCSSAPHIs(VPTransformState &State); 609 610 /// Iteratively sink the scalarized operands of a predicated instruction into 611 /// the block that was created for it. 612 void sinkScalarOperands(Instruction *PredInst); 613 614 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 615 /// represented as. 616 void truncateToMinimalBitwidths(VPTransformState &State); 617 618 /// This function adds 619 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 620 /// to each vector element of Val. The sequence starts at StartIndex. 621 /// \p Opcode is relevant for FP induction variable. 622 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 623 Instruction::BinaryOps Opcode = 624 Instruction::BinaryOpsEnd); 625 626 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 627 /// variable on which to base the steps, \p Step is the size of the step, and 628 /// \p EntryVal is the value from the original loop that maps to the steps. 629 /// Note that \p EntryVal doesn't have to be an induction variable - it 630 /// can also be a truncate instruction. 631 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 632 const InductionDescriptor &ID, VPValue *Def, 633 VPValue *CastDef, VPTransformState &State); 634 635 /// Create a vector induction phi node based on an existing scalar one. \p 636 /// EntryVal is the value from the original loop that maps to the vector phi 637 /// node, and \p Step is the loop-invariant step. 
If \p EntryVal is a 638 /// truncate instruction, instead of widening the original IV, we widen a 639 /// version of the IV truncated to \p EntryVal's type. 640 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 641 Value *Step, Value *Start, 642 Instruction *EntryVal, VPValue *Def, 643 VPValue *CastDef, 644 VPTransformState &State); 645 646 /// Returns true if an instruction \p I should be scalarized instead of 647 /// vectorized for the chosen vectorization factor. 648 bool shouldScalarizeInstruction(Instruction *I) const; 649 650 /// Returns true if we should generate a scalar version of \p IV. 651 bool needsScalarInduction(Instruction *IV) const; 652 653 /// If there is a cast involved in the induction variable \p ID, which should 654 /// be ignored in the vectorized loop body, this function records the 655 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 656 /// cast. We had already proved that the casted Phi is equal to the uncasted 657 /// Phi in the vectorized loop (under a runtime guard), and therefore 658 /// there is no need to vectorize the cast - the same value can be used in the 659 /// vector loop for both the Phi and the cast. 660 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 661 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 662 /// 663 /// \p EntryVal is the value from the original loop that maps to the vector 664 /// phi node and is used to distinguish what is the IV currently being 665 /// processed - original one (if \p EntryVal is a phi corresponding to the 666 /// original IV) or the "newly-created" one based on the proof mentioned above 667 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 668 /// latter case \p EntryVal is a TruncInst and we must not record anything for 669 /// that IV, but it's error-prone to expect callers of this routine to care 670 /// about that, hence this explicit parameter. 671 void recordVectorLoopValueForInductionCast( 672 const InductionDescriptor &ID, const Instruction *EntryVal, 673 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, 674 unsigned Part, unsigned Lane = UINT_MAX); 675 676 /// Generate a shuffle sequence that will reverse the vector Vec. 677 virtual Value *reverseVector(Value *Vec); 678 679 /// Returns (and creates if needed) the original loop trip count. 680 Value *getOrCreateTripCount(Loop *NewLoop); 681 682 /// Returns (and creates if needed) the trip count of the widened loop. 683 Value *getOrCreateVectorTripCount(Loop *NewLoop); 684 685 /// Returns a bitcasted value to the requested vector type. 686 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 687 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 688 const DataLayout &DL); 689 690 /// Emit a bypass check to see if the vector trip count is zero, including if 691 /// it overflows. 692 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 693 694 /// Emit a bypass check to see if all of the SCEV assumptions we've 695 /// had to make are correct. Returns the block containing the checks or 696 /// nullptr if no checks have been added. 697 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); 698 699 /// Emit bypass checks to check any memory assumptions we may have made. 700 /// Returns the block containing the checks or nullptr if no checks have been 701 /// added. 
702 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 703 704 /// Compute the transformed value of Index at offset StartValue using step 705 /// StepValue. 706 /// For integer induction, returns StartValue + Index * StepValue. 707 /// For pointer induction, returns StartValue[Index * StepValue]. 708 /// FIXME: The newly created binary instructions should contain nsw/nuw 709 /// flags, which can be found from the original scalar operations. 710 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 711 const DataLayout &DL, 712 const InductionDescriptor &ID) const; 713 714 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 715 /// vector loop preheader, middle block and scalar preheader. Also 716 /// allocate a loop object for the new vector loop and return it. 717 Loop *createVectorLoopSkeleton(StringRef Prefix); 718 719 /// Create new phi nodes for the induction variables to resume iteration count 720 /// in the scalar epilogue, from where the vectorized loop left off (given by 721 /// \p VectorTripCount). 722 /// In cases where the loop skeleton is more complicated (eg. epilogue 723 /// vectorization) and the resume values can come from an additional bypass 724 /// block, the \p AdditionalBypass pair provides information about the bypass 725 /// block and the end value on the edge from bypass to this loop. 726 void createInductionResumeValues( 727 Loop *L, Value *VectorTripCount, 728 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 729 730 /// Complete the loop skeleton by adding debug MDs, creating appropriate 731 /// conditional branches in the middle block, preparing the builder and 732 /// running the verifier. Take in the vector loop \p L as argument, and return 733 /// the preheader of the completed vector loop. 734 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 735 736 /// Add additional metadata to \p To that was not present on \p Orig. 737 /// 738 /// Currently this is used to add the noalias annotations based on the 739 /// inserted memchecks. Use this for instructions that are *cloned* into the 740 /// vector loop. 741 void addNewMetadata(Instruction *To, const Instruction *Orig); 742 743 /// Add metadata from one instruction to another. 744 /// 745 /// This includes both the original MDs from \p From and additional ones (\see 746 /// addNewMetadata). Use this for *newly created* instructions in the vector 747 /// loop. 748 void addMetadata(Instruction *To, Instruction *From); 749 750 /// Similar to the previous function but it adds the metadata to a 751 /// vector of instructions. 752 void addMetadata(ArrayRef<Value *> To, Instruction *From); 753 754 /// Allow subclasses to override and print debug traces before/after vplan 755 /// execution, when trace information is requested. 756 virtual void printDebugTracesAtStart(){}; 757 virtual void printDebugTracesAtEnd(){}; 758 759 /// The original loop. 760 Loop *OrigLoop; 761 762 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 763 /// dynamic knowledge to simplify SCEV expressions and converts them to a 764 /// more usable form. 765 PredicatedScalarEvolution &PSE; 766 767 /// Loop Info. 768 LoopInfo *LI; 769 770 /// Dominator Tree. 771 DominatorTree *DT; 772 773 /// Alias Analysis. 774 AAResults *AA; 775 776 /// Target Library Info. 777 const TargetLibraryInfo *TLI; 778 779 /// Target Transform Info. 780 const TargetTransformInfo *TTI; 781 782 /// Assumption Cache. 
783 AssumptionCache *AC; 784 785 /// Interface to emit optimization remarks. 786 OptimizationRemarkEmitter *ORE; 787 788 /// LoopVersioning. It's only set up (non-null) if memchecks were 789 /// used. 790 /// 791 /// This is currently only used to add no-alias metadata based on the 792 /// memchecks. The actually versioning is performed manually. 793 std::unique_ptr<LoopVersioning> LVer; 794 795 /// The vectorization SIMD factor to use. Each vector will have this many 796 /// vector elements. 797 ElementCount VF; 798 799 /// The vectorization unroll factor to use. Each scalar is vectorized to this 800 /// many different vector instructions. 801 unsigned UF; 802 803 /// The builder that we use 804 IRBuilder<> Builder; 805 806 // --- Vectorization state --- 807 808 /// The vector-loop preheader. 809 BasicBlock *LoopVectorPreHeader; 810 811 /// The scalar-loop preheader. 812 BasicBlock *LoopScalarPreHeader; 813 814 /// Middle Block between the vector and the scalar. 815 BasicBlock *LoopMiddleBlock; 816 817 /// The (unique) ExitBlock of the scalar loop. Note that 818 /// there can be multiple exiting edges reaching this block. 819 BasicBlock *LoopExitBlock; 820 821 /// The vector loop body. 822 BasicBlock *LoopVectorBody; 823 824 /// The scalar loop body. 825 BasicBlock *LoopScalarBody; 826 827 /// A list of all bypass blocks. The first block is the entry of the loop. 828 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 829 830 /// The new Induction variable which was added to the new block. 831 PHINode *Induction = nullptr; 832 833 /// The induction variable of the old basic block. 834 PHINode *OldInduction = nullptr; 835 836 /// Store instructions that were predicated. 837 SmallVector<Instruction *, 4> PredicatedInstructions; 838 839 /// Trip count of the original loop. 840 Value *TripCount = nullptr; 841 842 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 843 Value *VectorTripCount = nullptr; 844 845 /// The legality analysis. 846 LoopVectorizationLegality *Legal; 847 848 /// The profitablity analysis. 849 LoopVectorizationCostModel *Cost; 850 851 // Record whether runtime checks are added. 852 bool AddedSafetyChecks = false; 853 854 // Holds the end values for each induction variable. We save the end values 855 // so we can later fix-up the external users of the induction variables. 856 DenseMap<PHINode *, Value *> IVEndValues; 857 858 // Vector of original scalar PHIs whose corresponding widened PHIs need to be 859 // fixed up at the end of vector code generation. 860 SmallVector<PHINode *, 8> OrigPHIsToFix; 861 862 /// BFI and PSI are used to check for profile guided size optimizations. 863 BlockFrequencyInfo *BFI; 864 ProfileSummaryInfo *PSI; 865 866 // Whether this loop should be optimized for size based on profile guided size 867 // optimizatios. 868 bool OptForSizeBasedOnProfile; 869 870 /// Structure to hold information about generated runtime checks, responsible 871 /// for cleaning the checks, if vectorization turns out unprofitable. 
872 GeneratedRTChecks &RTChecks; 873 }; 874 875 class InnerLoopUnroller : public InnerLoopVectorizer { 876 public: 877 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 878 LoopInfo *LI, DominatorTree *DT, 879 const TargetLibraryInfo *TLI, 880 const TargetTransformInfo *TTI, AssumptionCache *AC, 881 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 882 LoopVectorizationLegality *LVL, 883 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 884 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 885 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 886 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 887 BFI, PSI, Check) {} 888 889 private: 890 Value *getBroadcastInstrs(Value *V) override; 891 Value *getStepVector(Value *Val, int StartIdx, Value *Step, 892 Instruction::BinaryOps Opcode = 893 Instruction::BinaryOpsEnd) override; 894 Value *reverseVector(Value *Vec) override; 895 }; 896 897 /// Encapsulate information regarding vectorization of a loop and its epilogue. 898 /// This information is meant to be updated and used across two stages of 899 /// epilogue vectorization. 900 struct EpilogueLoopVectorizationInfo { 901 ElementCount MainLoopVF = ElementCount::getFixed(0); 902 unsigned MainLoopUF = 0; 903 ElementCount EpilogueVF = ElementCount::getFixed(0); 904 unsigned EpilogueUF = 0; 905 BasicBlock *MainLoopIterationCountCheck = nullptr; 906 BasicBlock *EpilogueIterationCountCheck = nullptr; 907 BasicBlock *SCEVSafetyCheck = nullptr; 908 BasicBlock *MemSafetyCheck = nullptr; 909 Value *TripCount = nullptr; 910 Value *VectorTripCount = nullptr; 911 912 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, 913 unsigned EUF) 914 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), 915 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { 916 assert(EUF == 1 && 917 "A high UF for the epilogue loop is likely not beneficial."); 918 } 919 }; 920 921 /// An extension of the inner loop vectorizer that creates a skeleton for a 922 /// vectorized loop that has its epilogue (residual) also vectorized. 923 /// The idea is to run the vplan on a given loop twice, firstly to setup the 924 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 925 /// from the first step and vectorize the epilogue. This is achieved by 926 /// deriving two concrete strategy classes from this base class and invoking 927 /// them in succession from the loop vectorizer planner. 928 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 929 public: 930 InnerLoopAndEpilogueVectorizer( 931 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 932 DominatorTree *DT, const TargetLibraryInfo *TLI, 933 const TargetTransformInfo *TTI, AssumptionCache *AC, 934 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 935 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 936 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 937 GeneratedRTChecks &Checks) 938 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 939 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 940 Checks), 941 EPI(EPI) {} 942 943 // Override this function to handle the more complex control flow around the 944 // three loops. 
945 BasicBlock *createVectorizedLoopSkeleton() final override { 946 return createEpilogueVectorizedLoopSkeleton(); 947 } 948 949 /// The interface for creating a vectorized skeleton using one of two 950 /// different strategies, each corresponding to one execution of the vplan 951 /// as described above. 952 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; 953 954 /// Holds and updates state information required to vectorize the main loop 955 /// and its epilogue in two separate passes. This setup helps us avoid 956 /// regenerating and recomputing runtime safety checks. It also helps us to 957 /// shorten the iteration-count-check path length for the cases where the 958 /// iteration count of the loop is so small that the main vector loop is 959 /// completely skipped. 960 EpilogueLoopVectorizationInfo &EPI; 961 }; 962 963 /// A specialized derived class of inner loop vectorizer that performs 964 /// vectorization of *main* loops in the process of vectorizing loops and their 965 /// epilogues. 966 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 967 public: 968 EpilogueVectorizerMainLoop( 969 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 970 DominatorTree *DT, const TargetLibraryInfo *TLI, 971 const TargetTransformInfo *TTI, AssumptionCache *AC, 972 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 973 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 974 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 975 GeneratedRTChecks &Check) 976 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 977 EPI, LVL, CM, BFI, PSI, Check) {} 978 /// Implements the interface for creating a vectorized skeleton using the 979 /// *main loop* strategy (ie the first pass of vplan execution). 980 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 981 982 protected: 983 /// Emits an iteration count bypass check once for the main loop (when \p 984 /// ForEpilogue is false) and once for the epilogue loop (when \p 985 /// ForEpilogue is true). 986 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, 987 bool ForEpilogue); 988 void printDebugTracesAtStart() override; 989 void printDebugTracesAtEnd() override; 990 }; 991 992 // A specialized derived class of inner loop vectorizer that performs 993 // vectorization of *epilogue* loops in the process of vectorizing loops and 994 // their epilogues. 995 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 996 public: 997 EpilogueVectorizerEpilogueLoop( 998 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 999 DominatorTree *DT, const TargetLibraryInfo *TLI, 1000 const TargetTransformInfo *TTI, AssumptionCache *AC, 1001 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 1002 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 1003 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 1004 GeneratedRTChecks &Checks) 1005 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1006 EPI, LVL, CM, BFI, PSI, Checks) {} 1007 /// Implements the interface for creating a vectorized skeleton using the 1008 /// *epilogue loop* strategy (ie the second pass of vplan execution). 
1009 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 1010 1011 protected: 1012 /// Emits an iteration count bypass check after the main vector loop has 1013 /// finished to see if there are any iterations left to execute by either 1014 /// the vector epilogue or the scalar epilogue. 1015 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, 1016 BasicBlock *Bypass, 1017 BasicBlock *Insert); 1018 void printDebugTracesAtStart() override; 1019 void printDebugTracesAtEnd() override; 1020 }; 1021 } // end namespace llvm 1022 1023 /// Look for a meaningful debug location on the instruction or it's 1024 /// operands. 1025 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 1026 if (!I) 1027 return I; 1028 1029 DebugLoc Empty; 1030 if (I->getDebugLoc() != Empty) 1031 return I; 1032 1033 for (Use &Op : I->operands()) { 1034 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 1035 if (OpInst->getDebugLoc() != Empty) 1036 return OpInst; 1037 } 1038 1039 return I; 1040 } 1041 1042 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 1043 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 1044 const DILocation *DIL = Inst->getDebugLoc(); 1045 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 1046 !isa<DbgInfoIntrinsic>(Inst)) { 1047 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1048 auto NewDIL = 1049 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 1050 if (NewDIL) 1051 B.SetCurrentDebugLocation(NewDIL.getValue()); 1052 else 1053 LLVM_DEBUG(dbgs() 1054 << "Failed to create new discriminator: " 1055 << DIL->getFilename() << " Line: " << DIL->getLine()); 1056 } 1057 else 1058 B.SetCurrentDebugLocation(DIL); 1059 } else 1060 B.SetCurrentDebugLocation(DebugLoc()); 1061 } 1062 1063 /// Write a record \p DebugMsg about vectorization failure to the debug 1064 /// output stream. If \p I is passed, it is an instruction that prevents 1065 /// vectorization. 1066 #ifndef NDEBUG 1067 static void debugVectorizationFailure(const StringRef DebugMsg, 1068 Instruction *I) { 1069 dbgs() << "LV: Not vectorizing: " << DebugMsg; 1070 if (I != nullptr) 1071 dbgs() << " " << *I; 1072 else 1073 dbgs() << '.'; 1074 dbgs() << '\n'; 1075 } 1076 #endif 1077 1078 /// Create an analysis remark that explains why vectorization failed 1079 /// 1080 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1081 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1082 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 1083 /// the location of the remark. \return the remark object that can be 1084 /// streamed to. 1085 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1086 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1087 Value *CodeRegion = TheLoop->getHeader(); 1088 DebugLoc DL = TheLoop->getStartLoc(); 1089 1090 if (I) { 1091 CodeRegion = I->getParent(); 1092 // If there is no debug location attached to the instruction, revert back to 1093 // using the loop's. 1094 if (I->getDebugLoc()) 1095 DL = I->getDebugLoc(); 1096 } 1097 1098 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); 1099 R << "loop not vectorized: "; 1100 return R; 1101 } 1102 1103 /// Return a value for Step multiplied by VF. 
1104 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1105 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1106 Constant *StepVal = ConstantInt::get( 1107 Step->getType(), 1108 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1109 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1110 } 1111 1112 namespace llvm { 1113 1114 /// Return the runtime value for VF. 1115 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { 1116 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1117 return VF.isScalable() ? B.CreateVScale(EC) : EC; 1118 } 1119 1120 void reportVectorizationFailure(const StringRef DebugMsg, 1121 const StringRef OREMsg, const StringRef ORETag, 1122 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { 1123 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); 1124 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1125 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), 1126 ORETag, TheLoop, I) << OREMsg); 1127 } 1128 1129 } // end namespace llvm 1130 1131 #ifndef NDEBUG 1132 /// \return string containing a file name and a line # for the given loop. 1133 static std::string getDebugLocString(const Loop *L) { 1134 std::string Result; 1135 if (L) { 1136 raw_string_ostream OS(Result); 1137 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1138 LoopDbgLoc.print(OS); 1139 else 1140 // Just print the module name. 1141 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1142 OS.flush(); 1143 } 1144 return Result; 1145 } 1146 #endif 1147 1148 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1149 const Instruction *Orig) { 1150 // If the loop was versioned with memchecks, add the corresponding no-alias 1151 // metadata. 1152 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1153 LVer->annotateInstWithNoAlias(To, Orig); 1154 } 1155 1156 void InnerLoopVectorizer::addMetadata(Instruction *To, 1157 Instruction *From) { 1158 propagateMetadata(To, From); 1159 addNewMetadata(To, From); 1160 } 1161 1162 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1163 Instruction *From) { 1164 for (Value *V : To) { 1165 if (Instruction *I = dyn_cast<Instruction>(V)) 1166 addMetadata(I, From); 1167 } 1168 } 1169 1170 namespace llvm { 1171 1172 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1173 // lowered. 1174 enum ScalarEpilogueLowering { 1175 1176 // The default: allowing scalar epilogues. 1177 CM_ScalarEpilogueAllowed, 1178 1179 // Vectorization with OptForSize: don't allow epilogues. 1180 CM_ScalarEpilogueNotAllowedOptSize, 1181 1182 // A special case of vectorisation with OptForSize: loops with a very small 1183 // trip count are considered for vectorization under OptForSize, thereby 1184 // making sure the cost of their loop body is dominant, free of runtime 1185 // guards and scalar iteration overheads. 1186 CM_ScalarEpilogueNotAllowedLowTripLoop, 1187 1188 // Loop hint predicate indicating an epilogue is undesired. 1189 CM_ScalarEpilogueNotNeededUsePredicate, 1190 1191 // Directive indicating we must either tail fold or not vectorize 1192 CM_ScalarEpilogueNotAllowedUsePredicate 1193 }; 1194 1195 /// LoopVectorizationCostModel - estimates the expected speedups due to 1196 /// vectorization. 1197 /// In many cases vectorization is not profitable. This can happen because of 1198 /// a number of reasons. 
In this class we mainly attempt to predict the 1199 /// expected speedup/slowdowns due to the supported instruction set. We use the 1200 /// TargetTransformInfo to query the different backends for the cost of 1201 /// different operations. 1202 class LoopVectorizationCostModel { 1203 public: 1204 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1205 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1206 LoopVectorizationLegality *Legal, 1207 const TargetTransformInfo &TTI, 1208 const TargetLibraryInfo *TLI, DemandedBits *DB, 1209 AssumptionCache *AC, 1210 OptimizationRemarkEmitter *ORE, const Function *F, 1211 const LoopVectorizeHints *Hints, 1212 InterleavedAccessInfo &IAI) 1213 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1214 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1215 Hints(Hints), InterleaveInfo(IAI) {} 1216 1217 /// \return An upper bound for the vectorization factor, or None if 1218 /// vectorization and interleaving should be avoided up front. 1219 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); 1220 1221 /// \return True if runtime checks are required for vectorization, and false 1222 /// otherwise. 1223 bool runtimeChecksRequired(); 1224 1225 /// \return The most profitable vectorization factor and the cost of that VF. 1226 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO 1227 /// then this vectorization factor will be selected if vectorization is 1228 /// possible. 1229 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); 1230 VectorizationFactor 1231 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1232 const LoopVectorizationPlanner &LVP); 1233 1234 /// Setup cost-based decisions for user vectorization factor. 1235 void selectUserVectorizationFactor(ElementCount UserVF) { 1236 collectUniformsAndScalars(UserVF); 1237 collectInstsToScalarize(UserVF); 1238 } 1239 1240 /// \return The size (in bits) of the smallest and widest types in the code 1241 /// that needs to be vectorized. We ignore values that remain scalar such as 1242 /// 64 bit loop indices. 1243 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1244 1245 /// \return The desired interleave count. 1246 /// If interleave count has been specified by metadata it will be returned. 1247 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1248 /// are the selected vectorization factor and the cost of the selected VF. 1249 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1250 1251 /// Memory access instruction may be vectorized in more than one way. 1252 /// Form of instruction after vectorization depends on cost. 1253 /// This function takes cost-based decisions for Load/Store instructions 1254 /// and collects them in a map. This decisions map is used for building 1255 /// the lists of loop-uniform and loop-scalar instructions. 1256 /// The calculated cost is saved with widening decision in order to 1257 /// avoid redundant calculations. 1258 void setCostBasedWideningDecision(ElementCount VF); 1259 1260 /// A struct that represents some properties of the register usage 1261 /// of a loop. 1262 struct RegisterUsage { 1263 /// Holds the number of loop invariant values that are used in the loop. 1264 /// The key is ClassID of target-provided register class. 1265 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1266 /// Holds the maximum number of concurrent live intervals in the loop. 
1267 /// The key is ClassID of target-provided register class. 1268 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1269 }; 1270 1271 /// \return Returns information about the register usages of the loop for the 1272 /// given vectorization factors. 1273 SmallVector<RegisterUsage, 8> 1274 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1275 1276 /// Collect values we want to ignore in the cost model. 1277 void collectValuesToIgnore(); 1278 1279 /// Split reductions into those that happen in the loop, and those that happen 1280 /// outside. In loop reductions are collected into InLoopReductionChains. 1281 void collectInLoopReductions(); 1282 1283 /// \returns The smallest bitwidth each instruction can be represented with. 1284 /// The vector equivalents of these instructions should be truncated to this 1285 /// type. 1286 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1287 return MinBWs; 1288 } 1289 1290 /// \returns True if it is more profitable to scalarize instruction \p I for 1291 /// vectorization factor \p VF. 1292 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1293 assert(VF.isVector() && 1294 "Profitable to scalarize relevant only for VF > 1."); 1295 1296 // Cost model is not run in the VPlan-native path - return conservative 1297 // result until this changes. 1298 if (EnableVPlanNativePath) 1299 return false; 1300 1301 auto Scalars = InstsToScalarize.find(VF); 1302 assert(Scalars != InstsToScalarize.end() && 1303 "VF not yet analyzed for scalarization profitability"); 1304 return Scalars->second.find(I) != Scalars->second.end(); 1305 } 1306 1307 /// Returns true if \p I is known to be uniform after vectorization. 1308 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1309 if (VF.isScalar()) 1310 return true; 1311 1312 // Cost model is not run in the VPlan-native path - return conservative 1313 // result until this changes. 1314 if (EnableVPlanNativePath) 1315 return false; 1316 1317 auto UniformsPerVF = Uniforms.find(VF); 1318 assert(UniformsPerVF != Uniforms.end() && 1319 "VF not yet analyzed for uniformity"); 1320 return UniformsPerVF->second.count(I); 1321 } 1322 1323 /// Returns true if \p I is known to be scalar after vectorization. 1324 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1325 if (VF.isScalar()) 1326 return true; 1327 1328 // Cost model is not run in the VPlan-native path - return conservative 1329 // result until this changes. 1330 if (EnableVPlanNativePath) 1331 return false; 1332 1333 auto ScalarsPerVF = Scalars.find(VF); 1334 assert(ScalarsPerVF != Scalars.end() && 1335 "Scalar values are not calculated for VF"); 1336 return ScalarsPerVF->second.count(I); 1337 } 1338 1339 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1340 /// for vectorization factor \p VF. 1341 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1342 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1343 !isProfitableToScalarize(I, VF) && 1344 !isScalarAfterVectorization(I, VF); 1345 } 1346 1347 /// Decision that was taken during cost calculation for memory instruction. 1348 enum InstWidening { 1349 CM_Unknown, 1350 CM_Widen, // For consecutive accesses with stride +1. 1351 CM_Widen_Reverse, // For consecutive accesses with stride -1. 
1352     CM_Interleave,
1353     CM_GatherScatter,
1354     CM_Scalarize
1355   };
1356 
1357   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1358   /// instruction \p I and vector width \p VF.
1359   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1360                            InstructionCost Cost) {
1361     assert(VF.isVector() && "Expected VF >=2");
1362     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1363   }
1364 
1365   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1366   /// interleaving group \p Grp and vector width \p VF.
1367   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1368                            ElementCount VF, InstWidening W,
1369                            InstructionCost Cost) {
1370     assert(VF.isVector() && "Expected VF >=2");
1371     /// Broadcast this decision to all instructions inside the group.
1372     /// But the cost will be assigned to one instruction only.
1373     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1374       if (auto *I = Grp->getMember(i)) {
1375         if (Grp->getInsertPos() == I)
1376           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1377         else
1378           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1379       }
1380     }
1381   }
1382 
1383   /// Return the cost model decision for the given instruction \p I and vector
1384   /// width \p VF. Return CM_Unknown if this instruction did not pass
1385   /// through the cost modeling.
1386   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1387     assert(VF.isVector() && "Expected VF to be a vector VF");
1388     // Cost model is not run in the VPlan-native path - return conservative
1389     // result until this changes.
1390     if (EnableVPlanNativePath)
1391       return CM_GatherScatter;
1392 
1393     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1394     auto Itr = WideningDecisions.find(InstOnVF);
1395     if (Itr == WideningDecisions.end())
1396       return CM_Unknown;
1397     return Itr->second.first;
1398   }
1399 
1400   /// Return the vectorization cost for the given instruction \p I and vector
1401   /// width \p VF.
1402   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1403     assert(VF.isVector() && "Expected VF >=2");
1404     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1405     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1406            "The cost is not calculated");
1407     return WideningDecisions[InstOnVF].second;
1408   }
1409 
1410   /// Returns true if instruction \p I is an optimizable truncate whose operand
1411   /// is an induction variable. Such a truncate will be removed by adding a new
1412   /// induction variable with the destination type.
1413   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1414     // If the instruction is not a truncate, return false.
1415     auto *Trunc = dyn_cast<TruncInst>(I);
1416     if (!Trunc)
1417       return false;
1418 
1419     // Get the source and destination types of the truncate.
1420     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1421     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1422 
1423     // If the truncate is free for the given types, return false. Replacing a
1424     // free truncate with an induction variable would add an induction variable
1425     // update instruction to each iteration of the loop. We exclude from this
1426     // check the primary induction variable since it will need an update
1427     // instruction regardless.
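    // E.g. (illustrative only): for a 64-bit loop counter %i, a use such as
    //   %t = trunc i64 %i to i32
    // can be served by introducing a separate i32 induction variable instead
    // of truncating the wide IV in every iteration; if the target makes the
    // truncate free anyway, the extra induction variable is not worth it.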
1428     Value *Op = Trunc->getOperand(0);
1429     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1430       return false;
1431 
1432     // If the truncated value is not an induction variable, return false.
1433     return Legal->isInductionPhi(Op);
1434   }
1435 
1436   /// Collects the instructions to scalarize for each predicated instruction in
1437   /// the loop.
1438   void collectInstsToScalarize(ElementCount VF);
1439 
1440   /// Collect Uniform and Scalar values for the given \p VF.
1441   /// The sets depend on the CM decisions for Load/Store instructions
1442   /// that may be vectorized as interleaved, gather-scatter or scalarized accesses.
1443   void collectUniformsAndScalars(ElementCount VF) {
1444     // Do the analysis once.
1445     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1446       return;
1447     setCostBasedWideningDecision(VF);
1448     collectLoopUniforms(VF);
1449     collectLoopScalars(VF);
1450   }
1451 
1452   /// Returns true if the target machine supports masked store operation
1453   /// for the given \p DataType and kind of access to \p Ptr.
1454   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1455     return Legal->isConsecutivePtr(Ptr) &&
1456            TTI.isLegalMaskedStore(DataType, Alignment);
1457   }
1458 
1459   /// Returns true if the target machine supports masked load operation
1460   /// for the given \p DataType and kind of access to \p Ptr.
1461   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1462     return Legal->isConsecutivePtr(Ptr) &&
1463            TTI.isLegalMaskedLoad(DataType, Alignment);
1464   }
1465 
1466   /// Returns true if the target machine supports masked scatter operation
1467   /// for the given \p DataType.
1468   bool isLegalMaskedScatter(Type *DataType, Align Alignment) const {
1469     return TTI.isLegalMaskedScatter(DataType, Alignment);
1470   }
1471 
1472   /// Returns true if the target machine supports masked gather operation
1473   /// for the given \p DataType.
1474   bool isLegalMaskedGather(Type *DataType, Align Alignment) const {
1475     return TTI.isLegalMaskedGather(DataType, Alignment);
1476   }
1477 
1478   /// Returns true if the target machine can represent \p V as a masked gather
1479   /// or scatter operation.
1480   bool isLegalGatherOrScatter(Value *V) {
1481     bool LI = isa<LoadInst>(V);
1482     bool SI = isa<StoreInst>(V);
1483     if (!LI && !SI)
1484       return false;
1485     auto *Ty = getMemInstValueType(V);
1486     Align Align = getLoadStoreAlignment(V);
1487     return (LI && isLegalMaskedGather(Ty, Align)) ||
1488            (SI && isLegalMaskedScatter(Ty, Align));
1489   }
1490 
1491   /// Returns true if the target machine supports all of the reduction
1492   /// variables found for the given VF.
1493   bool canVectorizeReductions(ElementCount VF) {
1494     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1495       RecurrenceDescriptor RdxDesc = Reduction.second;
1496       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1497     }));
1498   }
1499 
1500   /// Returns true if \p I is an instruction that will be scalarized with
1501   /// predication. Such instructions include conditional stores and
1502   /// instructions that may divide by zero.
1503   /// If a non-zero VF has been calculated, we check if \p I will be scalarized
1504   /// with predication for that VF.
1505   bool
1506   isScalarWithPredication(Instruction *I,
1507                           ElementCount VF = ElementCount::getFixed(1)) const;
1508 
1509   // Returns true if \p I is an instruction that will be predicated either
1510   // through scalar predication or masked load/store or masked gather/scatter.
1511 // Superset of instructions that return true for isScalarWithPredication. 1512 bool isPredicatedInst(Instruction *I) { 1513 if (!blockNeedsPredication(I->getParent())) 1514 return false; 1515 // Loads and stores that need some form of masked operation are predicated 1516 // instructions. 1517 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1518 return Legal->isMaskRequired(I); 1519 return isScalarWithPredication(I); 1520 } 1521 1522 /// Returns true if \p I is a memory instruction with consecutive memory 1523 /// access that can be widened. 1524 bool 1525 memoryInstructionCanBeWidened(Instruction *I, 1526 ElementCount VF = ElementCount::getFixed(1)); 1527 1528 /// Returns true if \p I is a memory instruction in an interleaved-group 1529 /// of memory accesses that can be vectorized with wide vector loads/stores 1530 /// and shuffles. 1531 bool 1532 interleavedAccessCanBeWidened(Instruction *I, 1533 ElementCount VF = ElementCount::getFixed(1)); 1534 1535 /// Check if \p Instr belongs to any interleaved access group. 1536 bool isAccessInterleaved(Instruction *Instr) { 1537 return InterleaveInfo.isInterleaved(Instr); 1538 } 1539 1540 /// Get the interleaved access group that \p Instr belongs to. 1541 const InterleaveGroup<Instruction> * 1542 getInterleavedAccessGroup(Instruction *Instr) { 1543 return InterleaveInfo.getInterleaveGroup(Instr); 1544 } 1545 1546 /// Returns true if we're required to use a scalar epilogue for at least 1547 /// the final iteration of the original loop. 1548 bool requiresScalarEpilogue() const { 1549 if (!isScalarEpilogueAllowed()) 1550 return false; 1551 // If we might exit from anywhere but the latch, must run the exiting 1552 // iteration in scalar form. 1553 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1554 return true; 1555 return InterleaveInfo.requiresScalarEpilogue(); 1556 } 1557 1558 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1559 /// loop hint annotation. 1560 bool isScalarEpilogueAllowed() const { 1561 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1562 } 1563 1564 /// Returns true if all loop blocks should be masked to fold tail loop. 1565 bool foldTailByMasking() const { return FoldTailByMasking; } 1566 1567 bool blockNeedsPredication(BasicBlock *BB) const { 1568 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1569 } 1570 1571 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1572 /// nodes to the chain of instructions representing the reductions. Uses a 1573 /// MapVector to ensure deterministic iteration order. 1574 using ReductionChainMap = 1575 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1576 1577 /// Return the chain of instructions representing an inloop reduction. 1578 const ReductionChainMap &getInLoopReductionChains() const { 1579 return InLoopReductionChains; 1580 } 1581 1582 /// Returns true if the Phi is part of an inloop reduction. 1583 bool isInLoopReduction(PHINode *Phi) const { 1584 return InLoopReductionChains.count(Phi); 1585 } 1586 1587 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1588 /// with factor VF. Return the cost of the instruction, including 1589 /// scalarization overhead if it's needed. 1590 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1591 1592 /// Estimate cost of a call instruction CI if it were vectorized with factor 1593 /// VF. Return the cost of the instruction, including scalarization overhead 1594 /// if it's needed. 
The flag NeedToScalarize shows if the call needs to be 1595 /// scalarized - 1596 /// i.e. either vector version isn't available, or is too expensive. 1597 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1598 bool &NeedToScalarize) const; 1599 1600 /// Invalidates decisions already taken by the cost model. 1601 void invalidateCostModelingDecisions() { 1602 WideningDecisions.clear(); 1603 Uniforms.clear(); 1604 Scalars.clear(); 1605 } 1606 1607 private: 1608 unsigned NumPredStores = 0; 1609 1610 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1611 /// than zero. One is returned if vectorization should best be avoided due 1612 /// to cost. 1613 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1614 ElementCount UserVF); 1615 1616 /// The vectorization cost is a combination of the cost itself and a boolean 1617 /// indicating whether any of the contributing operations will actually 1618 /// operate on 1619 /// vector values after type legalization in the backend. If this latter value 1620 /// is 1621 /// false, then all operations will be scalarized (i.e. no vectorization has 1622 /// actually taken place). 1623 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1624 1625 /// Returns the expected execution cost. The unit of the cost does 1626 /// not matter because we use the 'cost' units to compare different 1627 /// vector widths. The cost that is returned is *not* normalized by 1628 /// the factor width. 1629 VectorizationCostTy expectedCost(ElementCount VF); 1630 1631 /// Returns the execution time cost of an instruction for a given vector 1632 /// width. Vector width of one means scalar. 1633 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1634 1635 /// The cost-computation logic from getInstructionCost which provides 1636 /// the vector type as an output parameter. 1637 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1638 Type *&VectorTy); 1639 1640 /// Return the cost of instructions in an inloop reduction pattern, if I is 1641 /// part of that pattern. 1642 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, 1643 Type *VectorTy, 1644 TTI::TargetCostKind CostKind); 1645 1646 /// Calculate vectorization cost of memory instruction \p I. 1647 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1648 1649 /// The cost computation for scalarized memory instruction. 1650 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1651 1652 /// The cost computation for interleaving group of memory instructions. 1653 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1654 1655 /// The cost computation for Gather/Scatter instruction. 1656 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1657 1658 /// The cost computation for widening instruction \p I with consecutive 1659 /// memory access. 1660 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1661 1662 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1663 /// Load: scalar load + broadcast. 1664 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1665 /// element) 1666 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1667 1668 /// Estimate the overhead of scalarizing an instruction. This is a 1669 /// convenience wrapper for the type-based getScalarizationOverhead API. 
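  /// E.g. (illustrative only): scalarizing a non-uniform instruction at a
  /// fixed VF of 4 pays roughly for extracting each operand lane from its
  /// vector and for inserting the four scalar results back into a vector,
  /// as reported by TTI's insert/extract cost queries.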
1670   InstructionCost getScalarizationOverhead(Instruction *I,
1671                                            ElementCount VF) const;
1672 
1673   /// Returns whether the instruction is a load or store and will be emitted
1674   /// as a vector operation.
1675   bool isConsecutiveLoadOrStore(Instruction *I);
1676 
1677   /// Returns true if an artificially high cost for emulated masked memrefs
1678   /// should be used.
1679   bool useEmulatedMaskMemRefHack(Instruction *I);
1680 
1681   /// Map of scalar integer values to the smallest bitwidth they can be legally
1682   /// represented as. The vector equivalents of these values should be truncated
1683   /// to this type.
1684   MapVector<Instruction *, uint64_t> MinBWs;
1685 
1686   /// A type representing the costs for instructions if they were to be
1687   /// scalarized rather than vectorized. The entries are Instruction-Cost
1688   /// pairs.
1689   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1690 
1691   /// A set containing all BasicBlocks that are known to be present after
1692   /// vectorization as predicated blocks.
1693   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1694 
1695   /// Records whether it is allowed to have the original scalar loop execute at
1696   /// least once. This may be needed as a fallback loop in case runtime
1697   /// aliasing/dependence checks fail, or to handle the tail/remainder
1698   /// iterations when the trip count is unknown or doesn't divide by the VF,
1699   /// or as a peel-loop to handle gaps in interleave-groups.
1700   /// Under optsize and when the trip count is very small we don't allow any
1701   /// iterations to execute in the scalar loop.
1702   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1703 
1704   /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1705   bool FoldTailByMasking = false;
1706 
1707   /// A map holding scalar costs for different vectorization factors. The
1708   /// presence of a cost for an instruction in the mapping indicates that the
1709   /// instruction will be scalarized when vectorizing with the associated
1710   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1711   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1712 
1713   /// Holds the instructions known to be uniform after vectorization.
1714   /// The data is collected per VF.
1715   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1716 
1717   /// Holds the instructions known to be scalar after vectorization.
1718   /// The data is collected per VF.
1719   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1720 
1721   /// Holds the instructions (address computations) that are forced to be
1722   /// scalarized.
1723   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1724 
1725   /// PHINodes of the reductions that should be expanded in-loop along with
1726   /// their associated chains of reduction operations, in program order from top
1727   /// (PHI) to bottom.
1728   ReductionChainMap InLoopReductionChains;
1729 
1730   /// A Map of inloop reduction operations and their immediate chain operand.
1731   /// FIXME: This can be removed once reductions can be costed correctly in
1732   /// vplan. This was added to allow quick lookup of the inloop operations,
1733   /// without having to loop through InLoopReductionChains.
1734   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1735 
1736   /// Returns the expected difference in cost from scalarizing the expression
1737   /// feeding a predicated instruction \p PredInst.
The instructions to 1738 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1739 /// non-negative return value implies the expression will be scalarized. 1740 /// Currently, only single-use chains are considered for scalarization. 1741 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1742 ElementCount VF); 1743 1744 /// Collect the instructions that are uniform after vectorization. An 1745 /// instruction is uniform if we represent it with a single scalar value in 1746 /// the vectorized loop corresponding to each vector iteration. Examples of 1747 /// uniform instructions include pointer operands of consecutive or 1748 /// interleaved memory accesses. Note that although uniformity implies an 1749 /// instruction will be scalar, the reverse is not true. In general, a 1750 /// scalarized instruction will be represented by VF scalar values in the 1751 /// vectorized loop, each corresponding to an iteration of the original 1752 /// scalar loop. 1753 void collectLoopUniforms(ElementCount VF); 1754 1755 /// Collect the instructions that are scalar after vectorization. An 1756 /// instruction is scalar if it is known to be uniform or will be scalarized 1757 /// during vectorization. Non-uniform scalarized instructions will be 1758 /// represented by VF values in the vectorized loop, each corresponding to an 1759 /// iteration of the original scalar loop. 1760 void collectLoopScalars(ElementCount VF); 1761 1762 /// Keeps cost model vectorization decision and cost for instructions. 1763 /// Right now it is used for memory instructions only. 1764 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1765 std::pair<InstWidening, InstructionCost>>; 1766 1767 DecisionList WideningDecisions; 1768 1769 /// Returns true if \p V is expected to be vectorized and it needs to be 1770 /// extracted. 1771 bool needsExtract(Value *V, ElementCount VF) const { 1772 Instruction *I = dyn_cast<Instruction>(V); 1773 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1774 TheLoop->isLoopInvariant(I)) 1775 return false; 1776 1777 // Assume we can vectorize V (and hence we need extraction) if the 1778 // scalars are not computed yet. This can happen, because it is called 1779 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1780 // the scalars are collected. That should be a safe assumption in most 1781 // cases, because we check if the operands have vectorizable types 1782 // beforehand in LoopVectorizationLegality. 1783 return Scalars.find(VF) == Scalars.end() || 1784 !isScalarAfterVectorization(I, VF); 1785 }; 1786 1787 /// Returns a range containing only operands needing to be extracted. 1788 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1789 ElementCount VF) const { 1790 return SmallVector<Value *, 4>(make_filter_range( 1791 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1792 } 1793 1794 /// Determines if we have the infrastructure to vectorize loop \p L and its 1795 /// epilogue, assuming the main loop is vectorized by \p VF. 1796 bool isCandidateForEpilogueVectorization(const Loop &L, 1797 const ElementCount VF) const; 1798 1799 /// Returns true if epilogue vectorization is considered profitable, and 1800 /// false otherwise. 1801 /// \p VF is the vectorization factor chosen for the original loop. 1802 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1803 1804 public: 1805 /// The loop that we evaluate. 
1806   Loop *TheLoop;
1807 
1808   /// Predicated scalar evolution analysis.
1809   PredicatedScalarEvolution &PSE;
1810 
1811   /// Loop Info analysis.
1812   LoopInfo *LI;
1813 
1814   /// Vectorization legality.
1815   LoopVectorizationLegality *Legal;
1816 
1817   /// Vector target information.
1818   const TargetTransformInfo &TTI;
1819 
1820   /// Target Library Info.
1821   const TargetLibraryInfo *TLI;
1822 
1823   /// Demanded bits analysis.
1824   DemandedBits *DB;
1825 
1826   /// Assumption cache.
1827   AssumptionCache *AC;
1828 
1829   /// Interface to emit optimization remarks.
1830   OptimizationRemarkEmitter *ORE;
1831 
1832   const Function *TheFunction;
1833 
1834   /// Loop Vectorize Hint.
1835   const LoopVectorizeHints *Hints;
1836 
1837   /// The interleave access information contains groups of interleaved accesses
1838   /// that have the same stride and are close to each other.
1839   InterleavedAccessInfo &InterleaveInfo;
1840 
1841   /// Values to ignore in the cost model.
1842   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1843 
1844   /// Values to ignore in the cost model when VF > 1.
1845   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1846 
1847   /// Profitable vector factors.
1848   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1849 };
1850 } // end namespace llvm
1851 
1852 /// Helper struct to manage generating runtime checks for vectorization.
1853 ///
1854 /// The runtime checks are created up-front in temporary blocks to allow better
1855 /// estimation of their cost, and are un-linked from the existing IR. After
1856 /// deciding to vectorize, the checks are moved back. If deciding not to
1857 /// vectorize, the temporary blocks are completely removed.
1858 class GeneratedRTChecks {
1859   /// Basic block which contains the generated SCEV checks, if any.
1860   BasicBlock *SCEVCheckBlock = nullptr;
1861 
1862   /// The value representing the result of the generated SCEV checks. If it is
1863   /// nullptr, either no SCEV checks have been generated or they have been used.
1864   Value *SCEVCheckCond = nullptr;
1865 
1866   /// Basic block which contains the generated memory runtime checks, if any.
1867   BasicBlock *MemCheckBlock = nullptr;
1868 
1869   /// The value representing the result of the generated memory runtime checks.
1870   /// If it is nullptr, either no memory runtime checks have been generated or
1871   /// they have been used.
1872   Instruction *MemRuntimeCheckCond = nullptr;
1873 
1874   DominatorTree *DT;
1875   LoopInfo *LI;
1876 
1877   SCEVExpander SCEVExp;
1878   SCEVExpander MemCheckExp;
1879 
1880 public:
1881   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1882                     const DataLayout &DL)
1883       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1884         MemCheckExp(SE, DL, "scev.check") {}
1885 
1886   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1887   /// accurately estimate the cost of the runtime checks. The blocks are
1888   /// un-linked from the IR and are added back during vector code generation. If
1889   /// there is no vector code generation, the check blocks are removed
1890   /// completely.
1891   void Create(Loop *L, const LoopAccessInfo &LAI,
1892               const SCEVUnionPredicate &UnionPred) {
1893 
1894     BasicBlock *LoopHeader = L->getHeader();
1895     BasicBlock *Preheader = L->getLoopPreheader();
1896 
1897     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1898     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1899     // may be used by SCEVExpander. The blocks will be un-linked from their
1900     // predecessors and removed from LI & DT at the end of the function.
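    // Rough end-state sketch (illustrative): the preheader ends up branching
    // straight to the loop header again, while each generated check block is
    // left without predecessors and terminated by an 'unreachable' until it
    // is either re-linked by emitSCEVChecks/emitMemRuntimeChecks or deleted
    // by the destructor.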
1901 if (!UnionPred.isAlwaysTrue()) { 1902 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1903 nullptr, "vector.scevcheck"); 1904 1905 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1906 &UnionPred, SCEVCheckBlock->getTerminator()); 1907 } 1908 1909 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1910 if (RtPtrChecking.Need) { 1911 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1912 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1913 "vector.memcheck"); 1914 1915 std::tie(std::ignore, MemRuntimeCheckCond) = 1916 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1917 RtPtrChecking.getChecks(), MemCheckExp); 1918 assert(MemRuntimeCheckCond && 1919 "no RT checks generated although RtPtrChecking " 1920 "claimed checks are required"); 1921 } 1922 1923 if (!MemCheckBlock && !SCEVCheckBlock) 1924 return; 1925 1926 // Unhook the temporary block with the checks, update various places 1927 // accordingly. 1928 if (SCEVCheckBlock) 1929 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1930 if (MemCheckBlock) 1931 MemCheckBlock->replaceAllUsesWith(Preheader); 1932 1933 if (SCEVCheckBlock) { 1934 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1935 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1936 Preheader->getTerminator()->eraseFromParent(); 1937 } 1938 if (MemCheckBlock) { 1939 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1940 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1941 Preheader->getTerminator()->eraseFromParent(); 1942 } 1943 1944 DT->changeImmediateDominator(LoopHeader, Preheader); 1945 if (MemCheckBlock) { 1946 DT->eraseNode(MemCheckBlock); 1947 LI->removeBlock(MemCheckBlock); 1948 } 1949 if (SCEVCheckBlock) { 1950 DT->eraseNode(SCEVCheckBlock); 1951 LI->removeBlock(SCEVCheckBlock); 1952 } 1953 } 1954 1955 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1956 /// unused. 1957 ~GeneratedRTChecks() { 1958 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1959 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1960 if (!SCEVCheckCond) 1961 SCEVCleaner.markResultUsed(); 1962 1963 if (!MemRuntimeCheckCond) 1964 MemCheckCleaner.markResultUsed(); 1965 1966 if (MemRuntimeCheckCond) { 1967 auto &SE = *MemCheckExp.getSE(); 1968 // Memory runtime check generation creates compares that use expanded 1969 // values. Remove them before running the SCEVExpanderCleaners. 1970 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1971 if (MemCheckExp.isInsertedInstruction(&I)) 1972 continue; 1973 SE.forgetValue(&I); 1974 SE.eraseValueFromMap(&I); 1975 I.eraseFromParent(); 1976 } 1977 } 1978 MemCheckCleaner.cleanup(); 1979 SCEVCleaner.cleanup(); 1980 1981 if (SCEVCheckCond) 1982 SCEVCheckBlock->eraseFromParent(); 1983 if (MemRuntimeCheckCond) 1984 MemCheckBlock->eraseFromParent(); 1985 } 1986 1987 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1988 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1989 /// depending on the generated condition. 
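  /// E.g. (sketch): the single edge  pred -> vector.ph  becomes
  ///   pred -> vector.scevcheck -> { bypass (scalar loop), vector.ph }.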
1990 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 1991 BasicBlock *LoopVectorPreHeader, 1992 BasicBlock *LoopExitBlock) { 1993 if (!SCEVCheckCond) 1994 return nullptr; 1995 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 1996 if (C->isZero()) 1997 return nullptr; 1998 1999 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2000 2001 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2002 // Create new preheader for vector loop. 2003 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2004 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2005 2006 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2007 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2008 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2009 SCEVCheckBlock); 2010 2011 DT->addNewBlock(SCEVCheckBlock, Pred); 2012 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2013 2014 ReplaceInstWithInst( 2015 SCEVCheckBlock->getTerminator(), 2016 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2017 // Mark the check as used, to prevent it from being removed during cleanup. 2018 SCEVCheckCond = nullptr; 2019 return SCEVCheckBlock; 2020 } 2021 2022 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2023 /// the branches to branch to the vector preheader or \p Bypass, depending on 2024 /// the generated condition. 2025 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2026 BasicBlock *LoopVectorPreHeader) { 2027 // Check if we generated code that checks in runtime if arrays overlap. 2028 if (!MemRuntimeCheckCond) 2029 return nullptr; 2030 2031 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2032 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2033 MemCheckBlock); 2034 2035 DT->addNewBlock(MemCheckBlock, Pred); 2036 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2037 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2038 2039 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2040 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2041 2042 ReplaceInstWithInst( 2043 MemCheckBlock->getTerminator(), 2044 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2045 MemCheckBlock->getTerminator()->setDebugLoc( 2046 Pred->getTerminator()->getDebugLoc()); 2047 2048 // Mark the check as used, to prevent it from being removed during cleanup. 2049 MemRuntimeCheckCond = nullptr; 2050 return MemCheckBlock; 2051 } 2052 }; 2053 2054 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2055 // vectorization. The loop needs to be annotated with #pragma omp simd 2056 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2057 // vector length information is not provided, vectorization is not considered 2058 // explicit. Interleave hints are not allowed either. These limitations will be 2059 // relaxed in the future. 2060 // Please, note that we are currently forced to abuse the pragma 'clang 2061 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2062 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2063 // provides *explicit vectorization hints* (LV can bypass legal checks and 2064 // assume that vectorization is legal). However, both hints are implemented 2065 // using the same metadata (llvm.loop.vectorize, processed by 2066 // LoopVectorizeHints). This will be fixed in the future when the native IR 2067 // representation for pragma 'omp simd' is introduced. 
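// E.g. (illustrative): an outer loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// (or '#pragma omp simd simdlen(4)') is treated as explicitly vectorized
// here; the same pragmas without a vector length are not.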
2068 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2069 OptimizationRemarkEmitter *ORE) { 2070 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2071 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2072 2073 // Only outer loops with an explicit vectorization hint are supported. 2074 // Unannotated outer loops are ignored. 2075 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2076 return false; 2077 2078 Function *Fn = OuterLp->getHeader()->getParent(); 2079 if (!Hints.allowVectorization(Fn, OuterLp, 2080 true /*VectorizeOnlyWhenForced*/)) { 2081 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2082 return false; 2083 } 2084 2085 if (Hints.getInterleave() > 1) { 2086 // TODO: Interleave support is future work. 2087 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2088 "outer loops.\n"); 2089 Hints.emitRemarkWithHints(); 2090 return false; 2091 } 2092 2093 return true; 2094 } 2095 2096 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2097 OptimizationRemarkEmitter *ORE, 2098 SmallVectorImpl<Loop *> &V) { 2099 // Collect inner loops and outer loops without irreducible control flow. For 2100 // now, only collect outer loops that have explicit vectorization hints. If we 2101 // are stress testing the VPlan H-CFG construction, we collect the outermost 2102 // loop of every loop nest. 2103 if (L.isInnermost() || VPlanBuildStressTest || 2104 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2105 LoopBlocksRPO RPOT(&L); 2106 RPOT.perform(LI); 2107 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2108 V.push_back(&L); 2109 // TODO: Collect inner loops inside marked outer loops in case 2110 // vectorization fails for the outer loop. Do not invoke 2111 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2112 // already known to be reducible. We can use an inherited attribute for 2113 // that. 2114 return; 2115 } 2116 } 2117 for (Loop *InnerL : L) 2118 collectSupportedLoops(*InnerL, LI, ORE, V); 2119 } 2120 2121 namespace { 2122 2123 /// The LoopVectorize Pass. 2124 struct LoopVectorize : public FunctionPass { 2125 /// Pass identification, replacement for typeid 2126 static char ID; 2127 2128 LoopVectorizePass Impl; 2129 2130 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2131 bool VectorizeOnlyWhenForced = false) 2132 : FunctionPass(ID), 2133 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2134 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2135 } 2136 2137 bool runOnFunction(Function &F) override { 2138 if (skipFunction(F)) 2139 return false; 2140 2141 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2142 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2143 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2144 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2145 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2146 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2147 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2148 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2149 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2150 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2151 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2152 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2153 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2154 2155 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2156 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2157 2158 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2159 GetLAA, *ORE, PSI).MadeAnyChange; 2160 } 2161 2162 void getAnalysisUsage(AnalysisUsage &AU) const override { 2163 AU.addRequired<AssumptionCacheTracker>(); 2164 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2165 AU.addRequired<DominatorTreeWrapperPass>(); 2166 AU.addRequired<LoopInfoWrapperPass>(); 2167 AU.addRequired<ScalarEvolutionWrapperPass>(); 2168 AU.addRequired<TargetTransformInfoWrapperPass>(); 2169 AU.addRequired<AAResultsWrapperPass>(); 2170 AU.addRequired<LoopAccessLegacyAnalysis>(); 2171 AU.addRequired<DemandedBitsWrapperPass>(); 2172 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2173 AU.addRequired<InjectTLIMappingsLegacy>(); 2174 2175 // We currently do not preserve loopinfo/dominator analyses with outer loop 2176 // vectorization. Until this is addressed, mark these analyses as preserved 2177 // only for non-VPlan-native path. 2178 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2179 if (!EnableVPlanNativePath) { 2180 AU.addPreserved<LoopInfoWrapperPass>(); 2181 AU.addPreserved<DominatorTreeWrapperPass>(); 2182 } 2183 2184 AU.addPreserved<BasicAAWrapperPass>(); 2185 AU.addPreserved<GlobalsAAWrapperPass>(); 2186 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2187 } 2188 }; 2189 2190 } // end anonymous namespace 2191 2192 //===----------------------------------------------------------------------===// 2193 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2194 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2195 //===----------------------------------------------------------------------===// 2196 2197 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2198 // We need to place the broadcast of invariant variables outside the loop, 2199 // but only if it's proven safe to do so. Else, broadcast will be inside 2200 // vector loop body. 2201 Instruction *Instr = dyn_cast<Instruction>(V); 2202 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2203 (!Instr || 2204 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2205 // Place the code for broadcasting invariant variables in the new preheader. 2206 IRBuilder<>::InsertPointGuard Guard(Builder); 2207 if (SafeToHoist) 2208 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2209 2210 // Broadcast the scalar into all locations in the vector. 
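  // E.g. (illustrative IR only, for a fixed VF of 4 and an i32 scalar %v),
  // the splat below expands to roughly:
  //   %ins   = insertelement <4 x i32> poison, i32 %v, i32 0
  //   %splat = shufflevector <4 x i32> %ins, <4 x i32> poison,
  //            <4 x i32> zeroinitializer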
2211 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2212 2213 return Shuf; 2214 } 2215 2216 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2217 const InductionDescriptor &II, Value *Step, Value *Start, 2218 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2219 VPTransformState &State) { 2220 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2221 "Expected either an induction phi-node or a truncate of it!"); 2222 2223 // Construct the initial value of the vector IV in the vector loop preheader 2224 auto CurrIP = Builder.saveIP(); 2225 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2226 if (isa<TruncInst>(EntryVal)) { 2227 assert(Start->getType()->isIntegerTy() && 2228 "Truncation requires an integer type"); 2229 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2230 Step = Builder.CreateTrunc(Step, TruncType); 2231 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2232 } 2233 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2234 Value *SteppedStart = 2235 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2236 2237 // We create vector phi nodes for both integer and floating-point induction 2238 // variables. Here, we determine the kind of arithmetic we will perform. 2239 Instruction::BinaryOps AddOp; 2240 Instruction::BinaryOps MulOp; 2241 if (Step->getType()->isIntegerTy()) { 2242 AddOp = Instruction::Add; 2243 MulOp = Instruction::Mul; 2244 } else { 2245 AddOp = II.getInductionOpcode(); 2246 MulOp = Instruction::FMul; 2247 } 2248 2249 // Multiply the vectorization factor by the step using integer or 2250 // floating-point arithmetic as appropriate. 2251 Type *StepType = Step->getType(); 2252 if (Step->getType()->isFloatingPointTy()) 2253 StepType = IntegerType::get(StepType->getContext(), 2254 StepType->getScalarSizeInBits()); 2255 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2256 if (Step->getType()->isFloatingPointTy()) 2257 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2258 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2259 2260 // Create a vector splat to use in the induction update. 2261 // 2262 // FIXME: If the step is non-constant, we create the vector splat with 2263 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2264 // handle a constant vector splat. 2265 Value *SplatVF = isa<Constant>(Mul) 2266 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2267 : Builder.CreateVectorSplat(VF, Mul); 2268 Builder.restoreIP(CurrIP); 2269 2270 // We may need to add the step a number of times, depending on the unroll 2271 // factor. The last of those goes into the PHI. 2272 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2273 &*LoopVectorBody->getFirstInsertionPt()); 2274 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2275 Instruction *LastInduction = VecInd; 2276 for (unsigned Part = 0; Part < UF; ++Part) { 2277 State.set(Def, LastInduction, Part); 2278 2279 if (isa<TruncInst>(EntryVal)) 2280 addMetadata(LastInduction, EntryVal); 2281 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2282 State, Part); 2283 2284 LastInduction = cast<Instruction>( 2285 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2286 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2287 } 2288 2289 // Move the last step to the end of the latch block. This ensures consistent 2290 // placement of all induction updates. 
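  // E.g. (illustrative, integer IV starting at 0 with step 1, fixed VF=4,
  // UF=2), the widened IV looks roughly like:
  //   %vec.ind      = phi <4 x i32> [ <0, 1, 2, 3>, %vector.ph ],
  //                                 [ %vec.ind.next, %vector.latch ]
  //   %step.add     = add <4 x i32> %vec.ind, <4, 4, 4, 4>    ; part 1
  //   %vec.ind.next = add <4 x i32> %step.add, <4, 4, 4, 4>   ; feeds the phi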
2291 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2292 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2293 auto *ICmp = cast<Instruction>(Br->getCondition()); 2294 LastInduction->moveBefore(ICmp); 2295 LastInduction->setName("vec.ind.next"); 2296 2297 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2298 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2299 } 2300 2301 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2302 return Cost->isScalarAfterVectorization(I, VF) || 2303 Cost->isProfitableToScalarize(I, VF); 2304 } 2305 2306 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2307 if (shouldScalarizeInstruction(IV)) 2308 return true; 2309 auto isScalarInst = [&](User *U) -> bool { 2310 auto *I = cast<Instruction>(U); 2311 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2312 }; 2313 return llvm::any_of(IV->users(), isScalarInst); 2314 } 2315 2316 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2317 const InductionDescriptor &ID, const Instruction *EntryVal, 2318 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2319 unsigned Part, unsigned Lane) { 2320 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2321 "Expected either an induction phi-node or a truncate of it!"); 2322 2323 // This induction variable is not the phi from the original loop but the 2324 // newly-created IV based on the proof that casted Phi is equal to the 2325 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2326 // re-uses the same InductionDescriptor that original IV uses but we don't 2327 // have to do any recording in this case - that is done when original IV is 2328 // processed. 2329 if (isa<TruncInst>(EntryVal)) 2330 return; 2331 2332 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2333 if (Casts.empty()) 2334 return; 2335 // Only the first Cast instruction in the Casts vector is of interest. 2336 // The rest of the Casts (if exist) have no uses outside the 2337 // induction update chain itself. 2338 if (Lane < UINT_MAX) 2339 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2340 else 2341 State.set(CastDef, VectorLoopVal, Part); 2342 } 2343 2344 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2345 TruncInst *Trunc, VPValue *Def, 2346 VPValue *CastDef, 2347 VPTransformState &State) { 2348 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2349 "Primary induction variable must have an integer type"); 2350 2351 auto II = Legal->getInductionVars().find(IV); 2352 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2353 2354 auto ID = II->second; 2355 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2356 2357 // The value from the original loop to which we are mapping the new induction 2358 // variable. 2359 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2360 2361 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2362 2363 // Generate code for the induction step. 
Note that induction steps are 2364 // required to be loop-invariant 2365 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2366 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2367 "Induction step should be loop invariant"); 2368 if (PSE.getSE()->isSCEVable(IV->getType())) { 2369 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2370 return Exp.expandCodeFor(Step, Step->getType(), 2371 LoopVectorPreHeader->getTerminator()); 2372 } 2373 return cast<SCEVUnknown>(Step)->getValue(); 2374 }; 2375 2376 // The scalar value to broadcast. This is derived from the canonical 2377 // induction variable. If a truncation type is given, truncate the canonical 2378 // induction variable and step. Otherwise, derive these values from the 2379 // induction descriptor. 2380 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2381 Value *ScalarIV = Induction; 2382 if (IV != OldInduction) { 2383 ScalarIV = IV->getType()->isIntegerTy() 2384 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2385 : Builder.CreateCast(Instruction::SIToFP, Induction, 2386 IV->getType()); 2387 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2388 ScalarIV->setName("offset.idx"); 2389 } 2390 if (Trunc) { 2391 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2392 assert(Step->getType()->isIntegerTy() && 2393 "Truncation requires an integer step"); 2394 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2395 Step = Builder.CreateTrunc(Step, TruncType); 2396 } 2397 return ScalarIV; 2398 }; 2399 2400 // Create the vector values from the scalar IV, in the absence of creating a 2401 // vector IV. 2402 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2403 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2404 for (unsigned Part = 0; Part < UF; ++Part) { 2405 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2406 Value *EntryPart = 2407 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2408 ID.getInductionOpcode()); 2409 State.set(Def, EntryPart, Part); 2410 if (Trunc) 2411 addMetadata(EntryPart, Trunc); 2412 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2413 State, Part); 2414 } 2415 }; 2416 2417 // Fast-math-flags propagate from the original induction instruction. 2418 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2419 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2420 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2421 2422 // Now do the actual transformations, and start with creating the step value. 2423 Value *Step = CreateStepValue(ID.getStep()); 2424 if (VF.isZero() || VF.isScalar()) { 2425 Value *ScalarIV = CreateScalarIV(Step); 2426 CreateSplatIV(ScalarIV, Step); 2427 return; 2428 } 2429 2430 // Determine if we want a scalar version of the induction variable. This is 2431 // true if the induction variable itself is not widened, or if it has at 2432 // least one user in the loop that is not widened. 2433 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2434 if (!NeedsScalarIV) { 2435 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2436 State); 2437 return; 2438 } 2439 2440 // Try to create a new independent vector induction variable. If we can't 2441 // create the phi node, we will splat the scalar induction variable in each 2442 // loop iteration. 
2443 if (!shouldScalarizeInstruction(EntryVal)) { 2444 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2445 State); 2446 Value *ScalarIV = CreateScalarIV(Step); 2447 // Create scalar steps that can be used by instructions we will later 2448 // scalarize. Note that the addition of the scalar steps will not increase 2449 // the number of instructions in the loop in the common case prior to 2450 // InstCombine. We will be trading one vector extract for each scalar step. 2451 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2452 return; 2453 } 2454 2455 // All IV users are scalar instructions, so only emit a scalar IV, not a 2456 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2457 // predicate used by the masked loads/stores. 2458 Value *ScalarIV = CreateScalarIV(Step); 2459 if (!Cost->isScalarEpilogueAllowed()) 2460 CreateSplatIV(ScalarIV, Step); 2461 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2462 } 2463 2464 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2465 Instruction::BinaryOps BinOp) { 2466 // Create and check the types. 2467 auto *ValVTy = cast<VectorType>(Val->getType()); 2468 ElementCount VLen = ValVTy->getElementCount(); 2469 2470 Type *STy = Val->getType()->getScalarType(); 2471 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2472 "Induction Step must be an integer or FP"); 2473 assert(Step->getType() == STy && "Step has wrong type"); 2474 2475 SmallVector<Constant *, 8> Indices; 2476 2477 // Create a vector of consecutive numbers from zero to VF. 2478 VectorType *InitVecValVTy = ValVTy; 2479 Type *InitVecValSTy = STy; 2480 if (STy->isFloatingPointTy()) { 2481 InitVecValSTy = 2482 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2483 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2484 } 2485 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2486 2487 // Add on StartIdx 2488 Value *StartIdxSplat = Builder.CreateVectorSplat( 2489 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2490 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2491 2492 if (STy->isIntegerTy()) { 2493 Step = Builder.CreateVectorSplat(VLen, Step); 2494 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2495 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2496 // which can be found from the original scalar operations. 2497 Step = Builder.CreateMul(InitVec, Step); 2498 return Builder.CreateAdd(Val, Step, "induction"); 2499 } 2500 2501 // Floating point induction. 2502 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2503 "Binary Opcode should be specified for FP induction"); 2504 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2505 Step = Builder.CreateVectorSplat(VLen, Step); 2506 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2507 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2508 } 2509 2510 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2511 Instruction *EntryVal, 2512 const InductionDescriptor &ID, 2513 VPValue *Def, VPValue *CastDef, 2514 VPTransformState &State) { 2515 // We shouldn't have to build scalar steps if we aren't vectorizing. 2516 assert(VF.isVector() && "VF should be greater than one"); 2517 // Get the value type and ensure it and the step have the same integer type. 
2518 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2519 assert(ScalarIVTy == Step->getType() && 2520 "Val and Step should have the same type"); 2521 2522 // We build scalar steps for both integer and floating-point induction 2523 // variables. Here, we determine the kind of arithmetic we will perform. 2524 Instruction::BinaryOps AddOp; 2525 Instruction::BinaryOps MulOp; 2526 if (ScalarIVTy->isIntegerTy()) { 2527 AddOp = Instruction::Add; 2528 MulOp = Instruction::Mul; 2529 } else { 2530 AddOp = ID.getInductionOpcode(); 2531 MulOp = Instruction::FMul; 2532 } 2533 2534 // Determine the number of scalars we need to generate for each unroll 2535 // iteration. If EntryVal is uniform, we only need to generate the first 2536 // lane. Otherwise, we generate all VF values. 2537 bool IsUniform = 2538 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2539 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2540 // Compute the scalar steps and save the results in State. 2541 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2542 ScalarIVTy->getScalarSizeInBits()); 2543 Type *VecIVTy = nullptr; 2544 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2545 if (!IsUniform && VF.isScalable()) { 2546 VecIVTy = VectorType::get(ScalarIVTy, VF); 2547 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2548 SplatStep = Builder.CreateVectorSplat(VF, Step); 2549 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2550 } 2551 2552 for (unsigned Part = 0; Part < UF; ++Part) { 2553 Value *StartIdx0 = 2554 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2555 2556 if (!IsUniform && VF.isScalable()) { 2557 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2558 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2559 if (ScalarIVTy->isFloatingPointTy()) 2560 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2561 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2562 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2563 State.set(Def, Add, Part); 2564 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2565 Part); 2566 // It's useful to record the lane values too for the known minimum number 2567 // of elements so we do those below. This improves the code quality when 2568 // trying to extract the first element, for example. 2569 } 2570 2571 if (ScalarIVTy->isFloatingPointTy()) 2572 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2573 2574 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2575 Value *StartIdx = Builder.CreateBinOp( 2576 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2577 // The step returned by `createStepForVF` is a runtime-evaluated value 2578 // when VF is scalable. Otherwise, it should be folded into a Constant. 
2579 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2580 "Expected StartIdx to be folded to a constant when VF is not " 2581 "scalable"); 2582 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2583 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2584 State.set(Def, Add, VPIteration(Part, Lane)); 2585 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2586 Part, Lane); 2587 } 2588 } 2589 } 2590 2591 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2592 const VPIteration &Instance, 2593 VPTransformState &State) { 2594 Value *ScalarInst = State.get(Def, Instance); 2595 Value *VectorValue = State.get(Def, Instance.Part); 2596 VectorValue = Builder.CreateInsertElement( 2597 VectorValue, ScalarInst, 2598 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2599 State.set(Def, VectorValue, Instance.Part); 2600 } 2601 2602 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2603 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2604 return Builder.CreateVectorReverse(Vec, "reverse"); 2605 } 2606 2607 // Return whether we allow using masked interleave-groups (for dealing with 2608 // strided loads/stores that reside in predicated blocks, or for dealing 2609 // with gaps). 2610 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2611 // If an override option has been passed in for interleaved accesses, use it. 2612 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2613 return EnableMaskedInterleavedMemAccesses; 2614 2615 return TTI.enableMaskedInterleavedAccessVectorization(); 2616 } 2617 2618 // Try to vectorize the interleave group that \p Instr belongs to. 2619 // 2620 // E.g. Translate following interleaved load group (factor = 3): 2621 // for (i = 0; i < N; i+=3) { 2622 // R = Pic[i]; // Member of index 0 2623 // G = Pic[i+1]; // Member of index 1 2624 // B = Pic[i+2]; // Member of index 2 2625 // ... // do something to R, G, B 2626 // } 2627 // To: 2628 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2629 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2630 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2631 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2632 // 2633 // Or translate following interleaved store group (factor = 3): 2634 // for (i = 0; i < N; i+=3) { 2635 // ... do something to R, G, B 2636 // Pic[i] = R; // Member of index 0 2637 // Pic[i+1] = G; // Member of index 1 2638 // Pic[i+2] = B; // Member of index 2 2639 // } 2640 // To: 2641 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2642 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2643 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2644 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2645 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2646 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2647 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2648 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2649 VPValue *BlockInMask) { 2650 Instruction *Instr = Group->getInsertPos(); 2651 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2652 2653 // Prepare for the vector type of the interleaved load/store. 
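  // E.g. (illustrative): an i32 group with factor 3 at a fixed VF of 4 is
  // loaded/stored through a single wide <12 x i32> vector.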
2654 Type *ScalarTy = getMemInstValueType(Instr); 2655 unsigned InterleaveFactor = Group->getFactor(); 2656 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2657 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2658 2659 // Prepare for the new pointers. 2660 SmallVector<Value *, 2> AddrParts; 2661 unsigned Index = Group->getIndex(Instr); 2662 2663 // TODO: extend the masked interleaved-group support to reversed access. 2664 assert((!BlockInMask || !Group->isReverse()) && 2665 "Reversed masked interleave-group not supported."); 2666 2667 // If the group is reverse, adjust the index to refer to the last vector lane 2668 // instead of the first. We adjust the index from the first vector lane, 2669 // rather than directly getting the pointer for lane VF - 1, because the 2670 // pointer operand of the interleaved access is supposed to be uniform. For 2671 // uniform instructions, we're only required to generate a value for the 2672 // first vector lane in each unroll iteration. 2673 assert(!VF.isScalable() && 2674 "scalable vector reverse operation is not implemented"); 2675 if (Group->isReverse()) 2676 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2677 2678 for (unsigned Part = 0; Part < UF; Part++) { 2679 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2680 setDebugLocFromInst(Builder, AddrPart); 2681 2682 // Notice current instruction could be any index. Need to adjust the address 2683 // to the member of index 0. 2684 // 2685 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2686 // b = A[i]; // Member of index 0 2687 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2688 // 2689 // E.g. A[i+1] = a; // Member of index 1 2690 // A[i] = b; // Member of index 0 2691 // A[i+2] = c; // Member of index 2 (Current instruction) 2692 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2693 2694 bool InBounds = false; 2695 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2696 InBounds = gep->isInBounds(); 2697 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2698 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2699 2700 // Cast to the vector pointer type. 2701 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2702 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2703 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2704 } 2705 2706 setDebugLocFromInst(Builder, Instr); 2707 Value *PoisonVec = PoisonValue::get(VecTy); 2708 2709 Value *MaskForGaps = nullptr; 2710 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2711 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2712 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2713 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2714 } 2715 2716 // Vectorize the interleaved load group. 2717 if (isa<LoadInst>(Instr)) { 2718 // For each unroll part, create a wide load for the group. 
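    // E.g. (illustrative, factor 2, fixed VF=4): a block mask <m0, m1, m2, m3>
    // is replicated to <m0, m0, m1, m1, m2, m2, m3, m3>, so both members of
    // each tuple inherit their lane's predicate before the masked wide load.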
2719 SmallVector<Value *, 2> NewLoads; 2720 for (unsigned Part = 0; Part < UF; Part++) { 2721 Instruction *NewLoad; 2722 if (BlockInMask || MaskForGaps) { 2723 assert(useMaskedInterleavedAccesses(*TTI) && 2724 "masked interleaved groups are not allowed."); 2725 Value *GroupMask = MaskForGaps; 2726 if (BlockInMask) { 2727 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2728 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2729 Value *ShuffledMask = Builder.CreateShuffleVector( 2730 BlockInMaskPart, 2731 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2732 "interleaved.mask"); 2733 GroupMask = MaskForGaps 2734 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2735 MaskForGaps) 2736 : ShuffledMask; 2737 } 2738 NewLoad = 2739 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2740 GroupMask, PoisonVec, "wide.masked.vec"); 2741 } 2742 else 2743 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2744 Group->getAlign(), "wide.vec"); 2745 Group->addMetadata(NewLoad); 2746 NewLoads.push_back(NewLoad); 2747 } 2748 2749 // For each member in the group, shuffle out the appropriate data from the 2750 // wide loads. 2751 unsigned J = 0; 2752 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2753 Instruction *Member = Group->getMember(I); 2754 2755 // Skip the gaps in the group. 2756 if (!Member) 2757 continue; 2758 2759 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2760 auto StrideMask = 2761 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2762 for (unsigned Part = 0; Part < UF; Part++) { 2763 Value *StridedVec = Builder.CreateShuffleVector( 2764 NewLoads[Part], StrideMask, "strided.vec"); 2765 2766 // If this member has different type, cast the result type. 2767 if (Member->getType() != ScalarTy) { 2768 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2769 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2770 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2771 } 2772 2773 if (Group->isReverse()) 2774 StridedVec = reverseVector(StridedVec); 2775 2776 State.set(VPDefs[J], StridedVec, Part); 2777 } 2778 ++J; 2779 } 2780 return; 2781 } 2782 2783 // The sub vector type for current instruction. 2784 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2785 auto *SubVT = VectorType::get(ScalarTy, VF); 2786 2787 // Vectorize the interleaved store group. 2788 for (unsigned Part = 0; Part < UF; Part++) { 2789 // Collect the stored vector from each member. 2790 SmallVector<Value *, 4> StoredVecs; 2791 for (unsigned i = 0; i < InterleaveFactor; i++) { 2792 // Interleaved store group doesn't allow a gap, so each index has a member 2793 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2794 2795 Value *StoredVec = State.get(StoredValues[i], Part); 2796 2797 if (Group->isReverse()) 2798 StoredVec = reverseVector(StoredVec); 2799 2800 // If this member has different type, cast it to a unified type. 2801 2802 if (StoredVec->getType() != SubVT) 2803 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2804 2805 StoredVecs.push_back(StoredVec); 2806 } 2807 2808 // Concatenate all vectors into a wide vector. 2809 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2810 2811 // Interleave the elements in the wide vector. 
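// E.g. (illustrative, VF = 4, factor = 2): the interleave mask below is
// <0, 4, 1, 5, 2, 6, 3, 7>, turning the concatenated vector
// <A0, A1, A2, A3, B0, B1, B2, B3> into <A0, B0, A1, B1, A2, B2, A3, B3>.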
2812 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2813 Value *IVec = Builder.CreateShuffleVector( 2814 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2815 "interleaved.vec"); 2816 2817 Instruction *NewStoreInstr; 2818 if (BlockInMask) { 2819 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2820 Value *ShuffledMask = Builder.CreateShuffleVector( 2821 BlockInMaskPart, 2822 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2823 "interleaved.mask"); 2824 NewStoreInstr = Builder.CreateMaskedStore( 2825 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2826 } 2827 else 2828 NewStoreInstr = 2829 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2830 2831 Group->addMetadata(NewStoreInstr); 2832 } 2833 } 2834 2835 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2836 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2837 VPValue *StoredValue, VPValue *BlockInMask) { 2838 // Attempt to issue a wide load. 2839 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2840 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2841 2842 assert((LI || SI) && "Invalid Load/Store instruction"); 2843 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2844 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2845 2846 LoopVectorizationCostModel::InstWidening Decision = 2847 Cost->getWideningDecision(Instr, VF); 2848 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2849 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2850 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2851 "CM decision is not to widen the memory instruction"); 2852 2853 Type *ScalarDataTy = getMemInstValueType(Instr); 2854 2855 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2856 const Align Alignment = getLoadStoreAlignment(Instr); 2857 2858 // Determine if the pointer operand of the access is either consecutive or 2859 // reverse consecutive. 2860 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2861 bool ConsecutiveStride = 2862 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2863 bool CreateGatherScatter = 2864 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2865 2866 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2867 // gather/scatter. Otherwise Decision should have been to Scalarize. 2868 assert((ConsecutiveStride || CreateGatherScatter) && 2869 "The instruction should be scalarized"); 2870 (void)ConsecutiveStride; 2871 2872 VectorParts BlockInMaskParts(UF); 2873 bool isMaskRequired = BlockInMask; 2874 if (isMaskRequired) 2875 for (unsigned Part = 0; Part < UF; ++Part) 2876 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2877 2878 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2879 // Calculate the pointer for the specific unroll-part. 2880 GetElementPtrInst *PartPtr = nullptr; 2881 2882 bool InBounds = false; 2883 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2884 InBounds = gep->isInBounds(); 2885 if (Reverse) { 2886 // If the address is consecutive but reversed, then the 2887 // wide store needs to start at the last vector element. 
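// E.g. (illustrative, fixed-width VF = 4, Part = 1): the computation below
// gives NumElt = -4 and LastLane = -3, so the part pointer becomes Ptr - 7 and
// the wide access covers Ptr[-7 .. -4] before the element order is reversed.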
2888 // RunTimeVF = VScale * VF.getKnownMinValue() 2889 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2890 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2891 // NumElt = -Part * RunTimeVF 2892 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2893 // LastLane = 1 - RunTimeVF 2894 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2895 PartPtr = 2896 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2897 PartPtr->setIsInBounds(InBounds); 2898 PartPtr = cast<GetElementPtrInst>( 2899 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2900 PartPtr->setIsInBounds(InBounds); 2901 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2902 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2903 } else { 2904 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2905 PartPtr = cast<GetElementPtrInst>( 2906 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2907 PartPtr->setIsInBounds(InBounds); 2908 } 2909 2910 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2911 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2912 }; 2913 2914 // Handle Stores: 2915 if (SI) { 2916 setDebugLocFromInst(Builder, SI); 2917 2918 for (unsigned Part = 0; Part < UF; ++Part) { 2919 Instruction *NewSI = nullptr; 2920 Value *StoredVal = State.get(StoredValue, Part); 2921 if (CreateGatherScatter) { 2922 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2923 Value *VectorGep = State.get(Addr, Part); 2924 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2925 MaskPart); 2926 } else { 2927 if (Reverse) { 2928 // If we store to reverse consecutive memory locations, then we need 2929 // to reverse the order of elements in the stored value. 2930 StoredVal = reverseVector(StoredVal); 2931 // We don't want to update the value in the map as it might be used in 2932 // another expression. So don't call resetVectorValue(StoredVal). 2933 } 2934 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2935 if (isMaskRequired) 2936 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2937 BlockInMaskParts[Part]); 2938 else 2939 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2940 } 2941 addMetadata(NewSI, SI); 2942 } 2943 return; 2944 } 2945 2946 // Handle loads. 2947 assert(LI && "Must have a load instruction"); 2948 setDebugLocFromInst(Builder, LI); 2949 for (unsigned Part = 0; Part < UF; ++Part) { 2950 Value *NewLI; 2951 if (CreateGatherScatter) { 2952 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2953 Value *VectorGep = State.get(Addr, Part); 2954 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2955 nullptr, "wide.masked.gather"); 2956 addMetadata(NewLI, LI); 2957 } else { 2958 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2959 if (isMaskRequired) 2960 NewLI = Builder.CreateMaskedLoad( 2961 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2962 "wide.masked.load"); 2963 else 2964 NewLI = 2965 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2966 2967 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
2968 addMetadata(NewLI, LI); 2969 if (Reverse) 2970 NewLI = reverseVector(NewLI); 2971 } 2972 2973 State.set(Def, NewLI, Part); 2974 } 2975 } 2976 2977 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 2978 VPUser &User, 2979 const VPIteration &Instance, 2980 bool IfPredicateInstr, 2981 VPTransformState &State) { 2982 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2983 2984 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2985 // the first lane and part. 2986 if (isa<NoAliasScopeDeclInst>(Instr)) 2987 if (!Instance.isFirstIteration()) 2988 return; 2989 2990 setDebugLocFromInst(Builder, Instr); 2991 2992 // Does this instruction return a value ? 2993 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2994 2995 Instruction *Cloned = Instr->clone(); 2996 if (!IsVoidRetTy) 2997 Cloned->setName(Instr->getName() + ".cloned"); 2998 2999 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3000 Builder.GetInsertPoint()); 3001 // Replace the operands of the cloned instructions with their scalar 3002 // equivalents in the new loop. 3003 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3004 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3005 auto InputInstance = Instance; 3006 if (!Operand || !OrigLoop->contains(Operand) || 3007 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3008 InputInstance.Lane = VPLane::getFirstLane(); 3009 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3010 Cloned->setOperand(op, NewOp); 3011 } 3012 addNewMetadata(Cloned, Instr); 3013 3014 // Place the cloned scalar in the new loop. 3015 Builder.Insert(Cloned); 3016 3017 State.set(Def, Cloned, Instance); 3018 3019 // If we just cloned a new assumption, add it the assumption cache. 3020 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 3021 if (II->getIntrinsicID() == Intrinsic::assume) 3022 AC->registerAssumption(II); 3023 3024 // End if-block. 3025 if (IfPredicateInstr) 3026 PredicatedInstructions.push_back(Cloned); 3027 } 3028 3029 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3030 Value *End, Value *Step, 3031 Instruction *DL) { 3032 BasicBlock *Header = L->getHeader(); 3033 BasicBlock *Latch = L->getLoopLatch(); 3034 // As we're just creating this loop, it's possible no latch exists 3035 // yet. If so, use the header as this will be a single block loop. 3036 if (!Latch) 3037 Latch = Header; 3038 3039 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3040 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3041 setDebugLocFromInst(Builder, OldInst); 3042 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3043 3044 Builder.SetInsertPoint(Latch->getTerminator()); 3045 setDebugLocFromInst(Builder, OldInst); 3046 3047 // Create i+1 and fill the PHINode. 3048 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3049 Induction->addIncoming(Start, L->getLoopPreheader()); 3050 Induction->addIncoming(Next, Latch); 3051 // Create the compare. 3052 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3053 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3054 3055 // Now we have two terminators. Remove the old one from the block. 
3056 Latch->getTerminator()->eraseFromParent(); 3057 3058 return Induction; 3059 } 3060 3061 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3062 if (TripCount) 3063 return TripCount; 3064 3065 assert(L && "Create Trip Count for null loop."); 3066 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3067 // Find the loop boundaries. 3068 ScalarEvolution *SE = PSE.getSE(); 3069 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3070 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3071 "Invalid loop count"); 3072 3073 Type *IdxTy = Legal->getWidestInductionType(); 3074 assert(IdxTy && "No type for induction"); 3075 3076 // The exit count might have the type of i64 while the phi is i32. This can 3077 // happen if we have an induction variable that is sign extended before the 3078 // compare. The only way that we get a backedge taken count is that the 3079 // induction variable was signed and as such will not overflow. In such a case 3080 // truncation is legal. 3081 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3082 IdxTy->getPrimitiveSizeInBits()) 3083 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3084 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3085 3086 // Get the total trip count from the count by adding 1. 3087 const SCEV *ExitCount = SE->getAddExpr( 3088 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3089 3090 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3091 3092 // Expand the trip count and place the new instructions in the preheader. 3093 // Notice that the pre-header does not change, only the loop body. 3094 SCEVExpander Exp(*SE, DL, "induction"); 3095 3096 // Count holds the overall loop count (N). 3097 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3098 L->getLoopPreheader()->getTerminator()); 3099 3100 if (TripCount->getType()->isPointerTy()) 3101 TripCount = 3102 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3103 L->getLoopPreheader()->getTerminator()); 3104 3105 return TripCount; 3106 } 3107 3108 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3109 if (VectorTripCount) 3110 return VectorTripCount; 3111 3112 Value *TC = getOrCreateTripCount(L); 3113 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3114 3115 Type *Ty = TC->getType(); 3116 // This is where we can make the step a runtime constant. 3117 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3118 3119 // If the tail is to be folded by masking, round the number of iterations N 3120 // up to a multiple of Step instead of rounding down. This is done by first 3121 // adding Step-1 and then rounding down. Note that it's ok if this addition 3122 // overflows: the vector induction variable will eventually wrap to zero given 3123 // that it starts at zero and its Step is a power of two; the loop will then 3124 // exit, with the last early-exit vector comparison also producing all-true. 3125 if (Cost->foldTailByMasking()) { 3126 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3127 "VF*UF must be a power of 2 when folding tail by masking"); 3128 assert(!VF.isScalable() && 3129 "Tail folding not yet supported for scalable vectors"); 3130 TC = Builder.CreateAdd( 3131 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3132 } 3133 3134 // Now we need to generate the expression for the part of the loop that the 3135 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3136 // iterations are not required for correctness, or N - Step, otherwise. Step 3137 // is equal to the vectorization factor (number of SIMD elements) times the 3138 // unroll factor (number of SIMD instructions). 3139 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3140 3141 // There are two cases where we need to ensure (at least) the last iteration 3142 // runs in the scalar remainder loop. Thus, if the step evenly divides 3143 // the trip count, we set the remainder to be equal to the step. If the step 3144 // does not evenly divide the trip count, no adjustment is necessary since 3145 // there will already be scalar iterations. Note that the minimum iterations 3146 // check ensures that N >= Step. The cases are: 3147 // 1) If there is a non-reversed interleaved group that may speculatively 3148 // access memory out-of-bounds. 3149 // 2) If any instruction may follow a conditionally taken exit. That is, if 3150 // the loop contains multiple exiting blocks, or a single exiting block 3151 // which is not the latch. 3152 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3153 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3154 R = Builder.CreateSelect(IsZero, Step, R); 3155 } 3156 3157 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3158 3159 return VectorTripCount; 3160 } 3161 3162 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3163 const DataLayout &DL) { 3164 // Verify that V is a vector type with same number of elements as DstVTy. 3165 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3166 unsigned VF = DstFVTy->getNumElements(); 3167 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3168 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3169 Type *SrcElemTy = SrcVecTy->getElementType(); 3170 Type *DstElemTy = DstFVTy->getElementType(); 3171 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3172 "Vector elements must have same size"); 3173 3174 // Do a direct cast if element types are castable. 3175 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3176 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3177 } 3178 // V cannot be directly casted to desired vector type. 3179 // May happen when V is a floating point vector but DstVTy is a vector of 3180 // pointers or vice-versa. Handle this using a two-step bitcast using an 3181 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3182 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3183 "Only one type should be a pointer type"); 3184 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3185 "Only one type should be a floating point type"); 3186 Type *IntTy = 3187 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3188 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3189 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3190 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3191 } 3192 3193 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3194 BasicBlock *Bypass) { 3195 Value *Count = getOrCreateTripCount(L); 3196 // Reuse existing vector loop preheader for TC checks. 3197 // Note that new preheader block is generated for vector loop. 
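// E.g. (illustrative, VF = 4, UF = 2, no tail folding): the guard emitted by
// this function looks roughly like
//   %min.iters.check = icmp ult i64 %trip.count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// with ule instead of ult when a scalar epilogue is required.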
3198 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3199 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3200 3201 // Generate code to check if the loop's trip count is less than VF * UF, or 3202 // equal to it in case a scalar epilogue is required; this implies that the 3203 // vector trip count is zero. This check also covers the case where adding one 3204 // to the backedge-taken count overflowed leading to an incorrect trip count 3205 // of zero. In this case we will also jump to the scalar loop. 3206 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3207 : ICmpInst::ICMP_ULT; 3208 3209 // If tail is to be folded, vector loop takes care of all iterations. 3210 Value *CheckMinIters = Builder.getFalse(); 3211 if (!Cost->foldTailByMasking()) { 3212 Value *Step = 3213 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3214 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3215 } 3216 // Create new preheader for vector loop. 3217 LoopVectorPreHeader = 3218 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3219 "vector.ph"); 3220 3221 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3222 DT->getNode(Bypass)->getIDom()) && 3223 "TC check is expected to dominate Bypass"); 3224 3225 // Update dominator for Bypass & LoopExit. 3226 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3227 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3228 3229 ReplaceInstWithInst( 3230 TCCheckBlock->getTerminator(), 3231 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3232 LoopBypassBlocks.push_back(TCCheckBlock); 3233 } 3234 3235 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3236 3237 BasicBlock *const SCEVCheckBlock = 3238 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3239 if (!SCEVCheckBlock) 3240 return nullptr; 3241 3242 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3243 (OptForSizeBasedOnProfile && 3244 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3245 "Cannot SCEV check stride or overflow when optimizing for size"); 3246 3247 3248 // Update dominator only if this is first RT check. 3249 if (LoopBypassBlocks.empty()) { 3250 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3251 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3252 } 3253 3254 LoopBypassBlocks.push_back(SCEVCheckBlock); 3255 AddedSafetyChecks = true; 3256 return SCEVCheckBlock; 3257 } 3258 3259 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3260 BasicBlock *Bypass) { 3261 // VPlan-native path does not do any analysis for runtime checks currently. 3262 if (EnableVPlanNativePath) 3263 return nullptr; 3264 3265 BasicBlock *const MemCheckBlock = 3266 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3267 3268 // Check if we generated code that checks in runtime if arrays overlap. We put 3269 // the checks into a separate block to make the more common case of few 3270 // elements faster. 
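// E.g. (illustrative): for two pointer groups A and B accessed in the loop,
// the generated check conceptually verifies
// (A.end <= B.start) || (B.end <= A.start); if the ranges may overlap we
// branch back to the scalar loop instead of entering the vector loop.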
3271 if (!MemCheckBlock) 3272 return nullptr; 3273 3274 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3275 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3276 "Cannot emit memory checks when optimizing for size, unless forced " 3277 "to vectorize."); 3278 ORE->emit([&]() { 3279 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3280 L->getStartLoc(), L->getHeader()) 3281 << "Code-size may be reduced by not forcing " 3282 "vectorization, or by source-code modifications " 3283 "eliminating the need for runtime checks " 3284 "(e.g., adding 'restrict')."; 3285 }); 3286 } 3287 3288 LoopBypassBlocks.push_back(MemCheckBlock); 3289 3290 AddedSafetyChecks = true; 3291 3292 // We currently don't use LoopVersioning for the actual loop cloning but we 3293 // still use it to add the noalias metadata. 3294 LVer = std::make_unique<LoopVersioning>( 3295 *Legal->getLAI(), 3296 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3297 DT, PSE.getSE()); 3298 LVer->prepareNoAliasMetadata(); 3299 return MemCheckBlock; 3300 } 3301 3302 Value *InnerLoopVectorizer::emitTransformedIndex( 3303 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3304 const InductionDescriptor &ID) const { 3305 3306 SCEVExpander Exp(*SE, DL, "induction"); 3307 auto Step = ID.getStep(); 3308 auto StartValue = ID.getStartValue(); 3309 assert(Index->getType() == Step->getType() && 3310 "Index type does not match StepValue type"); 3311 3312 // Note: the IR at this point is broken. We cannot use SE to create any new 3313 // SCEV and then expand it, hoping that SCEV's simplification will give us 3314 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3315 // lead to various SCEV crashes. So all we can do is to use builder and rely 3316 // on InstCombine for future simplifications. Here we handle some trivial 3317 // cases only. 3318 auto CreateAdd = [&B](Value *X, Value *Y) { 3319 assert(X->getType() == Y->getType() && "Types don't match!"); 3320 if (auto *CX = dyn_cast<ConstantInt>(X)) 3321 if (CX->isZero()) 3322 return Y; 3323 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3324 if (CY->isZero()) 3325 return X; 3326 return B.CreateAdd(X, Y); 3327 }; 3328 3329 auto CreateMul = [&B](Value *X, Value *Y) { 3330 assert(X->getType() == Y->getType() && "Types don't match!"); 3331 if (auto *CX = dyn_cast<ConstantInt>(X)) 3332 if (CX->isOne()) 3333 return Y; 3334 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3335 if (CY->isOne()) 3336 return X; 3337 return B.CreateMul(X, Y); 3338 }; 3339 3340 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3341 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3342 // the DomTree is not kept up-to-date for additional blocks generated in the 3343 // vector loop. By using the header as insertion point, we guarantee that the 3344 // expanded instructions dominate all their uses. 
3345 auto GetInsertPoint = [this, &B]() { 3346 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3347 if (InsertBB != LoopVectorBody && 3348 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3349 return LoopVectorBody->getTerminator(); 3350 return &*B.GetInsertPoint(); 3351 }; 3352 3353 switch (ID.getKind()) { 3354 case InductionDescriptor::IK_IntInduction: { 3355 assert(Index->getType() == StartValue->getType() && 3356 "Index type does not match StartValue type"); 3357 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3358 return B.CreateSub(StartValue, Index); 3359 auto *Offset = CreateMul( 3360 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3361 return CreateAdd(StartValue, Offset); 3362 } 3363 case InductionDescriptor::IK_PtrInduction: { 3364 assert(isa<SCEVConstant>(Step) && 3365 "Expected constant step for pointer induction"); 3366 return B.CreateGEP( 3367 StartValue->getType()->getPointerElementType(), StartValue, 3368 CreateMul(Index, 3369 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3370 } 3371 case InductionDescriptor::IK_FpInduction: { 3372 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3373 auto InductionBinOp = ID.getInductionBinOp(); 3374 assert(InductionBinOp && 3375 (InductionBinOp->getOpcode() == Instruction::FAdd || 3376 InductionBinOp->getOpcode() == Instruction::FSub) && 3377 "Original bin op should be defined for FP induction"); 3378 3379 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3380 Value *MulExp = B.CreateFMul(StepValue, Index); 3381 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3382 "induction"); 3383 } 3384 case InductionDescriptor::IK_NoInduction: 3385 return nullptr; 3386 } 3387 llvm_unreachable("invalid enum"); 3388 } 3389 3390 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3391 LoopScalarBody = OrigLoop->getHeader(); 3392 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3393 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3394 assert(LoopExitBlock && "Must have an exit block"); 3395 assert(LoopVectorPreHeader && "Invalid loop structure"); 3396 3397 LoopMiddleBlock = 3398 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3399 LI, nullptr, Twine(Prefix) + "middle.block"); 3400 LoopScalarPreHeader = 3401 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3402 nullptr, Twine(Prefix) + "scalar.ph"); 3403 3404 // Set up branch from middle block to the exit and scalar preheader blocks. 3405 // completeLoopSkeleton will update the condition to use an iteration check, 3406 // if required to decide whether to execute the remainder. 3407 BranchInst *BrInst = 3408 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3409 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3410 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3411 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3412 3413 // We intentionally don't let SplitBlock to update LoopInfo since 3414 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3415 // LoopVectorBody is explicitly added to the correct place few lines later. 3416 LoopVectorBody = 3417 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3418 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3419 3420 // Update dominator for loop exit. 
3421 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3422 3423 // Create and register the new vector loop. 3424 Loop *Lp = LI->AllocateLoop(); 3425 Loop *ParentLoop = OrigLoop->getParentLoop(); 3426 3427 // Insert the new loop into the loop nest and register the new basic blocks 3428 // before calling any utilities such as SCEV that require valid LoopInfo. 3429 if (ParentLoop) { 3430 ParentLoop->addChildLoop(Lp); 3431 } else { 3432 LI->addTopLevelLoop(Lp); 3433 } 3434 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3435 return Lp; 3436 } 3437 3438 void InnerLoopVectorizer::createInductionResumeValues( 3439 Loop *L, Value *VectorTripCount, 3440 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3441 assert(VectorTripCount && L && "Expected valid arguments"); 3442 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3443 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3444 "Inconsistent information about additional bypass."); 3445 // We are going to resume the execution of the scalar loop. 3446 // Go over all of the induction variables that we found and fix the 3447 // PHIs that are left in the scalar version of the loop. 3448 // The starting values of PHI nodes depend on the counter of the last 3449 // iteration in the vectorized loop. 3450 // If we come from a bypass edge then we need to start from the original 3451 // start value. 3452 for (auto &InductionEntry : Legal->getInductionVars()) { 3453 PHINode *OrigPhi = InductionEntry.first; 3454 InductionDescriptor II = InductionEntry.second; 3455 3456 // Create phi nodes to merge from the backedge-taken check block. 3457 PHINode *BCResumeVal = 3458 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3459 LoopScalarPreHeader->getTerminator()); 3460 // Copy original phi DL over to the new one. 3461 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3462 Value *&EndValue = IVEndValues[OrigPhi]; 3463 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3464 if (OrigPhi == OldInduction) { 3465 // We know what the end value is. 3466 EndValue = VectorTripCount; 3467 } else { 3468 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3469 3470 // Fast-math-flags propagate from the original induction instruction. 3471 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3472 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3473 3474 Type *StepType = II.getStep()->getType(); 3475 Instruction::CastOps CastOp = 3476 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3477 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3478 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3479 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3480 EndValue->setName("ind.end"); 3481 3482 // Compute the end value for the additional bypass (if applicable). 3483 if (AdditionalBypass.first) { 3484 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3485 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3486 StepType, true); 3487 CRD = 3488 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3489 EndValueFromAdditionalBypass = 3490 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3491 EndValueFromAdditionalBypass->setName("ind.end"); 3492 } 3493 } 3494 // The new PHI merges the original incoming value, in case of a bypass, 3495 // or the value at the end of the vectorized loop. 
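// E.g. (illustrative):
//   %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ %start, %bypass ]
// where %ind.end is the end value computed above and %start is the original
// start value used on all bypass edges.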
3496 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3497 3498 // Fix the scalar body counter (PHI node). 3499 // The old induction's phi node in the scalar body needs the truncated 3500 // value. 3501 for (BasicBlock *BB : LoopBypassBlocks) 3502 BCResumeVal->addIncoming(II.getStartValue(), BB); 3503 3504 if (AdditionalBypass.first) 3505 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3506 EndValueFromAdditionalBypass); 3507 3508 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3509 } 3510 } 3511 3512 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3513 MDNode *OrigLoopID) { 3514 assert(L && "Expected valid loop."); 3515 3516 // The trip counts should be cached by now. 3517 Value *Count = getOrCreateTripCount(L); 3518 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3519 3520 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3521 3522 // Add a check in the middle block to see if we have completed 3523 // all of the iterations in the first vector loop. 3524 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3525 // If tail is to be folded, we know we don't need to run the remainder. 3526 if (!Cost->foldTailByMasking()) { 3527 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3528 Count, VectorTripCount, "cmp.n", 3529 LoopMiddleBlock->getTerminator()); 3530 3531 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3532 // of the corresponding compare because they may have ended up with 3533 // different line numbers and we want to avoid awkward line stepping while 3534 // debugging. Eg. if the compare has got a line number inside the loop. 3535 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3536 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3537 } 3538 3539 // Get ready to start creating new instructions into the vectorized body. 3540 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3541 "Inconsistent vector loop preheader"); 3542 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3543 3544 Optional<MDNode *> VectorizedLoopID = 3545 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3546 LLVMLoopVectorizeFollowupVectorized}); 3547 if (VectorizedLoopID.hasValue()) { 3548 L->setLoopID(VectorizedLoopID.getValue()); 3549 3550 // Do not setAlreadyVectorized if loop attributes have been defined 3551 // explicitly. 3552 return LoopVectorPreHeader; 3553 } 3554 3555 // Keep all loop hints from the original loop on the vector loop (we'll 3556 // replace the vectorizer-specific hints below). 3557 if (MDNode *LID = OrigLoop->getLoopID()) 3558 L->setLoopID(LID); 3559 3560 LoopVectorizeHints Hints(L, true, *ORE); 3561 Hints.setAlreadyVectorized(); 3562 3563 #ifdef EXPENSIVE_CHECKS 3564 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3565 LI->verify(*DT); 3566 #endif 3567 3568 return LoopVectorPreHeader; 3569 } 3570 3571 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3572 /* 3573 In this function we generate a new loop. The new loop will contain 3574 the vectorized instructions while the old loop will continue to run the 3575 scalar remainder. 3576 3577 [ ] <-- loop iteration number check. 3578 / | 3579 / v 3580 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3581 | / | 3582 | / v 3583 || [ ] <-- vector pre header. 3584 |/ | 3585 | v 3586 | [ ] \ 3587 | [ ]_| <-- vector loop. 3588 | | 3589 | v 3590 | -[ ] <--- middle-block. 
3591 | / | 3592 | / v 3593 -|- >[ ] <--- new preheader. 3594 | | 3595 | v 3596 | [ ] \ 3597 | [ ]_| <-- old scalar loop to handle remainder. 3598 \ | 3599 \ v 3600 >[ ] <-- exit block. 3601 ... 3602 */ 3603 3604 // Get the metadata of the original loop before it gets modified. 3605 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3606 3607 // Create an empty vector loop, and prepare basic blocks for the runtime 3608 // checks. 3609 Loop *Lp = createVectorLoopSkeleton(""); 3610 3611 // Now, compare the new count to zero. If it is zero skip the vector loop and 3612 // jump to the scalar loop. This check also covers the case where the 3613 // backedge-taken count is uint##_max: adding one to it will overflow leading 3614 // to an incorrect trip count of zero. In this (rare) case we will also jump 3615 // to the scalar loop. 3616 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3617 3618 // Generate the code to check any assumptions that we've made for SCEV 3619 // expressions. 3620 emitSCEVChecks(Lp, LoopScalarPreHeader); 3621 3622 // Generate the code that checks in runtime if arrays overlap. We put the 3623 // checks into a separate block to make the more common case of few elements 3624 // faster. 3625 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3626 3627 // Some loops have a single integer induction variable, while other loops 3628 // don't. One example is c++ iterators that often have multiple pointer 3629 // induction variables. In the code below we also support a case where we 3630 // don't have a single induction variable. 3631 // 3632 // We try to obtain an induction variable from the original loop as hard 3633 // as possible. However if we don't find one that: 3634 // - is an integer 3635 // - counts from zero, stepping by one 3636 // - is the size of the widest induction variable type 3637 // then we create a new one. 3638 OldInduction = Legal->getPrimaryInduction(); 3639 Type *IdxTy = Legal->getWidestInductionType(); 3640 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3641 // The loop step is equal to the vectorization factor (num of SIMD elements) 3642 // times the unroll factor (num of SIMD instructions). 3643 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3644 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3645 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3646 Induction = 3647 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3648 getDebugLocFromInstOrOperands(OldInduction)); 3649 3650 // Emit phis for the new starting index of the scalar loop. 3651 createInductionResumeValues(Lp, CountRoundDown); 3652 3653 return completeLoopSkeleton(Lp, OrigLoopID); 3654 } 3655 3656 // Fix up external users of the induction variable. At this point, we are 3657 // in LCSSA form, with all external PHIs that use the IV having one input value, 3658 // coming from the remainder loop. We need those PHIs to also have a correct 3659 // value for the IV when arriving directly from the middle block. 3660 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3661 const InductionDescriptor &II, 3662 Value *CountRoundDown, Value *EndValue, 3663 BasicBlock *MiddleBlock) { 3664 // There are two kinds of external IV usages - those that use the value 3665 // computed in the last iteration (the PHI) and those that use the penultimate 3666 // value (the value that feeds into the phi from the loop latch). 3667 // We allow both, but they, obviously, have different values. 
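// E.g. (illustrative), given the scalar IV
//   %iv      = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
//   %iv.next = add i64 %iv, 1
// an LCSSA phi in the exit block that uses %iv.next must see the final count,
// while one that uses %iv must see the final count minus one step; both cases
// are handled below.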
3668
3669 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3670
3671 DenseMap<Value *, Value *> MissingVals;
3672
3673 // An external user of the last iteration's value should see the value that
3674 // the remainder loop uses to initialize its own IV.
3675 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3676 for (User *U : PostInc->users()) {
3677 Instruction *UI = cast<Instruction>(U);
3678 if (!OrigLoop->contains(UI)) {
3679 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3680 MissingVals[UI] = EndValue;
3681 }
3682 }
3683
3684 // An external user of the penultimate value needs to see EndValue - Step.
3685 // The simplest way to get this is to recompute it from the constituent SCEVs,
3686 // that is Start + (Step * (CRD - 1)).
3687 for (User *U : OrigPhi->users()) {
3688 auto *UI = cast<Instruction>(U);
3689 if (!OrigLoop->contains(UI)) {
3690 const DataLayout &DL =
3691 OrigLoop->getHeader()->getModule()->getDataLayout();
3692 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3693
3694 IRBuilder<> B(MiddleBlock->getTerminator());
3695
3696 // Fast-math-flags propagate from the original induction instruction.
3697 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3698 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3699
3700 Value *CountMinusOne = B.CreateSub(
3701 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3702 Value *CMO =
3703 !II.getStep()->getType()->isIntegerTy()
3704 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3705 II.getStep()->getType())
3706 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3707 CMO->setName("cast.cmo");
3708 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3709 Escape->setName("ind.escape");
3710 MissingVals[UI] = Escape;
3711 }
3712 }
3713
3714 for (auto &I : MissingVals) {
3715 PHINode *PHI = cast<PHINode>(I.first);
3716 // One corner case we have to handle is two IVs "chasing" each other,
3717 // that is %IV2 = phi [...], [ %IV1, %latch ]
3718 // In this case, if IV1 has an external use, we need to avoid adding both
3719 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3720 // don't already have an incoming value for the middle block.
3721 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3722 PHI->addIncoming(I.second, MiddleBlock);
3723 }
3724 }
3725
3726 namespace {
3727
3728 struct CSEDenseMapInfo {
3729 static bool canHandle(const Instruction *I) {
3730 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3731 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3732 }
3733
3734 static inline Instruction *getEmptyKey() {
3735 return DenseMapInfo<Instruction *>::getEmptyKey();
3736 }
3737
3738 static inline Instruction *getTombstoneKey() {
3739 return DenseMapInfo<Instruction *>::getTombstoneKey();
3740 }
3741
3742 static unsigned getHashValue(const Instruction *I) {
3743 assert(canHandle(I) && "Unknown instruction!");
3744 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3745 I->value_op_end()));
3746 }
3747
3748 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3749 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3750 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3751 return LHS == RHS;
3752 return LHS->isIdenticalTo(RHS);
3753 }
3754 };
3755
3756 } // end anonymous namespace
3757
3758 /// Perform CSE of induction variable instructions.
3759 static void cse(BasicBlock *BB) { 3760 // Perform simple cse. 3761 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3762 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3763 Instruction *In = &*I++; 3764 3765 if (!CSEDenseMapInfo::canHandle(In)) 3766 continue; 3767 3768 // Check if we can replace this instruction with any of the 3769 // visited instructions. 3770 if (Instruction *V = CSEMap.lookup(In)) { 3771 In->replaceAllUsesWith(V); 3772 In->eraseFromParent(); 3773 continue; 3774 } 3775 3776 CSEMap[In] = In; 3777 } 3778 } 3779 3780 InstructionCost 3781 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3782 bool &NeedToScalarize) const { 3783 Function *F = CI->getCalledFunction(); 3784 Type *ScalarRetTy = CI->getType(); 3785 SmallVector<Type *, 4> Tys, ScalarTys; 3786 for (auto &ArgOp : CI->arg_operands()) 3787 ScalarTys.push_back(ArgOp->getType()); 3788 3789 // Estimate cost of scalarized vector call. The source operands are assumed 3790 // to be vectors, so we need to extract individual elements from there, 3791 // execute VF scalar calls, and then gather the result into the vector return 3792 // value. 3793 InstructionCost ScalarCallCost = 3794 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3795 if (VF.isScalar()) 3796 return ScalarCallCost; 3797 3798 // Compute corresponding vector type for return value and arguments. 3799 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3800 for (Type *ScalarTy : ScalarTys) 3801 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3802 3803 // Compute costs of unpacking argument values for the scalar calls and 3804 // packing the return values to a vector. 3805 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3806 3807 InstructionCost Cost = 3808 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3809 3810 // If we can't emit a vector call for this function, then the currently found 3811 // cost is the cost we need to return. 3812 NeedToScalarize = true; 3813 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3814 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3815 3816 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3817 return Cost; 3818 3819 // If the corresponding vector cost is cheaper, return its cost. 
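// E.g. (illustrative numbers): with ScalarCallCost = 10, VF = 4 and a
// scalarization overhead of 8, the scalarized cost is 10 * 4 + 8 = 48; if the
// target provides a vector variant costing 20, the code below selects it and
// clears NeedToScalarize.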
3820 InstructionCost VectorCallCost = 3821 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3822 if (VectorCallCost < Cost) { 3823 NeedToScalarize = false; 3824 Cost = VectorCallCost; 3825 } 3826 return Cost; 3827 } 3828 3829 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3830 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3831 return Elt; 3832 return VectorType::get(Elt, VF); 3833 } 3834 3835 InstructionCost 3836 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3837 ElementCount VF) const { 3838 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3839 assert(ID && "Expected intrinsic call!"); 3840 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3841 FastMathFlags FMF; 3842 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3843 FMF = FPMO->getFastMathFlags(); 3844 3845 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3846 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3847 SmallVector<Type *> ParamTys; 3848 std::transform(FTy->param_begin(), FTy->param_end(), 3849 std::back_inserter(ParamTys), 3850 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3851 3852 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3853 dyn_cast<IntrinsicInst>(CI)); 3854 return TTI.getIntrinsicInstrCost(CostAttrs, 3855 TargetTransformInfo::TCK_RecipThroughput); 3856 } 3857 3858 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3859 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3860 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3861 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3862 } 3863 3864 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3865 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3866 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3867 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3868 } 3869 3870 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3871 // For every instruction `I` in MinBWs, truncate the operands, create a 3872 // truncated version of `I` and reextend its result. InstCombine runs 3873 // later and will remove any ext/trunc pairs. 3874 SmallPtrSet<Value *, 4> Erased; 3875 for (const auto &KV : Cost->getMinimalBitwidths()) { 3876 // If the value wasn't vectorized, we must maintain the original scalar 3877 // type. The absence of the value from State indicates that it 3878 // wasn't vectorized. 3879 VPValue *Def = State.Plan->getVPValue(KV.first); 3880 if (!State.hasAnyVectorValue(Def)) 3881 continue; 3882 for (unsigned Part = 0; Part < UF; ++Part) { 3883 Value *I = State.get(Def, Part); 3884 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3885 continue; 3886 Type *OriginalTy = I->getType(); 3887 Type *ScalarTruncatedTy = 3888 IntegerType::get(OriginalTy->getContext(), KV.second); 3889 auto *TruncatedTy = FixedVectorType::get( 3890 ScalarTruncatedTy, 3891 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3892 if (TruncatedTy == OriginalTy) 3893 continue; 3894 3895 IRBuilder<> B(cast<Instruction>(I)); 3896 auto ShrinkOperand = [&](Value *V) -> Value * { 3897 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3898 if (ZI->getSrcTy() == TruncatedTy) 3899 return ZI->getOperand(0); 3900 return B.CreateZExtOrTrunc(V, TruncatedTy); 3901 }; 3902 3903 // The actual instruction modification depends on the instruction type, 3904 // unfortunately. 
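// E.g. (illustrative, names hypothetical): if KV.second is 8 and I is
// '%a = add <4 x i32> %x, %y', the rewrite below produces
//   %x.tr  = trunc <4 x i32> %x to <4 x i8>
//   %y.tr  = trunc <4 x i32> %y to <4 x i8>
//   %a.tr  = add <4 x i8> %x.tr, %y.tr
//   %a.ext = zext <4 x i8> %a.tr to <4 x i32>
// leaving InstCombine to remove any redundant ext/trunc pairs.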
3905 Value *NewI = nullptr; 3906 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3907 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3908 ShrinkOperand(BO->getOperand(1))); 3909 3910 // Any wrapping introduced by shrinking this operation shouldn't be 3911 // considered undefined behavior. So, we can't unconditionally copy 3912 // arithmetic wrapping flags to NewI. 3913 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3914 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3915 NewI = 3916 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3917 ShrinkOperand(CI->getOperand(1))); 3918 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3919 NewI = B.CreateSelect(SI->getCondition(), 3920 ShrinkOperand(SI->getTrueValue()), 3921 ShrinkOperand(SI->getFalseValue())); 3922 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3923 switch (CI->getOpcode()) { 3924 default: 3925 llvm_unreachable("Unhandled cast!"); 3926 case Instruction::Trunc: 3927 NewI = ShrinkOperand(CI->getOperand(0)); 3928 break; 3929 case Instruction::SExt: 3930 NewI = B.CreateSExtOrTrunc( 3931 CI->getOperand(0), 3932 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3933 break; 3934 case Instruction::ZExt: 3935 NewI = B.CreateZExtOrTrunc( 3936 CI->getOperand(0), 3937 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3938 break; 3939 } 3940 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3941 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3942 ->getNumElements(); 3943 auto *O0 = B.CreateZExtOrTrunc( 3944 SI->getOperand(0), 3945 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3946 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3947 ->getNumElements(); 3948 auto *O1 = B.CreateZExtOrTrunc( 3949 SI->getOperand(1), 3950 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3951 3952 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3953 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3954 // Don't do anything with the operands, just extend the result. 3955 continue; 3956 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3957 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3958 ->getNumElements(); 3959 auto *O0 = B.CreateZExtOrTrunc( 3960 IE->getOperand(0), 3961 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3962 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3963 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3964 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3965 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3966 ->getNumElements(); 3967 auto *O0 = B.CreateZExtOrTrunc( 3968 EE->getOperand(0), 3969 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3970 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3971 } else { 3972 // If we don't know what to do, be conservative and don't do anything. 3973 continue; 3974 } 3975 3976 // Lastly, extend the result. 3977 NewI->takeName(cast<Instruction>(I)); 3978 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3979 I->replaceAllUsesWith(Res); 3980 cast<Instruction>(I)->eraseFromParent(); 3981 Erased.insert(I); 3982 State.reset(Def, Res, Part); 3983 } 3984 } 3985 3986 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3987 for (const auto &KV : Cost->getMinimalBitwidths()) { 3988 // If the value wasn't vectorized, we must maintain the original scalar 3989 // type. The absence of the value from State indicates that it 3990 // wasn't vectorized. 
3991 VPValue *Def = State.Plan->getVPValue(KV.first);
3992 if (!State.hasAnyVectorValue(Def))
3993 continue;
3994 for (unsigned Part = 0; Part < UF; ++Part) {
3995 Value *I = State.get(Def, Part);
3996 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3997 if (Inst && Inst->use_empty()) {
3998 Value *NewI = Inst->getOperand(0);
3999 Inst->eraseFromParent();
4000 State.reset(Def, NewI, Part);
4001 }
4002 }
4003 }
4004 }
4005
4006 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4007 // Insert truncates and extends for any truncated instructions as hints to
4008 // InstCombine.
4009 if (VF.isVector())
4010 truncateToMinimalBitwidths(State);
4011
4012 // Fix widened non-induction PHIs by setting up the PHI operands.
4013 if (OrigPHIsToFix.size()) {
4014 assert(EnableVPlanNativePath &&
4015 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4016 fixNonInductionPHIs(State);
4017 }
4018
4019 // At this point every instruction in the original loop is widened to a
4020 // vector form. Now we need to fix the recurrences in the loop. These PHI
4021 // nodes are currently empty because we did not want to introduce cycles.
4022 // This is the second stage of vectorizing recurrences.
4023 fixCrossIterationPHIs(State);
4024
4025 // Forget the original basic block.
4026 PSE.getSE()->forgetLoop(OrigLoop);
4027
4028 // Fix-up external users of the induction variables.
4029 for (auto &Entry : Legal->getInductionVars())
4030 fixupIVUsers(Entry.first, Entry.second,
4031 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4032 IVEndValues[Entry.first], LoopMiddleBlock);
4033
4034 fixLCSSAPHIs(State);
4035 for (Instruction *PI : PredicatedInstructions)
4036 sinkScalarOperands(&*PI);
4037
4038 // Remove redundant induction instructions.
4039 cse(LoopVectorBody);
4040
4041 // Set/update profile weights for the vector and remainder loops as the
4042 // original loop iterations are now distributed among them. Note that the
4043 // original loop (LoopScalarBody) becomes the remainder loop after vectorization.
4044 //
4045 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4046 // end up with a slightly less accurate result, but that should be OK since
4047 // the profile is not inherently precise anyway. Note also that a possible
4048 // bypass of the vector code caused by legality checks is ignored, assigning
4049 // all the weight to the vector loop, optimistically.
4050 //
4051 // For scalable vectorization we can't know at compile time how many iterations
4052 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4053 // vscale of '1'.
4054 setProfileInfoAfterUnrolling(
4055 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4056 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4057 }
4058
4059 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4060 // In order to support recurrences we need to be able to vectorize Phi nodes.
4061 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4062 // stage #2: We now need to fix the recurrences by adding incoming edges to
4063 // the currently empty PHI nodes. At this point every instruction in the
4064 // original loop is widened to a vector form so we can use them to construct
4065 // the incoming edges.
4066 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
4067 // Handle first-order recurrences and reductions that need to be fixed.
4068 if (Legal->isFirstOrderRecurrence(&Phi)) 4069 fixFirstOrderRecurrence(&Phi, State); 4070 else if (Legal->isReductionVariable(&Phi)) 4071 fixReduction(&Phi, State); 4072 } 4073 } 4074 4075 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4076 VPTransformState &State) { 4077 // This is the second phase of vectorizing first-order recurrences. An 4078 // overview of the transformation is described below. Suppose we have the 4079 // following loop. 4080 // 4081 // for (int i = 0; i < n; ++i) 4082 // b[i] = a[i] - a[i - 1]; 4083 // 4084 // There is a first-order recurrence on "a". For this loop, the shorthand 4085 // scalar IR looks like: 4086 // 4087 // scalar.ph: 4088 // s_init = a[-1] 4089 // br scalar.body 4090 // 4091 // scalar.body: 4092 // i = phi [0, scalar.ph], [i+1, scalar.body] 4093 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4094 // s2 = a[i] 4095 // b[i] = s2 - s1 4096 // br cond, scalar.body, ... 4097 // 4098 // In this example, s1 is a recurrence because it's value depends on the 4099 // previous iteration. In the first phase of vectorization, we created a 4100 // temporary value for s1. We now complete the vectorization and produce the 4101 // shorthand vector IR shown below (for VF = 4, UF = 1). 4102 // 4103 // vector.ph: 4104 // v_init = vector(..., ..., ..., a[-1]) 4105 // br vector.body 4106 // 4107 // vector.body 4108 // i = phi [0, vector.ph], [i+4, vector.body] 4109 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4110 // v2 = a[i, i+1, i+2, i+3]; 4111 // v3 = vector(v1(3), v2(0, 1, 2)) 4112 // b[i, i+1, i+2, i+3] = v2 - v3 4113 // br cond, vector.body, middle.block 4114 // 4115 // middle.block: 4116 // x = v2(3) 4117 // br scalar.ph 4118 // 4119 // scalar.ph: 4120 // s_init = phi [x, middle.block], [a[-1], otherwise] 4121 // br scalar.body 4122 // 4123 // After execution completes the vector loop, we extract the next value of 4124 // the recurrence (x) to use as the initial value in the scalar loop. 4125 4126 // Get the original loop preheader and single loop latch. 4127 auto *Preheader = OrigLoop->getLoopPreheader(); 4128 auto *Latch = OrigLoop->getLoopLatch(); 4129 4130 // Get the initial and previous values of the scalar recurrence. 4131 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4132 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4133 4134 // Create a vector from the initial value. 4135 auto *VectorInit = ScalarInit; 4136 if (VF.isVector()) { 4137 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4138 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4139 VectorInit = Builder.CreateInsertElement( 4140 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4141 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4142 } 4143 4144 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4145 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4146 // We constructed a temporary phi node in the first phase of vectorization. 4147 // This phi node will eventually be deleted. 4148 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4149 4150 // Create a phi node for the new recurrence. The current value will either be 4151 // the initial value inserted into a vector or loop-varying vector value. 4152 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4153 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4154 4155 // Get the vectorized previous value of the last part UF - 1. 
It appears last 4156 // among all unrolled iterations, due to the order of their construction. 4157 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4158 4159 // Find and set the insertion point after the previous value if it is an 4160 // instruction. 4161 BasicBlock::iterator InsertPt; 4162 // Note that the previous value may have been constant-folded so it is not 4163 // guaranteed to be an instruction in the vector loop. 4164 // FIXME: Loop invariant values do not form recurrences. We should deal with 4165 // them earlier. 4166 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4167 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4168 else { 4169 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4170 if (isa<PHINode>(PreviousLastPart)) 4171 // If the previous value is a phi node, we should insert after all the phi 4172 // nodes in the block containing the PHI to avoid breaking basic block 4173 // verification. Note that the basic block may be different to 4174 // LoopVectorBody, in case we predicate the loop. 4175 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4176 else 4177 InsertPt = ++PreviousInst->getIterator(); 4178 } 4179 Builder.SetInsertPoint(&*InsertPt); 4180 4181 // We will construct a vector for the recurrence by combining the values for 4182 // the current and previous iterations. This is the required shuffle mask. 4183 assert(!VF.isScalable()); 4184 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4185 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4186 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4187 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4188 4189 // The vector from which to take the initial value for the current iteration 4190 // (actual or unrolled). Initially, this is the vector phi node. 4191 Value *Incoming = VecPhi; 4192 4193 // Shuffle the current and previous vector and update the vector parts. 4194 for (unsigned Part = 0; Part < UF; ++Part) { 4195 Value *PreviousPart = State.get(PreviousDef, Part); 4196 Value *PhiPart = State.get(PhiDef, Part); 4197 auto *Shuffle = 4198 VF.isVector() 4199 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4200 : Incoming; 4201 PhiPart->replaceAllUsesWith(Shuffle); 4202 cast<Instruction>(PhiPart)->eraseFromParent(); 4203 State.reset(PhiDef, Shuffle, Part); 4204 Incoming = PreviousPart; 4205 } 4206 4207 // Fix the latch value of the new recurrence in the vector loop. 4208 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4209 4210 // Extract the last vector element in the middle block. This will be the 4211 // initial value for the recurrence when jumping to the scalar loop. 4212 auto *ExtractForScalar = Incoming; 4213 if (VF.isVector()) { 4214 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4215 ExtractForScalar = Builder.CreateExtractElement( 4216 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4217 "vector.recur.extract"); 4218 } 4219 // Extract the second last element in the middle block if the 4220 // Phi is used outside the loop. We need to extract the phi itself 4221 // and not the last element (the phi update in the current iteration). This 4222 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4223 // when the scalar loop is not run at all. 
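  // For example (illustrative, VF = 4, UF = 1): if the final vector of the
  // previous value is <s4, s5, s6, s7>, the last lane (s7) seeds the scalar
  // loop's recurrence phi, while the second-to-last lane (s6) is the value
  // the phi itself held in the last vector iteration, which is what users
  // outside the loop need when the scalar loop does not run.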
4224 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4225 if (VF.isVector()) 4226 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4227 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4228 "vector.recur.extract.for.phi"); 4229 // When loop is unrolled without vectorizing, initialize 4230 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 4231 // `Incoming`. This is analogous to the vectorized case above: extracting the 4232 // second last element when VF > 1. 4233 else if (UF > 1) 4234 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4235 4236 // Fix the initial value of the original recurrence in the scalar loop. 4237 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4238 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4239 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4240 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4241 Start->addIncoming(Incoming, BB); 4242 } 4243 4244 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4245 Phi->setName("scalar.recur"); 4246 4247 // Finally, fix users of the recurrence outside the loop. The users will need 4248 // either the last value of the scalar recurrence or the last value of the 4249 // vector recurrence we extracted in the middle block. Since the loop is in 4250 // LCSSA form, we just need to find all the phi nodes for the original scalar 4251 // recurrence in the exit block, and then add an edge for the middle block. 4252 // Note that LCSSA does not imply single entry when the original scalar loop 4253 // had multiple exiting edges (as we always run the last iteration in the 4254 // scalar epilogue); in that case, the exiting path through middle will be 4255 // dynamically dead and the value picked for the phi doesn't matter. 4256 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4257 if (any_of(LCSSAPhi.incoming_values(), 4258 [Phi](Value *V) { return V == Phi; })) 4259 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4260 } 4261 4262 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) { 4263 // Get it's reduction variable descriptor. 4264 assert(Legal->isReductionVariable(Phi) && 4265 "Unable to find the reduction variable"); 4266 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4267 4268 RecurKind RK = RdxDesc.getRecurrenceKind(); 4269 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4270 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4271 setDebugLocFromInst(Builder, ReductionStartValue); 4272 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4273 4274 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); 4275 // This is the vector-clone of the value that leaves the loop. 4276 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4277 4278 // Wrap flags are in general invalid after vectorization, clear them. 4279 clearReductionWrapFlags(RdxDesc, State); 4280 4281 // Fix the vector-loop phi. 4282 4283 // Reductions do not have to start at zero. They can start with 4284 // any loop invariant values. 
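  // For example (illustrative shorthand, VF = 4, UF = 1, integer add):
  //   scalar:  s = phi [ 42, preheader ], [ s.next, latch ]
  //   vector:  vec.phi = phi [ <42, 0, 0, 0>, vector.ph ], [ vec.next, ... ]
  // i.e. the start value occupies a single lane and the remaining lanes hold
  // the identity of the reduction (0 for add); the latch edge is added below.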
4285 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4286 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4287 4288 for (unsigned Part = 0; Part < UF; ++Part) { 4289 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part); 4290 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part); 4291 cast<PHINode>(VecRdxPhi) 4292 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4293 } 4294 4295 // Before each round, move the insertion point right between 4296 // the PHIs and the values we are going to write. 4297 // This allows us to write both PHINodes and the extractelement 4298 // instructions. 4299 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4300 4301 setDebugLocFromInst(Builder, LoopExitInst); 4302 4303 Type *PhiTy = Phi->getType(); 4304 // If tail is folded by masking, the vector value to leave the loop should be 4305 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4306 // instead of the former. For an inloop reduction the reduction will already 4307 // be predicated, and does not need to be handled here. 4308 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4309 for (unsigned Part = 0; Part < UF; ++Part) { 4310 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4311 Value *Sel = nullptr; 4312 for (User *U : VecLoopExitInst->users()) { 4313 if (isa<SelectInst>(U)) { 4314 assert(!Sel && "Reduction exit feeding two selects"); 4315 Sel = U; 4316 } else 4317 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4318 } 4319 assert(Sel && "Reduction exit feeds no select"); 4320 State.reset(LoopExitInstDef, Sel, Part); 4321 4322 // If the target can create a predicated operator for the reduction at no 4323 // extra cost in the loop (for example a predicated vadd), it can be 4324 // cheaper for the select to remain in the loop than be sunk out of it, 4325 // and so use the select value for the phi instead of the old 4326 // LoopExitValue. 4327 if (PreferPredicatedReductionSelect || 4328 TTI->preferPredicatedReductionSelect( 4329 RdxDesc.getOpcode(), PhiTy, 4330 TargetTransformInfo::ReductionFlags())) { 4331 auto *VecRdxPhi = 4332 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part)); 4333 VecRdxPhi->setIncomingValueForBlock( 4334 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4335 } 4336 } 4337 } 4338 4339 // If the vector reduction can be performed in a smaller type, we truncate 4340 // then extend the loop exit value to enable InstCombine to evaluate the 4341 // entire expression in the smaller type. 4342 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4343 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4344 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4345 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4346 Builder.SetInsertPoint( 4347 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4348 VectorParts RdxParts(UF); 4349 for (unsigned Part = 0; Part < UF; ++Part) { 4350 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4351 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4352 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 4353 : Builder.CreateZExt(Trunc, VecTy); 4354 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4355 UI != RdxParts[Part]->user_end();) 4356 if (*UI != Trunc) { 4357 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4358 RdxParts[Part] = Extnd; 4359 } else { 4360 ++UI; 4361 } 4362 } 4363 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4364 for (unsigned Part = 0; Part < UF; ++Part) { 4365 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4366 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4367 } 4368 } 4369 4370 // Reduce all of the unrolled parts into a single vector. 4371 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4372 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4373 4374 // The middle block terminator has already been assigned a DebugLoc here (the 4375 // OrigLoop's single latch terminator). We want the whole middle block to 4376 // appear to execute on this line because: (a) it is all compiler generated, 4377 // (b) these instructions are always executed after evaluating the latch 4378 // conditional branch, and (c) other passes may add new predecessors which 4379 // terminate on this line. This is the easiest way to ensure we don't 4380 // accidentally cause an extra step back into the loop while debugging. 4381 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4382 { 4383 // Floating-point operations should have some FMF to enable the reduction. 4384 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4385 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4386 for (unsigned Part = 1; Part < UF; ++Part) { 4387 Value *RdxPart = State.get(LoopExitInstDef, Part); 4388 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4389 ReducedPartRdx = Builder.CreateBinOp( 4390 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4391 } else { 4392 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4393 } 4394 } 4395 } 4396 4397 // Create the reduction after the loop. Note that inloop reductions create the 4398 // target reduction in the loop using a Reduction recipe. 4399 if (VF.isVector() && !IsInLoopReductionPhi) { 4400 ReducedPartRdx = 4401 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4402 // If the reduction can be performed in a smaller type, we need to extend 4403 // the reduction to the wider type before we branch to the original loop. 4404 if (PhiTy != RdxDesc.getRecurrenceType()) 4405 ReducedPartRdx = RdxDesc.isSigned() 4406 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4407 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4408 } 4409 4410 // Create a phi node that merges control-flow from the backedge-taken check 4411 // block and the middle block. 4412 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4413 LoopScalarPreHeader->getTerminator()); 4414 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4415 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4416 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4417 4418 // Now, we need to fix the users of the reduction variable 4419 // inside and outside of the scalar remainder loop. 4420 4421 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4422 // in the exit blocks. See comment on analogous loop in 4423 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
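  // For example (illustrative names), an exit phi such as
  //   %sum.lcssa = phi i32 [ %sum.next, %loop.latch ]
  // receives an extra incoming value from the middle block:
  //   %sum.lcssa = phi i32 [ %sum.next, %loop.latch ], [ %rdx, %middle.block ]
  // where %rdx is the reduced value produced above.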
4424 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4425 if (any_of(LCSSAPhi.incoming_values(), 4426 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4427 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4428 4429 // Fix the scalar loop reduction variable with the incoming reduction sum 4430 // from the vector body and from the backedge value. 4431 int IncomingEdgeBlockIdx = 4432 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4433 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4434 // Pick the other block. 4435 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4436 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4437 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4438 } 4439 4440 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4441 VPTransformState &State) { 4442 RecurKind RK = RdxDesc.getRecurrenceKind(); 4443 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4444 return; 4445 4446 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4447 assert(LoopExitInstr && "null loop exit instruction"); 4448 SmallVector<Instruction *, 8> Worklist; 4449 SmallPtrSet<Instruction *, 8> Visited; 4450 Worklist.push_back(LoopExitInstr); 4451 Visited.insert(LoopExitInstr); 4452 4453 while (!Worklist.empty()) { 4454 Instruction *Cur = Worklist.pop_back_val(); 4455 if (isa<OverflowingBinaryOperator>(Cur)) 4456 for (unsigned Part = 0; Part < UF; ++Part) { 4457 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4458 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4459 } 4460 4461 for (User *U : Cur->users()) { 4462 Instruction *UI = cast<Instruction>(U); 4463 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4464 Visited.insert(UI).second) 4465 Worklist.push_back(UI); 4466 } 4467 } 4468 } 4469 4470 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4471 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4472 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4473 // Some phis were already hand updated by the reduction and recurrence 4474 // code above, leave them alone. 4475 continue; 4476 4477 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4478 // Non-instruction incoming values will have only one value. 4479 4480 VPLane Lane = VPLane::getFirstLane(); 4481 if (isa<Instruction>(IncomingValue) && 4482 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4483 VF)) 4484 Lane = VPLane::getLastLaneForVF(VF); 4485 4486 // Can be a loop invariant incoming value or the last scalar value to be 4487 // extracted from the vectorized loop. 4488 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4489 Value *lastIncomingValue = 4490 OrigLoop->isLoopInvariant(IncomingValue) 4491 ? IncomingValue 4492 : State.get(State.Plan->getVPValue(IncomingValue), 4493 VPIteration(UF - 1, Lane)); 4494 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4495 } 4496 } 4497 4498 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4499 // The basic block and loop containing the predicated instruction. 4500 auto *PredBB = PredInst->getParent(); 4501 auto *VectorLoop = LI->getLoopFor(PredBB); 4502 4503 // Initialize a worklist with the operands of the predicated instruction. 4504 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4505 4506 // Holds instructions that we need to analyze again. An instruction may be 4507 // reanalyzed if we don't yet know if we can sink it or not. 
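  // Sketch of the fixed-point iteration below: starting from the operands of
  // the predicated instruction, any loop-resident, side-effect-free
  // instruction whose uses all lie inside the predicated block is moved into
  // that block and its own operands are queued; anything else is retried on
  // the next pass, and the loop stops once a full pass sinks nothing.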
4508 SmallVector<Instruction *, 8> InstsToReanalyze; 4509 4510 // Returns true if a given use occurs in the predicated block. Phi nodes use 4511 // their operands in their corresponding predecessor blocks. 4512 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4513 auto *I = cast<Instruction>(U.getUser()); 4514 BasicBlock *BB = I->getParent(); 4515 if (auto *Phi = dyn_cast<PHINode>(I)) 4516 BB = Phi->getIncomingBlock( 4517 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4518 return BB == PredBB; 4519 }; 4520 4521 // Iteratively sink the scalarized operands of the predicated instruction 4522 // into the block we created for it. When an instruction is sunk, it's 4523 // operands are then added to the worklist. The algorithm ends after one pass 4524 // through the worklist doesn't sink a single instruction. 4525 bool Changed; 4526 do { 4527 // Add the instructions that need to be reanalyzed to the worklist, and 4528 // reset the changed indicator. 4529 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4530 InstsToReanalyze.clear(); 4531 Changed = false; 4532 4533 while (!Worklist.empty()) { 4534 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4535 4536 // We can't sink an instruction if it is a phi node, is already in the 4537 // predicated block, is not in the loop, or may have side effects. 4538 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4539 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4540 continue; 4541 4542 // It's legal to sink the instruction if all its uses occur in the 4543 // predicated block. Otherwise, there's nothing to do yet, and we may 4544 // need to reanalyze the instruction. 4545 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4546 InstsToReanalyze.push_back(I); 4547 continue; 4548 } 4549 4550 // Move the instruction to the beginning of the predicated block, and add 4551 // it's operands to the worklist. 4552 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4553 Worklist.insert(I->op_begin(), I->op_end()); 4554 4555 // The sinking may have enabled other instructions to be sunk, so we will 4556 // need to iterate. 4557 Changed = true; 4558 } 4559 } while (Changed); 4560 } 4561 4562 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4563 for (PHINode *OrigPhi : OrigPHIsToFix) { 4564 VPWidenPHIRecipe *VPPhi = 4565 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4566 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4567 // Make sure the builder has a valid insert point. 4568 Builder.SetInsertPoint(NewPhi); 4569 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4570 VPValue *Inc = VPPhi->getIncomingValue(i); 4571 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4572 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4573 } 4574 } 4575 } 4576 4577 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4578 VPUser &Operands, unsigned UF, 4579 ElementCount VF, bool IsPtrLoopInvariant, 4580 SmallBitVector &IsIndexLoopInvariant, 4581 VPTransformState &State) { 4582 // Construct a vector GEP by widening the operands of the scalar GEP as 4583 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4584 // results in a vector of pointers when at least one operand of the GEP 4585 // is vector-typed. Thus, to keep the representation compact, we only use 4586 // vector-typed operands for loop-varying values. 
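  // For example (illustrative, VF = 4): a scalar GEP such as
  //   %g = getelementptr inbounds i32, i32* %base, i64 %iv
  // with a loop-invariant %base and a loop-varying %iv is widened to
  //   %g.vec = getelementptr inbounds i32, i32* %base, <4 x i64> %iv.vec
  // which produces a <4 x i32*> vector of pointers.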
4587 4588 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4589 // If we are vectorizing, but the GEP has only loop-invariant operands, 4590 // the GEP we build (by only using vector-typed operands for 4591 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4592 // produce a vector of pointers, we need to either arbitrarily pick an 4593 // operand to broadcast, or broadcast a clone of the original GEP. 4594 // Here, we broadcast a clone of the original. 4595 // 4596 // TODO: If at some point we decide to scalarize instructions having 4597 // loop-invariant operands, this special case will no longer be 4598 // required. We would add the scalarization decision to 4599 // collectLoopScalars() and teach getVectorValue() to broadcast 4600 // the lane-zero scalar value. 4601 auto *Clone = Builder.Insert(GEP->clone()); 4602 for (unsigned Part = 0; Part < UF; ++Part) { 4603 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4604 State.set(VPDef, EntryPart, Part); 4605 addMetadata(EntryPart, GEP); 4606 } 4607 } else { 4608 // If the GEP has at least one loop-varying operand, we are sure to 4609 // produce a vector of pointers. But if we are only unrolling, we want 4610 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4611 // produce with the code below will be scalar (if VF == 1) or vector 4612 // (otherwise). Note that for the unroll-only case, we still maintain 4613 // values in the vector mapping with initVector, as we do for other 4614 // instructions. 4615 for (unsigned Part = 0; Part < UF; ++Part) { 4616 // The pointer operand of the new GEP. If it's loop-invariant, we 4617 // won't broadcast it. 4618 auto *Ptr = IsPtrLoopInvariant 4619 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4620 : State.get(Operands.getOperand(0), Part); 4621 4622 // Collect all the indices for the new GEP. If any index is 4623 // loop-invariant, we won't broadcast it. 4624 SmallVector<Value *, 4> Indices; 4625 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4626 VPValue *Operand = Operands.getOperand(I); 4627 if (IsIndexLoopInvariant[I - 1]) 4628 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4629 else 4630 Indices.push_back(State.get(Operand, Part)); 4631 } 4632 4633 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4634 // but it should be a vector, otherwise. 4635 auto *NewGEP = 4636 GEP->isInBounds() 4637 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4638 Indices) 4639 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4640 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4641 "NewGEP is not a pointer vector"); 4642 State.set(VPDef, NewGEP, Part); 4643 addMetadata(NewGEP, GEP); 4644 } 4645 } 4646 } 4647 4648 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4649 RecurrenceDescriptor *RdxDesc, 4650 VPValue *StartVPV, VPValue *Def, 4651 VPTransformState &State) { 4652 PHINode *P = cast<PHINode>(PN); 4653 if (EnableVPlanNativePath) { 4654 // Currently we enter here in the VPlan-native path for non-induction 4655 // PHIs where all control flow is uniform. We simply widen these PHIs. 4656 // Create a vector phi with no operands - the vector phi operands will be 4657 // set at the end of vector code generation. 4658 Type *VecTy = (State.VF.isScalar()) 4659 ? 
PN->getType()
                      : VectorType::get(PN->getType(), State.VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    State.set(Def, VecPhi, 0);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
    Value *Iden = nullptr;
    bool ScalarPHI =
        (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
    Type *VecTy =
        ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);

    if (RdxDesc) {
      assert(Legal->isReductionVariable(P) && StartV &&
             "RdxDesc should only be set for reduction variables; in that case "
             "a StartV is also required");
      RecurKind RK = RdxDesc->getRecurrenceKind();
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
        // MinMax reductions have the start value as their identity.
        if (ScalarPHI) {
          Iden = StartV;
        } else {
          IRBuilderBase::InsertPointGuard IPBuilder(Builder);
          Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
          StartV = Iden =
              Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
        }
      } else {
        Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
            RK, VecTy->getScalarType());
        Iden = IdenC;

        if (!ScalarPHI) {
          Iden = ConstantVector::getSplat(State.VF, IdenC);
          IRBuilderBase::InsertPointGuard IPBuilder(Builder);
          Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
          Constant *Zero = Builder.getInt32(0);
          StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
        }
      }
    }

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      State.set(Def, EntryPart, Part);
      if (StartV) {
        // Make sure to add the reduction start value only to the
        // first unroll part.
        Value *StartVal = (Part == 0) ? StartV : Iden;
        cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
      }
    }
    return;
  }

  assert(!Legal->isReductionVariable(P) &&
         "reductions should be handled above");

  setDebugLocFromInst(Builder, P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars().count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars().lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
4742 switch (II.getKind()) { 4743 case InductionDescriptor::IK_NoInduction: 4744 llvm_unreachable("Unknown induction"); 4745 case InductionDescriptor::IK_IntInduction: 4746 case InductionDescriptor::IK_FpInduction: 4747 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4748 case InductionDescriptor::IK_PtrInduction: { 4749 // Handle the pointer induction variable case. 4750 assert(P->getType()->isPointerTy() && "Unexpected type."); 4751 assert(!VF.isScalable() && "Currently unsupported for scalable vectors"); 4752 4753 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4754 // This is the normalized GEP that starts counting at zero. 4755 Value *PtrInd = 4756 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4757 // Determine the number of scalars we need to generate for each unroll 4758 // iteration. If the instruction is uniform, we only need to generate the 4759 // first lane. Otherwise, we generate all VF values. 4760 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF) 4761 ? 1 4762 : State.VF.getKnownMinValue(); 4763 for (unsigned Part = 0; Part < UF; ++Part) { 4764 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4765 Constant *Idx = ConstantInt::get( 4766 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue()); 4767 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4768 Value *SclrGep = 4769 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4770 SclrGep->setName("next.gep"); 4771 State.set(Def, SclrGep, VPIteration(Part, Lane)); 4772 } 4773 } 4774 return; 4775 } 4776 assert(isa<SCEVConstant>(II.getStep()) && 4777 "Induction step not a SCEV constant!"); 4778 Type *PhiType = II.getStep()->getType(); 4779 4780 // Build a pointer phi 4781 Value *ScalarStartValue = II.getStartValue(); 4782 Type *ScStValueType = ScalarStartValue->getType(); 4783 PHINode *NewPointerPhi = 4784 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4785 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4786 4787 // A pointer induction, performed by using a gep 4788 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4789 Instruction *InductionLoc = LoopLatch->getTerminator(); 4790 const SCEV *ScalarStep = II.getStep(); 4791 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4792 Value *ScalarStepValue = 4793 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4794 Value *InductionGEP = GetElementPtrInst::Create( 4795 ScStValueType->getPointerElementType(), NewPointerPhi, 4796 Builder.CreateMul( 4797 ScalarStepValue, 4798 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)), 4799 "ptr.ind", InductionLoc); 4800 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4801 4802 // Create UF many actual address geps that use the pointer 4803 // phi as base and a vectorized version of the step value 4804 // (<step*0, ..., step*N>) as offset. 4805 for (unsigned Part = 0; Part < State.UF; ++Part) { 4806 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4807 Value *StartOffset = 4808 ConstantInt::get(VecPhiType, Part * State.VF.getKnownMinValue()); 4809 // Create a vector of consecutive numbers from zero to VF. 
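      // For example (illustrative, VF = 4, UF = 2): Part 1 starts from the
      // splat <4, 4, 4, 4>, and adding the step vector <0, 1, 2, 3> yields
      // the lane offsets <4, 5, 6, 7>, which are scaled by the pointer step
      // below.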
4810 StartOffset = 4811 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4812 4813 Value *GEP = Builder.CreateGEP( 4814 ScStValueType->getPointerElementType(), NewPointerPhi, 4815 Builder.CreateMul(StartOffset, 4816 Builder.CreateVectorSplat( 4817 State.VF.getKnownMinValue(), ScalarStepValue), 4818 "vector.gep")); 4819 State.set(Def, GEP, Part); 4820 } 4821 } 4822 } 4823 } 4824 4825 /// A helper function for checking whether an integer division-related 4826 /// instruction may divide by zero (in which case it must be predicated if 4827 /// executed conditionally in the scalar code). 4828 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4829 /// Non-zero divisors that are non compile-time constants will not be 4830 /// converted into multiplication, so we will still end up scalarizing 4831 /// the division, but can do so w/o predication. 4832 static bool mayDivideByZero(Instruction &I) { 4833 assert((I.getOpcode() == Instruction::UDiv || 4834 I.getOpcode() == Instruction::SDiv || 4835 I.getOpcode() == Instruction::URem || 4836 I.getOpcode() == Instruction::SRem) && 4837 "Unexpected instruction"); 4838 Value *Divisor = I.getOperand(1); 4839 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4840 return !CInt || CInt->isZero(); 4841 } 4842 4843 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4844 VPUser &User, 4845 VPTransformState &State) { 4846 switch (I.getOpcode()) { 4847 case Instruction::Call: 4848 case Instruction::Br: 4849 case Instruction::PHI: 4850 case Instruction::GetElementPtr: 4851 case Instruction::Select: 4852 llvm_unreachable("This instruction is handled by a different recipe."); 4853 case Instruction::UDiv: 4854 case Instruction::SDiv: 4855 case Instruction::SRem: 4856 case Instruction::URem: 4857 case Instruction::Add: 4858 case Instruction::FAdd: 4859 case Instruction::Sub: 4860 case Instruction::FSub: 4861 case Instruction::FNeg: 4862 case Instruction::Mul: 4863 case Instruction::FMul: 4864 case Instruction::FDiv: 4865 case Instruction::FRem: 4866 case Instruction::Shl: 4867 case Instruction::LShr: 4868 case Instruction::AShr: 4869 case Instruction::And: 4870 case Instruction::Or: 4871 case Instruction::Xor: { 4872 // Just widen unops and binops. 4873 setDebugLocFromInst(Builder, &I); 4874 4875 for (unsigned Part = 0; Part < UF; ++Part) { 4876 SmallVector<Value *, 2> Ops; 4877 for (VPValue *VPOp : User.operands()) 4878 Ops.push_back(State.get(VPOp, Part)); 4879 4880 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4881 4882 if (auto *VecOp = dyn_cast<Instruction>(V)) 4883 VecOp->copyIRFlags(&I); 4884 4885 // Use this vector value for all users of the original instruction. 4886 State.set(Def, V, Part); 4887 addMetadata(V, &I); 4888 } 4889 4890 break; 4891 } 4892 case Instruction::ICmp: 4893 case Instruction::FCmp: { 4894 // Widen compares. Generate vector compares. 4895 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4896 auto *Cmp = cast<CmpInst>(&I); 4897 setDebugLocFromInst(Builder, Cmp); 4898 for (unsigned Part = 0; Part < UF; ++Part) { 4899 Value *A = State.get(User.getOperand(0), Part); 4900 Value *B = State.get(User.getOperand(1), Part); 4901 Value *C = nullptr; 4902 if (FCmp) { 4903 // Propagate fast math flags. 
4904 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4905 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4906 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4907 } else { 4908 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4909 } 4910 State.set(Def, C, Part); 4911 addMetadata(C, &I); 4912 } 4913 4914 break; 4915 } 4916 4917 case Instruction::ZExt: 4918 case Instruction::SExt: 4919 case Instruction::FPToUI: 4920 case Instruction::FPToSI: 4921 case Instruction::FPExt: 4922 case Instruction::PtrToInt: 4923 case Instruction::IntToPtr: 4924 case Instruction::SIToFP: 4925 case Instruction::UIToFP: 4926 case Instruction::Trunc: 4927 case Instruction::FPTrunc: 4928 case Instruction::BitCast: { 4929 auto *CI = cast<CastInst>(&I); 4930 setDebugLocFromInst(Builder, CI); 4931 4932 /// Vectorize casts. 4933 Type *DestTy = 4934 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4935 4936 for (unsigned Part = 0; Part < UF; ++Part) { 4937 Value *A = State.get(User.getOperand(0), Part); 4938 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4939 State.set(Def, Cast, Part); 4940 addMetadata(Cast, &I); 4941 } 4942 break; 4943 } 4944 default: 4945 // This instruction is not vectorized by simple widening. 4946 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4947 llvm_unreachable("Unhandled instruction!"); 4948 } // end of switch. 4949 } 4950 4951 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4952 VPUser &ArgOperands, 4953 VPTransformState &State) { 4954 assert(!isa<DbgInfoIntrinsic>(I) && 4955 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4956 setDebugLocFromInst(Builder, &I); 4957 4958 Module *M = I.getParent()->getParent()->getParent(); 4959 auto *CI = cast<CallInst>(&I); 4960 4961 SmallVector<Type *, 4> Tys; 4962 for (Value *ArgOperand : CI->arg_operands()) 4963 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4964 4965 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4966 4967 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4968 // version of the instruction. 4969 // Is it beneficial to perform intrinsic call compared to lib call? 4970 bool NeedToScalarize = false; 4971 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4972 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4973 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4974 assert((UseVectorIntrinsic || !NeedToScalarize) && 4975 "Instruction should be scalarized elsewhere."); 4976 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4977 "Either the intrinsic cost or vector call cost must be valid"); 4978 4979 for (unsigned Part = 0; Part < UF; ++Part) { 4980 SmallVector<Value *, 4> Args; 4981 for (auto &I : enumerate(ArgOperands.operands())) { 4982 // Some intrinsics have a scalar argument - don't replace it with a 4983 // vector. 4984 Value *Arg; 4985 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4986 Arg = State.get(I.value(), Part); 4987 else 4988 Arg = State.get(I.value(), VPIteration(0, 0)); 4989 Args.push_back(Arg); 4990 } 4991 4992 Function *VectorF; 4993 if (UseVectorIntrinsic) { 4994 // Use vector version of the intrinsic. 
4995 Type *TysForDecl[] = {CI->getType()}; 4996 if (VF.isVector()) 4997 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4998 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4999 assert(VectorF && "Can't retrieve vector intrinsic."); 5000 } else { 5001 // Use vector version of the function call. 5002 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5003 #ifndef NDEBUG 5004 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5005 "Can't create vector function."); 5006 #endif 5007 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5008 } 5009 SmallVector<OperandBundleDef, 1> OpBundles; 5010 CI->getOperandBundlesAsDefs(OpBundles); 5011 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5012 5013 if (isa<FPMathOperator>(V)) 5014 V->copyFastMathFlags(CI); 5015 5016 State.set(Def, V, Part); 5017 addMetadata(V, &I); 5018 } 5019 } 5020 5021 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5022 VPUser &Operands, 5023 bool InvariantCond, 5024 VPTransformState &State) { 5025 setDebugLocFromInst(Builder, &I); 5026 5027 // The condition can be loop invariant but still defined inside the 5028 // loop. This means that we can't just use the original 'cond' value. 5029 // We have to take the 'vectorized' value and pick the first lane. 5030 // Instcombine will make this a no-op. 5031 auto *InvarCond = InvariantCond 5032 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5033 : nullptr; 5034 5035 for (unsigned Part = 0; Part < UF; ++Part) { 5036 Value *Cond = 5037 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5038 Value *Op0 = State.get(Operands.getOperand(1), Part); 5039 Value *Op1 = State.get(Operands.getOperand(2), Part); 5040 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5041 State.set(VPDef, Sel, Part); 5042 addMetadata(Sel, &I); 5043 } 5044 } 5045 5046 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5047 // We should not collect Scalars more than once per VF. Right now, this 5048 // function is called from collectUniformsAndScalars(), which already does 5049 // this check. Collecting Scalars for VF=1 does not make any sense. 5050 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5051 "This function should not be visited twice for the same VF"); 5052 5053 SmallSetVector<Instruction *, 8> Worklist; 5054 5055 // These sets are used to seed the analysis with pointers used by memory 5056 // accesses that will remain scalar. 5057 SmallSetVector<Instruction *, 8> ScalarPtrs; 5058 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5059 auto *Latch = TheLoop->getLoopLatch(); 5060 5061 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5062 // The pointer operands of loads and stores will be scalar as long as the 5063 // memory access is not a gather or scatter operation. The value operand of a 5064 // store will remain scalar if the store is scalarized. 
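  // For example (illustrative): in a loop doing a[i] += b[i], the address
  // operands of the load of b[i] and of the load/store of a[i] remain scalar
  // when those accesses are widened consecutively or scalarized, but not when
  // they become gathers/scatters; the stored value itself remains scalar only
  // if the store is scalarized.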
5065 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5066 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5067 assert(WideningDecision != CM_Unknown && 5068 "Widening decision should be ready at this moment"); 5069 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5070 if (Ptr == Store->getValueOperand()) 5071 return WideningDecision == CM_Scalarize; 5072 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5073 "Ptr is neither a value or pointer operand"); 5074 return WideningDecision != CM_GatherScatter; 5075 }; 5076 5077 // A helper that returns true if the given value is a bitcast or 5078 // getelementptr instruction contained in the loop. 5079 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5080 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5081 isa<GetElementPtrInst>(V)) && 5082 !TheLoop->isLoopInvariant(V); 5083 }; 5084 5085 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5086 if (!isa<PHINode>(Ptr) || 5087 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5088 return false; 5089 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5090 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5091 return false; 5092 return isScalarUse(MemAccess, Ptr); 5093 }; 5094 5095 // A helper that evaluates a memory access's use of a pointer. If the 5096 // pointer is actually the pointer induction of a loop, it is being 5097 // inserted into Worklist. If the use will be a scalar use, and the 5098 // pointer is only used by memory accesses, we place the pointer in 5099 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5100 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5101 if (isScalarPtrInduction(MemAccess, Ptr)) { 5102 Worklist.insert(cast<Instruction>(Ptr)); 5103 Instruction *Update = cast<Instruction>( 5104 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5105 Worklist.insert(Update); 5106 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5107 << "\n"); 5108 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5109 << "\n"); 5110 return; 5111 } 5112 // We only care about bitcast and getelementptr instructions contained in 5113 // the loop. 5114 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5115 return; 5116 5117 // If the pointer has already been identified as scalar (e.g., if it was 5118 // also identified as uniform), there's nothing to do. 5119 auto *I = cast<Instruction>(Ptr); 5120 if (Worklist.count(I)) 5121 return; 5122 5123 // If the use of the pointer will be a scalar use, and all users of the 5124 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5125 // place the pointer in PossibleNonScalarPtrs. 5126 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5127 return isa<LoadInst>(U) || isa<StoreInst>(U); 5128 })) 5129 ScalarPtrs.insert(I); 5130 else 5131 PossibleNonScalarPtrs.insert(I); 5132 }; 5133 5134 // We seed the scalars analysis with three classes of instructions: (1) 5135 // instructions marked uniform-after-vectorization and (2) bitcast, 5136 // getelementptr and (pointer) phi instructions used by memory accesses 5137 // requiring a scalar use. 5138 // 5139 // (1) Add to the worklist all instructions that have been identified as 5140 // uniform-after-vectorization. 
5141 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5142 5143 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5144 // memory accesses requiring a scalar use. The pointer operands of loads and 5145 // stores will be scalar as long as the memory accesses is not a gather or 5146 // scatter operation. The value operand of a store will remain scalar if the 5147 // store is scalarized. 5148 for (auto *BB : TheLoop->blocks()) 5149 for (auto &I : *BB) { 5150 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5151 evaluatePtrUse(Load, Load->getPointerOperand()); 5152 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5153 evaluatePtrUse(Store, Store->getPointerOperand()); 5154 evaluatePtrUse(Store, Store->getValueOperand()); 5155 } 5156 } 5157 for (auto *I : ScalarPtrs) 5158 if (!PossibleNonScalarPtrs.count(I)) { 5159 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5160 Worklist.insert(I); 5161 } 5162 5163 // Insert the forced scalars. 5164 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5165 // induction variable when the PHI user is scalarized. 5166 auto ForcedScalar = ForcedScalars.find(VF); 5167 if (ForcedScalar != ForcedScalars.end()) 5168 for (auto *I : ForcedScalar->second) 5169 Worklist.insert(I); 5170 5171 // Expand the worklist by looking through any bitcasts and getelementptr 5172 // instructions we've already identified as scalar. This is similar to the 5173 // expansion step in collectLoopUniforms(); however, here we're only 5174 // expanding to include additional bitcasts and getelementptr instructions. 5175 unsigned Idx = 0; 5176 while (Idx != Worklist.size()) { 5177 Instruction *Dst = Worklist[Idx++]; 5178 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5179 continue; 5180 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5181 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5182 auto *J = cast<Instruction>(U); 5183 return !TheLoop->contains(J) || Worklist.count(J) || 5184 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5185 isScalarUse(J, Src)); 5186 })) { 5187 Worklist.insert(Src); 5188 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5189 } 5190 } 5191 5192 // An induction variable will remain scalar if all users of the induction 5193 // variable and induction variable update remain scalar. 5194 for (auto &Induction : Legal->getInductionVars()) { 5195 auto *Ind = Induction.first; 5196 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5197 5198 // If tail-folding is applied, the primary induction variable will be used 5199 // to feed a vector compare. 5200 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5201 continue; 5202 5203 // Determine if all users of the induction variable are scalar after 5204 // vectorization. 5205 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5206 auto *I = cast<Instruction>(U); 5207 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5208 }); 5209 if (!ScalarInd) 5210 continue; 5211 5212 // Determine if all users of the induction variable update instruction are 5213 // scalar after vectorization. 5214 auto ScalarIndUpdate = 5215 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5216 auto *I = cast<Instruction>(U); 5217 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5218 }); 5219 if (!ScalarIndUpdate) 5220 continue; 5221 5222 // The induction variable and its update instruction will remain scalar. 
5223 Worklist.insert(Ind); 5224 Worklist.insert(IndUpdate); 5225 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5226 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5227 << "\n"); 5228 } 5229 5230 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5231 } 5232 5233 bool LoopVectorizationCostModel::isScalarWithPredication( 5234 Instruction *I, ElementCount VF) const { 5235 if (!blockNeedsPredication(I->getParent())) 5236 return false; 5237 switch(I->getOpcode()) { 5238 default: 5239 break; 5240 case Instruction::Load: 5241 case Instruction::Store: { 5242 if (!Legal->isMaskRequired(I)) 5243 return false; 5244 auto *Ptr = getLoadStorePointerOperand(I); 5245 auto *Ty = getMemInstValueType(I); 5246 // We have already decided how to vectorize this instruction, get that 5247 // result. 5248 if (VF.isVector()) { 5249 InstWidening WideningDecision = getWideningDecision(I, VF); 5250 assert(WideningDecision != CM_Unknown && 5251 "Widening decision should be ready at this moment"); 5252 return WideningDecision == CM_Scalarize; 5253 } 5254 const Align Alignment = getLoadStoreAlignment(I); 5255 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5256 isLegalMaskedGather(Ty, Alignment)) 5257 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5258 isLegalMaskedScatter(Ty, Alignment)); 5259 } 5260 case Instruction::UDiv: 5261 case Instruction::SDiv: 5262 case Instruction::SRem: 5263 case Instruction::URem: 5264 return mayDivideByZero(*I); 5265 } 5266 return false; 5267 } 5268 5269 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5270 Instruction *I, ElementCount VF) { 5271 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5272 assert(getWideningDecision(I, VF) == CM_Unknown && 5273 "Decision should not be set yet."); 5274 auto *Group = getInterleavedAccessGroup(I); 5275 assert(Group && "Must have a group."); 5276 5277 // If the instruction's allocated size doesn't equal it's type size, it 5278 // requires padding and will be scalarized. 5279 auto &DL = I->getModule()->getDataLayout(); 5280 auto *ScalarTy = getMemInstValueType(I); 5281 if (hasIrregularType(ScalarTy, DL)) 5282 return false; 5283 5284 // Check if masking is required. 5285 // A Group may need masking for one of two reasons: it resides in a block that 5286 // needs predication, or it was decided to use masking to deal with gaps. 5287 bool PredicatedAccessRequiresMasking = 5288 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5289 bool AccessWithGapsRequiresMasking = 5290 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5291 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5292 return true; 5293 5294 // If masked interleaving is required, we expect that the user/target had 5295 // enabled it, because otherwise it either wouldn't have been created or 5296 // it should have been invalidated by the CostModel. 5297 assert(useMaskedInterleavedAccesses(TTI) && 5298 "Masked interleave-groups for predicated accesses are not enabled."); 5299 5300 auto *Ty = getMemInstValueType(I); 5301 const Align Alignment = getLoadStoreAlignment(I); 5302 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5303 : TTI.isLegalMaskedStore(Ty, Alignment); 5304 } 5305 5306 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5307 Instruction *I, ElementCount VF) { 5308 // Get and ensure we have a valid memory instruction. 
5309 LoadInst *LI = dyn_cast<LoadInst>(I); 5310 StoreInst *SI = dyn_cast<StoreInst>(I); 5311 assert((LI || SI) && "Invalid memory instruction"); 5312 5313 auto *Ptr = getLoadStorePointerOperand(I); 5314 5315 // In order to be widened, the pointer should be consecutive, first of all. 5316 if (!Legal->isConsecutivePtr(Ptr)) 5317 return false; 5318 5319 // If the instruction is a store located in a predicated block, it will be 5320 // scalarized. 5321 if (isScalarWithPredication(I)) 5322 return false; 5323 5324 // If the instruction's allocated size doesn't equal it's type size, it 5325 // requires padding and will be scalarized. 5326 auto &DL = I->getModule()->getDataLayout(); 5327 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5328 if (hasIrregularType(ScalarTy, DL)) 5329 return false; 5330 5331 return true; 5332 } 5333 5334 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5335 // We should not collect Uniforms more than once per VF. Right now, 5336 // this function is called from collectUniformsAndScalars(), which 5337 // already does this check. Collecting Uniforms for VF=1 does not make any 5338 // sense. 5339 5340 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5341 "This function should not be visited twice for the same VF"); 5342 5343 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5344 // not analyze again. Uniforms.count(VF) will return 1. 5345 Uniforms[VF].clear(); 5346 5347 // We now know that the loop is vectorizable! 5348 // Collect instructions inside the loop that will remain uniform after 5349 // vectorization. 5350 5351 // Global values, params and instructions outside of current loop are out of 5352 // scope. 5353 auto isOutOfScope = [&](Value *V) -> bool { 5354 Instruction *I = dyn_cast<Instruction>(V); 5355 return (!I || !TheLoop->contains(I)); 5356 }; 5357 5358 SetVector<Instruction *> Worklist; 5359 BasicBlock *Latch = TheLoop->getLoopLatch(); 5360 5361 // Instructions that are scalar with predication must not be considered 5362 // uniform after vectorization, because that would create an erroneous 5363 // replicating region where only a single instance out of VF should be formed. 5364 // TODO: optimize such seldom cases if found important, see PR40816. 5365 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5366 if (isOutOfScope(I)) { 5367 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5368 << *I << "\n"); 5369 return; 5370 } 5371 if (isScalarWithPredication(I, VF)) { 5372 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5373 << *I << "\n"); 5374 return; 5375 } 5376 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5377 Worklist.insert(I); 5378 }; 5379 5380 // Start with the conditional branch. If the branch condition is an 5381 // instruction contained in the loop that is only used by the branch, it is 5382 // uniform. 5383 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5384 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5385 addToWorklistIfAllowed(Cmp); 5386 5387 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5388 InstWidening WideningDecision = getWideningDecision(I, VF); 5389 assert(WideningDecision != CM_Unknown && 5390 "Widening decision should be ready at this moment"); 5391 5392 // A uniform memory op is itself uniform. We exclude uniform stores 5393 // here as they demand the last lane, not the first one. 
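  // For example (illustrative): a load 'x = *p' from a loop-invariant address
  // reads the same location on every iteration, so only the first lane is
  // needed; a store '*p = x' to an invariant address must keep the value of
  // the final iteration, i.e. the last lane, and is therefore excluded here.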
5394 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5395 assert(WideningDecision == CM_Scalarize); 5396 return true; 5397 } 5398 5399 return (WideningDecision == CM_Widen || 5400 WideningDecision == CM_Widen_Reverse || 5401 WideningDecision == CM_Interleave); 5402 }; 5403 5404 5405 // Returns true if Ptr is the pointer operand of a memory access instruction 5406 // I, and I is known to not require scalarization. 5407 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5408 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5409 }; 5410 5411 // Holds a list of values which are known to have at least one uniform use. 5412 // Note that there may be other uses which aren't uniform. A "uniform use" 5413 // here is something which only demands lane 0 of the unrolled iterations; 5414 // it does not imply that all lanes produce the same value (e.g. this is not 5415 // the usual meaning of uniform) 5416 SetVector<Value *> HasUniformUse; 5417 5418 // Scan the loop for instructions which are either a) known to have only 5419 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5420 for (auto *BB : TheLoop->blocks()) 5421 for (auto &I : *BB) { 5422 // If there's no pointer operand, there's nothing to do. 5423 auto *Ptr = getLoadStorePointerOperand(&I); 5424 if (!Ptr) 5425 continue; 5426 5427 // A uniform memory op is itself uniform. We exclude uniform stores 5428 // here as they demand the last lane, not the first one. 5429 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5430 addToWorklistIfAllowed(&I); 5431 5432 if (isUniformDecision(&I, VF)) { 5433 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5434 HasUniformUse.insert(Ptr); 5435 } 5436 } 5437 5438 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5439 // demanding) users. Since loops are assumed to be in LCSSA form, this 5440 // disallows uses outside the loop as well. 5441 for (auto *V : HasUniformUse) { 5442 if (isOutOfScope(V)) 5443 continue; 5444 auto *I = cast<Instruction>(V); 5445 auto UsersAreMemAccesses = 5446 llvm::all_of(I->users(), [&](User *U) -> bool { 5447 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5448 }); 5449 if (UsersAreMemAccesses) 5450 addToWorklistIfAllowed(I); 5451 } 5452 5453 // Expand Worklist in topological order: whenever a new instruction 5454 // is added , its users should be already inside Worklist. It ensures 5455 // a uniform instruction will only be used by uniform instructions. 5456 unsigned idx = 0; 5457 while (idx != Worklist.size()) { 5458 Instruction *I = Worklist[idx++]; 5459 5460 for (auto OV : I->operand_values()) { 5461 // isOutOfScope operands cannot be uniform instructions. 5462 if (isOutOfScope(OV)) 5463 continue; 5464 // First order recurrence Phi's should typically be considered 5465 // non-uniform. 5466 auto *OP = dyn_cast<PHINode>(OV); 5467 if (OP && Legal->isFirstOrderRecurrence(OP)) 5468 continue; 5469 // If all the users of the operand are uniform, then add the 5470 // operand into the uniform worklist. 5471 auto *OI = cast<Instruction>(OV); 5472 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5473 auto *J = cast<Instruction>(U); 5474 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5475 })) 5476 addToWorklistIfAllowed(OI); 5477 } 5478 } 5479 5480 // For an instruction to be added into Worklist above, all its users inside 5481 // the loop should also be in Worklist. 
However, this condition cannot be 5482 // true for phi nodes that form a cyclic dependence. We must process phi 5483 // nodes separately. An induction variable will remain uniform if all users 5484 // of the induction variable and induction variable update remain uniform. 5485 // The code below handles both pointer and non-pointer induction variables. 5486 for (auto &Induction : Legal->getInductionVars()) { 5487 auto *Ind = Induction.first; 5488 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5489 5490 // Determine if all users of the induction variable are uniform after 5491 // vectorization. 5492 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5493 auto *I = cast<Instruction>(U); 5494 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5495 isVectorizedMemAccessUse(I, Ind); 5496 }); 5497 if (!UniformInd) 5498 continue; 5499 5500 // Determine if all users of the induction variable update instruction are 5501 // uniform after vectorization. 5502 auto UniformIndUpdate = 5503 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5504 auto *I = cast<Instruction>(U); 5505 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5506 isVectorizedMemAccessUse(I, IndUpdate); 5507 }); 5508 if (!UniformIndUpdate) 5509 continue; 5510 5511 // The induction variable and its update instruction will remain uniform. 5512 addToWorklistIfAllowed(Ind); 5513 addToWorklistIfAllowed(IndUpdate); 5514 } 5515 5516 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5517 } 5518 5519 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5520 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5521 5522 if (Legal->getRuntimePointerChecking()->Need) { 5523 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5524 "runtime pointer checks needed. Enable vectorization of this " 5525 "loop with '#pragma clang loop vectorize(enable)' when " 5526 "compiling with -Os/-Oz", 5527 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5528 return true; 5529 } 5530 5531 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5532 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5533 "runtime SCEV checks needed. Enable vectorization of this " 5534 "loop with '#pragma clang loop vectorize(enable)' when " 5535 "compiling with -Os/-Oz", 5536 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5537 return true; 5538 } 5539 5540 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5541 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5542 reportVectorizationFailure("Runtime stride check for small trip count", 5543 "runtime stride == 1 checks needed. Enable vectorization of " 5544 "this loop without such check by compiling with -Os/-Oz", 5545 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5546 return true; 5547 } 5548 5549 return false; 5550 } 5551 5552 Optional<ElementCount> 5553 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5554 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5555 // TODO: It may by useful to do since it's still likely to be dynamically 5556 // uniform if the target can skip. 5557 reportVectorizationFailure( 5558 "Not inserting runtime ptr check for divergent target", 5559 "runtime pointer checks needed. 
Not enabled for divergent target",
5560 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5561 return None;
5562 }
5563
5564 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5565 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5566 if (TC == 1) {
5567 reportVectorizationFailure("Single iteration (non) loop",
5568 "loop trip count is one, irrelevant for vectorization",
5569 "SingleIterationLoop", ORE, TheLoop);
5570 return None;
5571 }
5572
5573 switch (ScalarEpilogueStatus) {
5574 case CM_ScalarEpilogueAllowed:
5575 return computeFeasibleMaxVF(TC, UserVF);
5576 case CM_ScalarEpilogueNotAllowedUsePredicate:
5577 LLVM_FALLTHROUGH;
5578 case CM_ScalarEpilogueNotNeededUsePredicate:
5579 LLVM_DEBUG(
5580 dbgs() << "LV: vector predicate hint/switch found.\n"
5581 << "LV: Not allowing scalar epilogue, creating predicated "
5582 << "vector loop.\n");
5583 break;
5584 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5585 // fallthrough as a special case of OptForSize
5586 case CM_ScalarEpilogueNotAllowedOptSize:
5587 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5588 LLVM_DEBUG(
5589 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5590 else
5591 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5592 << "count.\n");
5593
5594 // Bail if runtime checks are required, which are not good when optimising
5595 // for size.
5596 if (runtimeChecksRequired())
5597 return None;
5598
5599 break;
5600 }
5601
5602 // The only loops we can vectorize without a scalar epilogue are loops with
5603 // a bottom-test and a single exiting block. We'd have to handle the fact
5604 // that not every instruction executes on the last iteration. This will
5605 // require a lane mask which varies through the vector loop body. (TODO)
5606 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5607 // If there was a tail-folding hint/switch, but we can't fold the tail by
5608 // masking, fall back to a vectorization with a scalar epilogue.
5609 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5610 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5611 "scalar epilogue instead.\n");
5612 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5613 return computeFeasibleMaxVF(TC, UserVF);
5614 }
5615 return None;
5616 }
5617
5618 // Now try the tail folding
5619
5620 // Invalidate interleave groups that require an epilogue if we can't mask
5621 // the interleave-group.
5622 if (!useMaskedInterleavedAccesses(TTI)) {
5623 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5624 "No decisions should have been taken at this point");
5625 // Note: There is no need to invalidate any cost modeling decisions here, as
5626 // none were taken so far.
5627 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5628 }
5629
5630 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5631 assert(!MaxVF.isScalable() &&
5632 "Scalable vectors do not yet support tail folding");
5633 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5634 "MaxVF must be a power of 2");
5635 unsigned MaxVFtimesIC =
5636 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5637 // Avoid tail folding if the trip count is known to be a multiple of any VF we
5638 // chose.
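// Illustrative example (made-up numbers, not tied to any target): with a
// known trip count of 64, MaxVF = 8 and a user interleave count of 2, the
// check below computes 64 urem (8 * 2) == 0, so no scalar tail remains and
// MaxVF is accepted without folding the tail. With a trip count of 70 the
// remainder is 6, and we instead try to fold the tail by masking below.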
5639 ScalarEvolution *SE = PSE.getSE(); 5640 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5641 const SCEV *ExitCount = SE->getAddExpr( 5642 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5643 const SCEV *Rem = SE->getURemExpr( 5644 SE->applyLoopGuards(ExitCount, TheLoop), 5645 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5646 if (Rem->isZero()) { 5647 // Accept MaxVF if we do not have a tail. 5648 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5649 return MaxVF; 5650 } 5651 5652 // If we don't know the precise trip count, or if the trip count that we 5653 // found modulo the vectorization factor is not zero, try to fold the tail 5654 // by masking. 5655 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5656 if (Legal->prepareToFoldTailByMasking()) { 5657 FoldTailByMasking = true; 5658 return MaxVF; 5659 } 5660 5661 // If there was a tail-folding hint/switch, but we can't fold the tail by 5662 // masking, fallback to a vectorization with a scalar epilogue. 5663 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5664 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5665 "scalar epilogue instead.\n"); 5666 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5667 return MaxVF; 5668 } 5669 5670 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5671 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5672 return None; 5673 } 5674 5675 if (TC == 0) { 5676 reportVectorizationFailure( 5677 "Unable to calculate the loop count due to complex control flow", 5678 "unable to calculate the loop count due to complex control flow", 5679 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5680 return None; 5681 } 5682 5683 reportVectorizationFailure( 5684 "Cannot optimize for size and vectorize at the same time.", 5685 "cannot optimize for size and vectorize at the same time. " 5686 "Enable vectorization of this loop with '#pragma clang loop " 5687 "vectorize(enable)' when compiling with -Os/-Oz", 5688 "NoTailLoopWithOptForSize", ORE, TheLoop); 5689 return None; 5690 } 5691 5692 ElementCount 5693 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5694 ElementCount UserVF) { 5695 bool IgnoreScalableUserVF = UserVF.isScalable() && 5696 !TTI.supportsScalableVectors() && 5697 !ForceTargetSupportsScalableVectors; 5698 if (IgnoreScalableUserVF) { 5699 LLVM_DEBUG( 5700 dbgs() << "LV: Ignoring VF=" << UserVF 5701 << " because target does not support scalable vectors.\n"); 5702 ORE->emit([&]() { 5703 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5704 TheLoop->getStartLoc(), 5705 TheLoop->getHeader()) 5706 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5707 << " because target does not support scalable vectors."; 5708 }); 5709 } 5710 5711 // Beyond this point two scenarios are handled. If UserVF isn't specified 5712 // then a suitable VF is chosen. If UserVF is specified and there are 5713 // dependencies, check if it's legal. However, if a UserVF is specified and 5714 // there are no dependencies, then there's nothing to do. 5715 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5716 if (!canVectorizeReductions(UserVF)) { 5717 reportVectorizationFailure( 5718 "LV: Scalable vectorization not supported for the reduction " 5719 "operations found in this loop. 
Using fixed-width " 5720 "vectorization instead.", 5721 "Scalable vectorization not supported for the reduction operations " 5722 "found in this loop. Using fixed-width vectorization instead.", 5723 "ScalableVFUnfeasible", ORE, TheLoop); 5724 return computeFeasibleMaxVF( 5725 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5726 } 5727 5728 if (Legal->isSafeForAnyVectorWidth()) 5729 return UserVF; 5730 } 5731 5732 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5733 unsigned SmallestType, WidestType; 5734 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5735 unsigned WidestRegister = 5736 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 5737 .getFixedSize(); 5738 5739 // Get the maximum safe dependence distance in bits computed by LAA. 5740 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5741 // the memory accesses that is most restrictive (involved in the smallest 5742 // dependence distance). 5743 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5744 5745 // If the user vectorization factor is legally unsafe, clamp it to a safe 5746 // value. Otherwise, return as is. 5747 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5748 unsigned MaxSafeElements = 5749 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5750 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5751 5752 if (UserVF.isScalable()) { 5753 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5754 5755 // Scale VF by vscale before checking if it's safe. 5756 MaxSafeVF = ElementCount::getScalable( 5757 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5758 5759 if (MaxSafeVF.isZero()) { 5760 // The dependence distance is too small to use scalable vectors, 5761 // fallback on fixed. 5762 LLVM_DEBUG( 5763 dbgs() 5764 << "LV: Max legal vector width too small, scalable vectorization " 5765 "unfeasible. Using fixed-width vectorization instead.\n"); 5766 ORE->emit([&]() { 5767 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5768 TheLoop->getStartLoc(), 5769 TheLoop->getHeader()) 5770 << "Max legal vector width too small, scalable vectorization " 5771 << "unfeasible. Using fixed-width vectorization instead."; 5772 }); 5773 return computeFeasibleMaxVF( 5774 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5775 } 5776 } 5777 5778 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5779 5780 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5781 return UserVF; 5782 5783 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5784 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5785 << ".\n"); 5786 ORE->emit([&]() { 5787 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5788 TheLoop->getStartLoc(), 5789 TheLoop->getHeader()) 5790 << "User-specified vectorization factor " 5791 << ore::NV("UserVectorizationFactor", UserVF) 5792 << " is unsafe, clamping to maximum safe vectorization factor " 5793 << ore::NV("VectorizationFactor", MaxSafeVF); 5794 }); 5795 return MaxSafeVF; 5796 } 5797 5798 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5799 5800 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5801 // Note that both WidestRegister and WidestType may not be a powers of 2. 
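// Illustrative example (assumed target parameters): with 128-bit vector
// registers and a widest loop type of i32, the computation below yields
// PowerOf2Floor(128 / 32) = 4 lanes. If the maximum safe dependence width
// clamped WidestRegister to, say, 96 bits, the result would be
// PowerOf2Floor(96 / 32) = PowerOf2Floor(3) = 2 lanes.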
5802 auto MaxVectorSize = 5803 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); 5804 5805 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5806 << " / " << WidestType << " bits.\n"); 5807 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5808 << WidestRegister << " bits.\n"); 5809 5810 assert(MaxVectorSize.getFixedValue() <= WidestRegister && 5811 "Did not expect to pack so many elements" 5812 " into one vector!"); 5813 if (MaxVectorSize.getFixedValue() == 0) { 5814 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5815 return ElementCount::getFixed(1); 5816 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && 5817 isPowerOf2_32(ConstTripCount)) { 5818 // We need to clamp the VF to be the ConstTripCount. There is no point in 5819 // choosing a higher viable VF as done in the loop below. 5820 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5821 << ConstTripCount << "\n"); 5822 return ElementCount::getFixed(ConstTripCount); 5823 } 5824 5825 ElementCount MaxVF = MaxVectorSize; 5826 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5827 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5828 // Collect all viable vectorization factors larger than the default MaxVF 5829 // (i.e. MaxVectorSize). 5830 SmallVector<ElementCount, 8> VFs; 5831 auto MaxVectorSizeMaxBW = 5832 ElementCount::getFixed(WidestRegister / SmallestType); 5833 for (ElementCount VS = MaxVectorSize * 2; 5834 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) 5835 VFs.push_back(VS); 5836 5837 // For each VF calculate its register usage. 5838 auto RUs = calculateRegisterUsage(VFs); 5839 5840 // Select the largest VF which doesn't require more registers than existing 5841 // ones. 5842 for (int i = RUs.size() - 1; i >= 0; --i) { 5843 bool Selected = true; 5844 for (auto &pair : RUs[i].MaxLocalUsers) { 5845 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5846 if (pair.second > TargetNumRegisters) 5847 Selected = false; 5848 } 5849 if (Selected) { 5850 MaxVF = VFs[i]; 5851 break; 5852 } 5853 } 5854 if (ElementCount MinVF = 5855 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { 5856 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5857 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5858 << ") with target's minimum: " << MinVF << '\n'); 5859 MaxVF = MinVF; 5860 } 5861 } 5862 } 5863 return MaxVF; 5864 } 5865 5866 VectorizationFactor 5867 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5868 // FIXME: This can be fixed for scalable vectors later, because at this stage 5869 // the LoopVectorizer will only consider vectorizing a loop with scalable 5870 // vectors when the loop has a hint to enable vectorization for a given VF. 5871 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5872 5873 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5874 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5875 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5876 5877 auto Width = ElementCount::getFixed(1); 5878 const float ScalarCost = *ExpectedCost.getValue(); 5879 float Cost = ScalarCost; 5880 5881 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5882 if (ForceVectorization && MaxVF.isVector()) { 5883 // Ignore scalar width, because the user explicitly wants vectorization. 
5884 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5885 // evaluation. 5886 Cost = std::numeric_limits<float>::max(); 5887 } 5888 5889 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 5890 i *= 2) { 5891 // Notice that the vector loop needs to be executed less times, so 5892 // we need to divide the cost of the vector loops by the width of 5893 // the vector elements. 5894 VectorizationCostTy C = expectedCost(i); 5895 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5896 float VectorCost = *C.first.getValue() / (float)i.getFixedValue(); 5897 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5898 << " costs: " << (int)VectorCost << ".\n"); 5899 if (!C.second && !ForceVectorization) { 5900 LLVM_DEBUG( 5901 dbgs() << "LV: Not considering vector loop of width " << i 5902 << " because it will not generate any vector instructions.\n"); 5903 continue; 5904 } 5905 5906 // If profitable add it to ProfitableVF list. 5907 if (VectorCost < ScalarCost) { 5908 ProfitableVFs.push_back(VectorizationFactor( 5909 {i, (unsigned)VectorCost})); 5910 } 5911 5912 if (VectorCost < Cost) { 5913 Cost = VectorCost; 5914 Width = i; 5915 } 5916 } 5917 5918 if (!EnableCondStoresVectorization && NumPredStores) { 5919 reportVectorizationFailure("There are conditional stores.", 5920 "store that is conditionally executed prevents vectorization", 5921 "ConditionalStore", ORE, TheLoop); 5922 Width = ElementCount::getFixed(1); 5923 Cost = ScalarCost; 5924 } 5925 5926 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs() 5927 << "LV: Vectorization seems to be not beneficial, " 5928 << "but was forced by a user.\n"); 5929 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5930 VectorizationFactor Factor = {Width, 5931 (unsigned)(Width.getKnownMinValue() * Cost)}; 5932 return Factor; 5933 } 5934 5935 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5936 const Loop &L, ElementCount VF) const { 5937 // Cross iteration phis such as reductions need special handling and are 5938 // currently unsupported. 5939 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5940 return Legal->isFirstOrderRecurrence(&Phi) || 5941 Legal->isReductionVariable(&Phi); 5942 })) 5943 return false; 5944 5945 // Phis with uses outside of the loop require special handling and are 5946 // currently unsupported. 5947 for (auto &Entry : Legal->getInductionVars()) { 5948 // Look for uses of the value of the induction at the last iteration. 5949 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5950 for (User *U : PostInc->users()) 5951 if (!L.contains(cast<Instruction>(U))) 5952 return false; 5953 // Look for uses of penultimate value of the induction. 5954 for (User *U : Entry.first->users()) 5955 if (!L.contains(cast<Instruction>(U))) 5956 return false; 5957 } 5958 5959 // Induction variables that are widened require special handling that is 5960 // currently not supported. 
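// Illustrative example: in a loop that stores the induction value itself
// (e.g. a[i] = i), the induction is typically widened into a vector of lane
// values rather than kept scalar, which disqualifies the loop here. The
// precise decision is whatever isScalarAfterVectorization /
// isProfitableToScalarize report below.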
5961 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5962 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5963 this->isProfitableToScalarize(Entry.first, VF)); 5964 })) 5965 return false; 5966 5967 return true; 5968 } 5969 5970 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5971 const ElementCount VF) const { 5972 // FIXME: We need a much better cost-model to take different parameters such 5973 // as register pressure, code size increase and cost of extra branches into 5974 // account. For now we apply a very crude heuristic and only consider loops 5975 // with vectorization factors larger than a certain value. 5976 // We also consider epilogue vectorization unprofitable for targets that don't 5977 // consider interleaving beneficial (eg. MVE). 5978 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5979 return false; 5980 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5981 return true; 5982 return false; 5983 } 5984 5985 VectorizationFactor 5986 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5987 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5988 VectorizationFactor Result = VectorizationFactor::Disabled(); 5989 if (!EnableEpilogueVectorization) { 5990 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5991 return Result; 5992 } 5993 5994 if (!isScalarEpilogueAllowed()) { 5995 LLVM_DEBUG( 5996 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5997 "allowed.\n";); 5998 return Result; 5999 } 6000 6001 // FIXME: This can be fixed for scalable vectors later, because at this stage 6002 // the LoopVectorizer will only consider vectorizing a loop with scalable 6003 // vectors when the loop has a hint to enable vectorization for a given VF. 6004 if (MainLoopVF.isScalable()) { 6005 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6006 "yet supported.\n"); 6007 return Result; 6008 } 6009 6010 // Not really a cost consideration, but check for unsupported cases here to 6011 // simplify the logic. 
6012 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6013 LLVM_DEBUG( 6014 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6015 "not a supported candidate.\n";); 6016 return Result; 6017 } 6018 6019 if (EpilogueVectorizationForceVF > 1) { 6020 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6021 if (LVP.hasPlanWithVFs( 6022 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6023 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6024 else { 6025 LLVM_DEBUG( 6026 dbgs() 6027 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6028 return Result; 6029 } 6030 } 6031 6032 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6033 TheLoop->getHeader()->getParent()->hasMinSize()) { 6034 LLVM_DEBUG( 6035 dbgs() 6036 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6037 return Result; 6038 } 6039 6040 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6041 return Result; 6042 6043 for (auto &NextVF : ProfitableVFs) 6044 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6045 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 6046 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6047 Result = NextVF; 6048 6049 if (Result != VectorizationFactor::Disabled()) 6050 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6051 << Result.Width.getFixedValue() << "\n";); 6052 return Result; 6053 } 6054 6055 std::pair<unsigned, unsigned> 6056 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6057 unsigned MinWidth = -1U; 6058 unsigned MaxWidth = 8; 6059 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6060 6061 // For each block. 6062 for (BasicBlock *BB : TheLoop->blocks()) { 6063 // For each instruction in the loop. 6064 for (Instruction &I : BB->instructionsWithoutDebug()) { 6065 Type *T = I.getType(); 6066 6067 // Skip ignored values. 6068 if (ValuesToIgnore.count(&I)) 6069 continue; 6070 6071 // Only examine Loads, Stores and PHINodes. 6072 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6073 continue; 6074 6075 // Examine PHI nodes that are reduction variables. Update the type to 6076 // account for the recurrence type. 6077 if (auto *PN = dyn_cast<PHINode>(&I)) { 6078 if (!Legal->isReductionVariable(PN)) 6079 continue; 6080 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6081 if (PreferInLoopReductions || 6082 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6083 RdxDesc.getRecurrenceType(), 6084 TargetTransformInfo::ReductionFlags())) 6085 continue; 6086 T = RdxDesc.getRecurrenceType(); 6087 } 6088 6089 // Examine the stored values. 6090 if (auto *ST = dyn_cast<StoreInst>(&I)) 6091 T = ST->getValueOperand()->getType(); 6092 6093 // Ignore loaded pointer types and stored pointer types that are not 6094 // vectorizable. 6095 // 6096 // FIXME: The check here attempts to predict whether a load or store will 6097 // be vectorized. We only know this for certain after a VF has 6098 // been selected. Here, we assume that if an access can be 6099 // vectorized, it will be. We should also look at extending this 6100 // optimization to non-pointer types. 
6101 // 6102 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6103 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6104 continue; 6105 6106 MinWidth = std::min(MinWidth, 6107 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6108 MaxWidth = std::max(MaxWidth, 6109 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6110 } 6111 } 6112 6113 return {MinWidth, MaxWidth}; 6114 } 6115 6116 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6117 unsigned LoopCost) { 6118 // -- The interleave heuristics -- 6119 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6120 // There are many micro-architectural considerations that we can't predict 6121 // at this level. For example, frontend pressure (on decode or fetch) due to 6122 // code size, or the number and capabilities of the execution ports. 6123 // 6124 // We use the following heuristics to select the interleave count: 6125 // 1. If the code has reductions, then we interleave to break the cross 6126 // iteration dependency. 6127 // 2. If the loop is really small, then we interleave to reduce the loop 6128 // overhead. 6129 // 3. We don't interleave if we think that we will spill registers to memory 6130 // due to the increased register pressure. 6131 6132 if (!isScalarEpilogueAllowed()) 6133 return 1; 6134 6135 // We used the distance for the interleave count. 6136 if (Legal->getMaxSafeDepDistBytes() != -1U) 6137 return 1; 6138 6139 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6140 const bool HasReductions = !Legal->getReductionVars().empty(); 6141 // Do not interleave loops with a relatively small known or estimated trip 6142 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6143 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6144 // because with the above conditions interleaving can expose ILP and break 6145 // cross iteration dependences for reductions. 6146 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6147 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6148 return 1; 6149 6150 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6151 // We divide by these constants so assume that we have at least one 6152 // instruction that uses at least one register. 6153 for (auto& pair : R.MaxLocalUsers) { 6154 pair.second = std::max(pair.second, 1U); 6155 } 6156 6157 // We calculate the interleave count using the following formula. 6158 // Subtract the number of loop invariants from the number of available 6159 // registers. These registers are used by all of the interleaved instances. 6160 // Next, divide the remaining registers by the number of registers that is 6161 // required by the loop, in order to estimate how many parallel instances 6162 // fit without causing spills. All of this is rounded down if necessary to be 6163 // a power of two. We want power of two interleave count to simplify any 6164 // addressing operations or alignment considerations. 6165 // We also want power of two interleave counts to ensure that the induction 6166 // variable of the vector loop wraps to zero, when tail is folded by masking; 6167 // this currently happens when OptForSize, in which case IC is set to 1 above. 
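// Illustrative example (made-up register counts): for a register class with
// 32 registers, 2 of them tied up by loop-invariant values and at most 6
// values live inside the loop, the formula below gives
// PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4 interleaved instances;
// with the induction-variable heuristic it gives
// PowerOf2Floor((32 - 2 - 1) / (6 - 1)) = 4 as well. The smallest result
// over all register classes is kept.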
6168 unsigned IC = UINT_MAX; 6169 6170 for (auto& pair : R.MaxLocalUsers) { 6171 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6172 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6173 << " registers of " 6174 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6175 if (VF.isScalar()) { 6176 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6177 TargetNumRegisters = ForceTargetNumScalarRegs; 6178 } else { 6179 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6180 TargetNumRegisters = ForceTargetNumVectorRegs; 6181 } 6182 unsigned MaxLocalUsers = pair.second; 6183 unsigned LoopInvariantRegs = 0; 6184 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6185 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6186 6187 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6188 // Don't count the induction variable as interleaved. 6189 if (EnableIndVarRegisterHeur) { 6190 TmpIC = 6191 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6192 std::max(1U, (MaxLocalUsers - 1))); 6193 } 6194 6195 IC = std::min(IC, TmpIC); 6196 } 6197 6198 // Clamp the interleave ranges to reasonable counts. 6199 unsigned MaxInterleaveCount = 6200 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6201 6202 // Check if the user has overridden the max. 6203 if (VF.isScalar()) { 6204 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6205 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6206 } else { 6207 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6208 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6209 } 6210 6211 // If trip count is known or estimated compile time constant, limit the 6212 // interleave count to be less than the trip count divided by VF, provided it 6213 // is at least 1. 6214 // 6215 // For scalable vectors we can't know if interleaving is beneficial. It may 6216 // not be beneficial for small loops if none of the lanes in the second vector 6217 // iterations is enabled. However, for larger loops, there is likely to be a 6218 // similar benefit as for fixed-width vectors. For now, we choose to leave 6219 // the InterleaveCount as if vscale is '1', although if some information about 6220 // the vector is known (e.g. min vector size), we can make a better decision. 6221 if (BestKnownTC) { 6222 MaxInterleaveCount = 6223 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6224 // Make sure MaxInterleaveCount is greater than 0. 6225 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6226 } 6227 6228 assert(MaxInterleaveCount > 0 && 6229 "Maximum interleave count must be greater than 0"); 6230 6231 // Clamp the calculated IC to be between the 1 and the max interleave count 6232 // that the target and trip count allows. 6233 if (IC > MaxInterleaveCount) 6234 IC = MaxInterleaveCount; 6235 else 6236 // Make sure IC is greater than 0. 6237 IC = std::max(1u, IC); 6238 6239 assert(IC > 0 && "Interleave count must be greater than 0."); 6240 6241 // If we did not calculate the cost for VF (because the user selected the VF) 6242 // then we calculate the cost of VF here. 6243 if (LoopCost == 0) { 6244 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6245 LoopCost = *expectedCost(VF).first.getValue(); 6246 } 6247 6248 assert(LoopCost && "Non-zero loop cost expected"); 6249 6250 // Interleave if we vectorized this loop and there is a reduction that could 6251 // benefit from interleaving. 
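// For example, a vectorized add reduction carries its accumulator from one
// iteration to the next; interleaving by two keeps two independent partial
// sums in flight and combines them after the loop, hiding the latency of
// the chained adds. (Illustrative rationale for the early return below.)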
6252 if (VF.isVector() && HasReductions) { 6253 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6254 return IC; 6255 } 6256 6257 // Note that if we've already vectorized the loop we will have done the 6258 // runtime check and so interleaving won't require further checks. 6259 bool InterleavingRequiresRuntimePointerCheck = 6260 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6261 6262 // We want to interleave small loops in order to reduce the loop overhead and 6263 // potentially expose ILP opportunities. 6264 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6265 << "LV: IC is " << IC << '\n' 6266 << "LV: VF is " << VF << '\n'); 6267 const bool AggressivelyInterleaveReductions = 6268 TTI.enableAggressiveInterleaving(HasReductions); 6269 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6270 // We assume that the cost overhead is 1 and we use the cost model 6271 // to estimate the cost of the loop and interleave until the cost of the 6272 // loop overhead is about 5% of the cost of the loop. 6273 unsigned SmallIC = 6274 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6275 6276 // Interleave until store/load ports (estimated by max interleave count) are 6277 // saturated. 6278 unsigned NumStores = Legal->getNumStores(); 6279 unsigned NumLoads = Legal->getNumLoads(); 6280 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6281 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6282 6283 // If we have a scalar reduction (vector reductions are already dealt with 6284 // by this point), we can increase the critical path length if the loop 6285 // we're interleaving is inside another loop. Limit, by default to 2, so the 6286 // critical path only gets increased by one reduction operation. 6287 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6288 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6289 SmallIC = std::min(SmallIC, F); 6290 StoresIC = std::min(StoresIC, F); 6291 LoadsIC = std::min(LoadsIC, F); 6292 } 6293 6294 if (EnableLoadStoreRuntimeInterleave && 6295 std::max(StoresIC, LoadsIC) > SmallIC) { 6296 LLVM_DEBUG( 6297 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6298 return std::max(StoresIC, LoadsIC); 6299 } 6300 6301 // If there are scalar reductions and TTI has enabled aggressive 6302 // interleaving for reductions, we will interleave to expose ILP. 6303 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6304 AggressivelyInterleaveReductions) { 6305 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6306 // Interleave no less than SmallIC but not as aggressive as the normal IC 6307 // to satisfy the rare situation when resources are too limited. 6308 return std::max(IC / 2, SmallIC); 6309 } else { 6310 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6311 return SmallIC; 6312 } 6313 } 6314 6315 // Interleave if this is a large loop (small loops are already dealt with by 6316 // this point) that could benefit from interleaving. 6317 if (AggressivelyInterleaveReductions) { 6318 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6319 return IC; 6320 } 6321 6322 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6323 return 1; 6324 } 6325 6326 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6327 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6328 // This function calculates the register usage by measuring the highest number 6329 // of values that are alive at a single location. 
Obviously, this is a very
6330 // rough estimation. We scan the loop in topological order and
6331 // assign a number to each instruction. We use RPO to ensure that defs are
6332 // met before their users. We assume that each instruction that has in-loop
6333 // users starts an interval. We record every time that an in-loop value is
6334 // used, so we have a list of the first and last occurrences of each
6335 // instruction. Next, we transpose this data structure into a multi map that
6336 // holds the list of intervals that *end* at a specific location. This multi
6337 // map allows us to perform a linear search. We scan the instructions linearly
6338 // and record each time that a new interval starts, by placing it in a set.
6339 // If we find this value in the multi-map then we remove it from the set.
6340 // The max register usage is the maximum size of the set.
6341 // We also search for instructions that are defined outside the loop, but are
6342 // used inside the loop. We need this number separately from the max-interval
6343 // usage number because when we unroll, loop-invariant values do not take
6344 // more registers.
6345 LoopBlocksDFS DFS(TheLoop);
6346 DFS.perform(LI);
6347
6348 RegisterUsage RU;
6349
6350 // Each 'key' in the map opens a new interval. The values
6351 // of the map are the index of the 'last seen' usage of the
6352 // instruction that is the key.
6353 using IntervalMap = DenseMap<Instruction *, unsigned>;
6354
6355 // Maps instruction to its index.
6356 SmallVector<Instruction *, 64> IdxToInstr;
6357 // Marks the end of each interval.
6358 IntervalMap EndPoint;
6359 // Saves the set of instructions that are used in the loop.
6360 SmallPtrSet<Instruction *, 8> Ends;
6361 // Saves the list of values that are used in the loop but are
6362 // defined outside the loop, such as arguments and constants.
6363 SmallPtrSet<Value *, 8> LoopInvariants;
6364
6365 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6366 for (Instruction &I : BB->instructionsWithoutDebug()) {
6367 IdxToInstr.push_back(&I);
6368
6369 // Save the end location of each USE.
6370 for (Value *U : I.operands()) {
6371 auto *Instr = dyn_cast<Instruction>(U);
6372
6373 // Ignore non-instruction values such as arguments, constants, etc.
6374 if (!Instr)
6375 continue;
6376
6377 // If this instruction is outside the loop then record it and continue.
6378 if (!TheLoop->contains(Instr)) {
6379 LoopInvariants.insert(Instr);
6380 continue;
6381 }
6382
6383 // Overwrite previous end points.
6384 EndPoint[Instr] = IdxToInstr.size();
6385 Ends.insert(Instr);
6386 }
6387 }
6388 }
6389
6390 // Saves the list of intervals that end with the index in 'key'.
6391 using InstrList = SmallVector<Instruction *, 2>;
6392 DenseMap<unsigned, InstrList> TransposeEnds;
6393
6394 // Transpose the EndPoints to a list of values that end at each index.
6395 for (auto &Interval : EndPoint)
6396 TransposeEnds[Interval.second].push_back(Interval.first);
6397
6398 SmallPtrSet<Instruction *, 8> OpenIntervals;
6399 SmallVector<RegisterUsage, 8> RUs(VFs.size());
6400 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6401
6402 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6403
6404 // A lambda that gets the register usage for the given type and VF.
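// As an illustration (actual numbers come from TTI): for an i32 value at
// VF = 4 the query below asks for the register usage of <4 x i32>, which on
// a typical 128-bit SIMD target is a single vector register; an i64 value
// at the same VF would usually need twice as many.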
6405 const auto &TTICapture = TTI; 6406 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6407 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6408 return 0U; 6409 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6410 }; 6411 6412 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6413 Instruction *I = IdxToInstr[i]; 6414 6415 // Remove all of the instructions that end at this location. 6416 InstrList &List = TransposeEnds[i]; 6417 for (Instruction *ToRemove : List) 6418 OpenIntervals.erase(ToRemove); 6419 6420 // Ignore instructions that are never used within the loop. 6421 if (!Ends.count(I)) 6422 continue; 6423 6424 // Skip ignored values. 6425 if (ValuesToIgnore.count(I)) 6426 continue; 6427 6428 // For each VF find the maximum usage of registers. 6429 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6430 // Count the number of live intervals. 6431 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6432 6433 if (VFs[j].isScalar()) { 6434 for (auto Inst : OpenIntervals) { 6435 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6436 if (RegUsage.find(ClassID) == RegUsage.end()) 6437 RegUsage[ClassID] = 1; 6438 else 6439 RegUsage[ClassID] += 1; 6440 } 6441 } else { 6442 collectUniformsAndScalars(VFs[j]); 6443 for (auto Inst : OpenIntervals) { 6444 // Skip ignored values for VF > 1. 6445 if (VecValuesToIgnore.count(Inst)) 6446 continue; 6447 if (isScalarAfterVectorization(Inst, VFs[j])) { 6448 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6449 if (RegUsage.find(ClassID) == RegUsage.end()) 6450 RegUsage[ClassID] = 1; 6451 else 6452 RegUsage[ClassID] += 1; 6453 } else { 6454 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6455 if (RegUsage.find(ClassID) == RegUsage.end()) 6456 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6457 else 6458 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6459 } 6460 } 6461 } 6462 6463 for (auto& pair : RegUsage) { 6464 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6465 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6466 else 6467 MaxUsages[j][pair.first] = pair.second; 6468 } 6469 } 6470 6471 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6472 << OpenIntervals.size() << '\n'); 6473 6474 // Add the current instruction to the list of open intervals. 6475 OpenIntervals.insert(I); 6476 } 6477 6478 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6479 SmallMapVector<unsigned, unsigned, 4> Invariant; 6480 6481 for (auto Inst : LoopInvariants) { 6482 unsigned Usage = 6483 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6484 unsigned ClassID = 6485 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6486 if (Invariant.find(ClassID) == Invariant.end()) 6487 Invariant[ClassID] = Usage; 6488 else 6489 Invariant[ClassID] += Usage; 6490 } 6491 6492 LLVM_DEBUG({ 6493 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6494 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6495 << " item\n"; 6496 for (const auto &pair : MaxUsages[i]) { 6497 dbgs() << "LV(REG): RegisterClass: " 6498 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6499 << " registers\n"; 6500 } 6501 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6502 << " item\n"; 6503 for (const auto &pair : Invariant) { 6504 dbgs() << "LV(REG): RegisterClass: " 6505 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6506 << " registers\n"; 6507 } 6508 }); 6509 6510 RU.LoopInvariantRegs = Invariant; 6511 RU.MaxLocalUsers = MaxUsages[i]; 6512 RUs[i] = RU; 6513 } 6514 6515 return RUs; 6516 } 6517 6518 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6519 // TODO: Cost model for emulated masked load/store is completely 6520 // broken. This hack guides the cost model to use an artificially 6521 // high enough value to practically disable vectorization with such 6522 // operations, except where previously deployed legality hack allowed 6523 // using very low cost values. This is to avoid regressions coming simply 6524 // from moving "masked load/store" check from legality to cost model. 6525 // Masked Load/Gather emulation was previously never allowed. 6526 // Limited number of Masked Store/Scatter emulation was allowed. 6527 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6528 return isa<LoadInst>(I) || 6529 (isa<StoreInst>(I) && 6530 NumPredStores > NumberOfStoresToPredicate); 6531 } 6532 6533 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6534 // If we aren't vectorizing the loop, or if we've already collected the 6535 // instructions to scalarize, there's nothing to do. Collection may already 6536 // have occurred if we have a user-selected VF and are now computing the 6537 // expected cost for interleaving. 6538 if (VF.isScalar() || VF.isZero() || 6539 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6540 return; 6541 6542 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6543 // not profitable to scalarize any instructions, the presence of VF in the 6544 // map will indicate that we've analyzed it already. 6545 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6546 6547 // Find all the instructions that are scalar with predication in the loop and 6548 // determine if it would be better to not if-convert the blocks they are in. 6549 // If so, we also record the instructions to scalarize. 6550 for (BasicBlock *BB : TheLoop->blocks()) { 6551 if (!blockNeedsPredication(BB)) 6552 continue; 6553 for (Instruction &I : *BB) 6554 if (isScalarWithPredication(&I)) { 6555 ScalarCostsTy ScalarCosts; 6556 // Do not apply discount logic if hacked cost is needed 6557 // for emulated masked memrefs. 6558 if (!useEmulatedMaskMemRefHack(&I) && 6559 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6560 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6561 // Remember that BB will remain after vectorization. 
6562 PredicatedBBsAfterVectorization.insert(BB); 6563 } 6564 } 6565 } 6566 6567 int LoopVectorizationCostModel::computePredInstDiscount( 6568 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6569 assert(!isUniformAfterVectorization(PredInst, VF) && 6570 "Instruction marked uniform-after-vectorization will be predicated"); 6571 6572 // Initialize the discount to zero, meaning that the scalar version and the 6573 // vector version cost the same. 6574 InstructionCost Discount = 0; 6575 6576 // Holds instructions to analyze. The instructions we visit are mapped in 6577 // ScalarCosts. Those instructions are the ones that would be scalarized if 6578 // we find that the scalar version costs less. 6579 SmallVector<Instruction *, 8> Worklist; 6580 6581 // Returns true if the given instruction can be scalarized. 6582 auto canBeScalarized = [&](Instruction *I) -> bool { 6583 // We only attempt to scalarize instructions forming a single-use chain 6584 // from the original predicated block that would otherwise be vectorized. 6585 // Although not strictly necessary, we give up on instructions we know will 6586 // already be scalar to avoid traversing chains that are unlikely to be 6587 // beneficial. 6588 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6589 isScalarAfterVectorization(I, VF)) 6590 return false; 6591 6592 // If the instruction is scalar with predication, it will be analyzed 6593 // separately. We ignore it within the context of PredInst. 6594 if (isScalarWithPredication(I)) 6595 return false; 6596 6597 // If any of the instruction's operands are uniform after vectorization, 6598 // the instruction cannot be scalarized. This prevents, for example, a 6599 // masked load from being scalarized. 6600 // 6601 // We assume we will only emit a value for lane zero of an instruction 6602 // marked uniform after vectorization, rather than VF identical values. 6603 // Thus, if we scalarize an instruction that uses a uniform, we would 6604 // create uses of values corresponding to the lanes we aren't emitting code 6605 // for. This behavior can be changed by allowing getScalarValue to clone 6606 // the lane zero values for uniforms rather than asserting. 6607 for (Use &U : I->operands()) 6608 if (auto *J = dyn_cast<Instruction>(U.get())) 6609 if (isUniformAfterVectorization(J, VF)) 6610 return false; 6611 6612 // Otherwise, we can scalarize the instruction. 6613 return true; 6614 }; 6615 6616 // Compute the expected cost discount from scalarizing the entire expression 6617 // feeding the predicated instruction. We currently only consider expressions 6618 // that are single-use instruction chains. 6619 Worklist.push_back(PredInst); 6620 while (!Worklist.empty()) { 6621 Instruction *I = Worklist.pop_back_val(); 6622 6623 // If we've already analyzed the instruction, there's nothing to do. 6624 if (ScalarCosts.find(I) != ScalarCosts.end()) 6625 continue; 6626 6627 // Compute the cost of the vector instruction. Note that this cost already 6628 // includes the scalarization overhead of the predicated instruction. 6629 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6630 6631 // Compute the cost of the scalarized instruction. This cost is the cost of 6632 // the instruction as if it wasn't if-converted and instead remained in the 6633 // predicated block. We will scale this cost by block probability after 6634 // computing the scalarization overhead. 
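// Sketch with made-up costs: at VF = 4, a vector divide costing 20 is
// compared against 4 scalar divides costing 4 each plus the insert/extract
// overhead, with the scalar side then divided by the assumed reciprocal
// block probability (2, i.e. the predicated block executes about half the
// time). Each instruction's (VectorCost - ScalarCost) accumulates into the
// discount; a non-negative total means scalarizing the chain is no more
// expensive than keeping it vectorized.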
6635 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6636 InstructionCost ScalarCost = 6637 VF.getKnownMinValue() * 6638 getInstructionCost(I, ElementCount::getFixed(1)).first; 6639 6640 // Compute the scalarization overhead of needed insertelement instructions 6641 // and phi nodes. 6642 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6643 ScalarCost += TTI.getScalarizationOverhead( 6644 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6645 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6646 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6647 ScalarCost += 6648 VF.getKnownMinValue() * 6649 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6650 } 6651 6652 // Compute the scalarization overhead of needed extractelement 6653 // instructions. For each of the instruction's operands, if the operand can 6654 // be scalarized, add it to the worklist; otherwise, account for the 6655 // overhead. 6656 for (Use &U : I->operands()) 6657 if (auto *J = dyn_cast<Instruction>(U.get())) { 6658 assert(VectorType::isValidElementType(J->getType()) && 6659 "Instruction has non-scalar type"); 6660 if (canBeScalarized(J)) 6661 Worklist.push_back(J); 6662 else if (needsExtract(J, VF)) { 6663 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6664 ScalarCost += TTI.getScalarizationOverhead( 6665 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6666 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6667 } 6668 } 6669 6670 // Scale the total scalar cost by block probability. 6671 ScalarCost /= getReciprocalPredBlockProb(); 6672 6673 // Compute the discount. A non-negative discount means the vector version 6674 // of the instruction costs more, and scalarizing would be beneficial. 6675 Discount += VectorCost - ScalarCost; 6676 ScalarCosts[I] = ScalarCost; 6677 } 6678 6679 return *Discount.getValue(); 6680 } 6681 6682 LoopVectorizationCostModel::VectorizationCostTy 6683 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6684 VectorizationCostTy Cost; 6685 6686 // For each block. 6687 for (BasicBlock *BB : TheLoop->blocks()) { 6688 VectorizationCostTy BlockCost; 6689 6690 // For each instruction in the old loop. 6691 for (Instruction &I : BB->instructionsWithoutDebug()) { 6692 // Skip ignored values. 6693 if (ValuesToIgnore.count(&I) || 6694 (VF.isVector() && VecValuesToIgnore.count(&I))) 6695 continue; 6696 6697 VectorizationCostTy C = getInstructionCost(&I, VF); 6698 6699 // Check if we should override the cost. 6700 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6701 C.first = InstructionCost(ForceTargetInstructionCost); 6702 6703 BlockCost.first += C.first; 6704 BlockCost.second |= C.second; 6705 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6706 << " for VF " << VF << " For instruction: " << I 6707 << '\n'); 6708 } 6709 6710 // If we are vectorizing a predicated block, it will have been 6711 // if-converted. This means that the block's instructions (aside from 6712 // stores and instructions that may divide by zero) will now be 6713 // unconditionally executed. For the scalar case, we may not always execute 6714 // the predicated block, if it is an if-else block. Thus, scale the block's 6715 // cost by the probability of executing it. blockNeedsPredication from 6716 // Legal is used so as to not include all blocks in tail folded loops. 
6717 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6718 BlockCost.first /= getReciprocalPredBlockProb(); 6719 6720 Cost.first += BlockCost.first; 6721 Cost.second |= BlockCost.second; 6722 } 6723 6724 return Cost; 6725 } 6726 6727 /// Gets Address Access SCEV after verifying that the access pattern 6728 /// is loop invariant except the induction variable dependence. 6729 /// 6730 /// This SCEV can be sent to the Target in order to estimate the address 6731 /// calculation cost. 6732 static const SCEV *getAddressAccessSCEV( 6733 Value *Ptr, 6734 LoopVectorizationLegality *Legal, 6735 PredicatedScalarEvolution &PSE, 6736 const Loop *TheLoop) { 6737 6738 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6739 if (!Gep) 6740 return nullptr; 6741 6742 // We are looking for a gep with all loop invariant indices except for one 6743 // which should be an induction variable. 6744 auto SE = PSE.getSE(); 6745 unsigned NumOperands = Gep->getNumOperands(); 6746 for (unsigned i = 1; i < NumOperands; ++i) { 6747 Value *Opd = Gep->getOperand(i); 6748 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6749 !Legal->isInductionVariable(Opd)) 6750 return nullptr; 6751 } 6752 6753 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6754 return PSE.getSCEV(Ptr); 6755 } 6756 6757 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6758 return Legal->hasStride(I->getOperand(0)) || 6759 Legal->hasStride(I->getOperand(1)); 6760 } 6761 6762 InstructionCost 6763 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6764 ElementCount VF) { 6765 assert(VF.isVector() && 6766 "Scalarization cost of instruction implies vectorization."); 6767 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6768 Type *ValTy = getMemInstValueType(I); 6769 auto SE = PSE.getSE(); 6770 6771 unsigned AS = getLoadStoreAddressSpace(I); 6772 Value *Ptr = getLoadStorePointerOperand(I); 6773 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6774 6775 // Figure out whether the access is strided and get the stride value 6776 // if it's known in compile time 6777 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6778 6779 // Get the cost of the scalar memory instruction and address computation. 6780 InstructionCost Cost = 6781 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6782 6783 // Don't pass *I here, since it is scalar but will actually be part of a 6784 // vectorized loop where the user of it is a vectorized instruction. 6785 const Align Alignment = getLoadStoreAlignment(I); 6786 Cost += VF.getKnownMinValue() * 6787 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6788 AS, TTI::TCK_RecipThroughput); 6789 6790 // Get the overhead of the extractelement and insertelement instructions 6791 // we might create due to scalarization. 6792 Cost += getScalarizationOverhead(I, VF); 6793 6794 // If we have a predicated load/store, it will need extra i1 extracts and 6795 // conditional branches, but may not be executed for each vector lane. Scale 6796 // the cost by the probability of executing the predicated block. 
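// Illustrative numbers: at VF = 4 a scalarized predicated store becomes four
// guarded scalar stores, each behind an extracted mask bit and a branch;
// assuming the block runs about half the time, the accumulated per-lane cost
// is divided by 2 below before the i1-extract and branch overhead is added.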
6797 if (isPredicatedInst(I)) { 6798 Cost /= getReciprocalPredBlockProb(); 6799 6800 // Add the cost of an i1 extract and a branch 6801 auto *Vec_i1Ty = 6802 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6803 Cost += TTI.getScalarizationOverhead( 6804 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6805 /*Insert=*/false, /*Extract=*/true); 6806 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6807 6808 if (useEmulatedMaskMemRefHack(I)) 6809 // Artificially setting to a high enough value to practically disable 6810 // vectorization with such operations. 6811 Cost = 3000000; 6812 } 6813 6814 return Cost; 6815 } 6816 6817 InstructionCost 6818 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6819 ElementCount VF) { 6820 Type *ValTy = getMemInstValueType(I); 6821 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6822 Value *Ptr = getLoadStorePointerOperand(I); 6823 unsigned AS = getLoadStoreAddressSpace(I); 6824 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6825 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6826 6827 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6828 "Stride should be 1 or -1 for consecutive memory access"); 6829 const Align Alignment = getLoadStoreAlignment(I); 6830 InstructionCost Cost = 0; 6831 if (Legal->isMaskRequired(I)) 6832 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6833 CostKind); 6834 else 6835 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6836 CostKind, I); 6837 6838 bool Reverse = ConsecutiveStride < 0; 6839 if (Reverse) 6840 Cost += 6841 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6842 return Cost; 6843 } 6844 6845 InstructionCost 6846 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6847 ElementCount VF) { 6848 assert(Legal->isUniformMemOp(*I)); 6849 6850 Type *ValTy = getMemInstValueType(I); 6851 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6852 const Align Alignment = getLoadStoreAlignment(I); 6853 unsigned AS = getLoadStoreAddressSpace(I); 6854 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6855 if (isa<LoadInst>(I)) { 6856 return TTI.getAddressComputationCost(ValTy) + 6857 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6858 CostKind) + 6859 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6860 } 6861 StoreInst *SI = cast<StoreInst>(I); 6862 6863 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6864 return TTI.getAddressComputationCost(ValTy) + 6865 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6866 CostKind) + 6867 (isLoopInvariantStoreValue 6868 ? 
0 6869 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6870 VF.getKnownMinValue() - 1)); 6871 } 6872 6873 InstructionCost 6874 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6875 ElementCount VF) { 6876 Type *ValTy = getMemInstValueType(I); 6877 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6878 const Align Alignment = getLoadStoreAlignment(I); 6879 const Value *Ptr = getLoadStorePointerOperand(I); 6880 6881 return TTI.getAddressComputationCost(VectorTy) + 6882 TTI.getGatherScatterOpCost( 6883 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6884 TargetTransformInfo::TCK_RecipThroughput, I); 6885 } 6886 6887 InstructionCost 6888 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6889 ElementCount VF) { 6890 // TODO: Once we have support for interleaving with scalable vectors 6891 // we can calculate the cost properly here. 6892 if (VF.isScalable()) 6893 return InstructionCost::getInvalid(); 6894 6895 Type *ValTy = getMemInstValueType(I); 6896 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6897 unsigned AS = getLoadStoreAddressSpace(I); 6898 6899 auto Group = getInterleavedAccessGroup(I); 6900 assert(Group && "Fail to get an interleaved access group."); 6901 6902 unsigned InterleaveFactor = Group->getFactor(); 6903 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6904 6905 // Holds the indices of existing members in an interleaved load group. 6906 // An interleaved store group doesn't need this as it doesn't allow gaps. 6907 SmallVector<unsigned, 4> Indices; 6908 if (isa<LoadInst>(I)) { 6909 for (unsigned i = 0; i < InterleaveFactor; i++) 6910 if (Group->getMember(i)) 6911 Indices.push_back(i); 6912 } 6913 6914 // Calculate the cost of the whole interleaved group. 6915 bool UseMaskForGaps = 6916 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6917 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6918 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6919 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6920 6921 if (Group->isReverse()) { 6922 // TODO: Add support for reversed masked interleaved access. 6923 assert(!Legal->isMaskRequired(I) && 6924 "Reverse masked interleaved access not supported."); 6925 Cost += 6926 Group->getNumMembers() * 6927 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6928 } 6929 return Cost; 6930 } 6931 6932 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 6933 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6934 // Early exit for no inloop reductions 6935 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6936 return InstructionCost::getInvalid(); 6937 auto *VectorTy = cast<VectorType>(Ty); 6938 6939 // We are looking for a pattern of, and finding the minimal acceptable cost: 6940 // reduce(mul(ext(A), ext(B))) or 6941 // reduce(mul(A, B)) or 6942 // reduce(ext(A)) or 6943 // reduce(A). 6944 // The basic idea is that we walk down the tree to do that, finding the root 6945 // reduction instruction in InLoopReductionImmediateChains. From there we find 6946 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6947 // of the components. If the reduction cost is lower then we return it for the 6948 // reduction instruction and 0 for the other instructions in the pattern. If 6949 // it is not we return an invalid cost specifying the orignal cost method 6950 // should be used. 
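// Illustrative example: for an in-loop reduction of the form
//   sum += (i32)(sext i8 a[i]) * (i32)(sext i8 b[i])
// the walk below starts at the ext or mul, follows single users down to the
// add feeding the reduction, and compares the cost of one extended
// multiply-add reduction against the separate ext, mul and reduction costs.
// If the fused form is cheaper, its cost is reported on the root instruction
// of the pattern and 0 on the other members.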
6951 Instruction *RetI = I; 6952 if ((RetI->getOpcode() == Instruction::SExt || 6953 RetI->getOpcode() == Instruction::ZExt)) { 6954 if (!RetI->hasOneUser()) 6955 return InstructionCost::getInvalid(); 6956 RetI = RetI->user_back(); 6957 } 6958 if (RetI->getOpcode() == Instruction::Mul && 6959 RetI->user_back()->getOpcode() == Instruction::Add) { 6960 if (!RetI->hasOneUser()) 6961 return InstructionCost::getInvalid(); 6962 RetI = RetI->user_back(); 6963 } 6964 6965 // Test if the found instruction is a reduction, and if not return an invalid 6966 // cost specifying the parent to use the original cost modelling. 6967 if (!InLoopReductionImmediateChains.count(RetI)) 6968 return InstructionCost::getInvalid(); 6969 6970 // Find the reduction this chain is a part of and calculate the basic cost of 6971 // the reduction on its own. 6972 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6973 Instruction *ReductionPhi = LastChain; 6974 while (!isa<PHINode>(ReductionPhi)) 6975 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6976 6977 RecurrenceDescriptor RdxDesc = 6978 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6979 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6980 VectorTy, false, CostKind); 6981 6982 // Get the operand that was not the reduction chain and match it to one of the 6983 // patterns, returning the better cost if it is found. 6984 Instruction *RedOp = RetI->getOperand(1) == LastChain 6985 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6986 : dyn_cast<Instruction>(RetI->getOperand(1)); 6987 6988 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6989 6990 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6991 !TheLoop->isLoopInvariant(RedOp)) { 6992 bool IsUnsigned = isa<ZExtInst>(RedOp); 6993 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6994 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6995 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6996 CostKind); 6997 6998 unsigned ExtCost = 6999 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7000 TTI::CastContextHint::None, CostKind, RedOp); 7001 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7002 return I == RetI ? *RedCost.getValue() : 0; 7003 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7004 Instruction *Mul = RedOp; 7005 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7006 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7007 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7008 Op0->getOpcode() == Op1->getOpcode() && 7009 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7010 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7011 bool IsUnsigned = isa<ZExtInst>(Op0); 7012 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7013 // reduce(mul(ext, ext)) 7014 unsigned ExtCost = 7015 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7016 TTI::CastContextHint::None, CostKind, Op0); 7017 InstructionCost MulCost = 7018 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7019 7020 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7021 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7022 CostKind); 7023 7024 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7025 return I == RetI ? 
*RedCost.getValue() : 0; 7026 } else { 7027 InstructionCost MulCost = 7028 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7029 7030 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7031 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7032 CostKind); 7033 7034 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7035 return I == RetI ? *RedCost.getValue() : 0; 7036 } 7037 } 7038 7039 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7040 } 7041 7042 InstructionCost 7043 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7044 ElementCount VF) { 7045 // Calculate scalar cost only. Vectorization cost should be ready at this 7046 // moment. 7047 if (VF.isScalar()) { 7048 Type *ValTy = getMemInstValueType(I); 7049 const Align Alignment = getLoadStoreAlignment(I); 7050 unsigned AS = getLoadStoreAddressSpace(I); 7051 7052 return TTI.getAddressComputationCost(ValTy) + 7053 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7054 TTI::TCK_RecipThroughput, I); 7055 } 7056 return getWideningCost(I, VF); 7057 } 7058 7059 LoopVectorizationCostModel::VectorizationCostTy 7060 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7061 ElementCount VF) { 7062 // If we know that this instruction will remain uniform, check the cost of 7063 // the scalar version. 7064 if (isUniformAfterVectorization(I, VF)) 7065 VF = ElementCount::getFixed(1); 7066 7067 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7068 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7069 7070 // Forced scalars do not have any scalarization overhead. 7071 auto ForcedScalar = ForcedScalars.find(VF); 7072 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7073 auto InstSet = ForcedScalar->second; 7074 if (InstSet.count(I)) 7075 return VectorizationCostTy( 7076 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7077 VF.getKnownMinValue()), 7078 false); 7079 } 7080 7081 Type *VectorTy; 7082 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7083 7084 bool TypeNotScalarized = 7085 VF.isVector() && VectorTy->isVectorTy() && 7086 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7087 return VectorizationCostTy(C, TypeNotScalarized); 7088 } 7089 7090 InstructionCost 7091 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7092 ElementCount VF) const { 7093 7094 if (VF.isScalable()) 7095 return InstructionCost::getInvalid(); 7096 7097 if (VF.isScalar()) 7098 return 0; 7099 7100 InstructionCost Cost = 0; 7101 Type *RetTy = ToVectorTy(I->getType(), VF); 7102 if (!RetTy->isVoidTy() && 7103 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7104 Cost += TTI.getScalarizationOverhead( 7105 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7106 true, false); 7107 7108 // Some targets keep addresses scalar. 7109 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7110 return Cost; 7111 7112 // Some targets support efficient element stores. 7113 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7114 return Cost; 7115 7116 // Collect operands to consider. 7117 CallInst *CI = dyn_cast<CallInst>(I); 7118 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7119 7120 // Skip operands that do not require extraction/scalarization and do not incur 7121 // any overhead. 
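  // For the operands handled below, a rough illustration only (not a precise
  // target cost): with VF = 4, each vector operand of a scalarized call
  // contributes roughly four extractelement costs, on top of the
  // insertelement costs for the results accounted for above.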
7122 SmallVector<Type *> Tys; 7123 for (auto *V : filterExtractingOperands(Ops, VF)) 7124 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7125 return Cost + TTI.getOperandsScalarizationOverhead( 7126 filterExtractingOperands(Ops, VF), Tys); 7127 } 7128 7129 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7130 if (VF.isScalar()) 7131 return; 7132 NumPredStores = 0; 7133 for (BasicBlock *BB : TheLoop->blocks()) { 7134 // For each instruction in the old loop. 7135 for (Instruction &I : *BB) { 7136 Value *Ptr = getLoadStorePointerOperand(&I); 7137 if (!Ptr) 7138 continue; 7139 7140 // TODO: We should generate better code and update the cost model for 7141 // predicated uniform stores. Today they are treated as any other 7142 // predicated store (see added test cases in 7143 // invariant-store-vectorization.ll). 7144 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7145 NumPredStores++; 7146 7147 if (Legal->isUniformMemOp(I)) { 7148 // TODO: Avoid replicating loads and stores instead of 7149 // relying on instcombine to remove them. 7150 // Load: Scalar load + broadcast 7151 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7152 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7153 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7154 continue; 7155 } 7156 7157 // We assume that widening is the best solution when possible. 7158 if (memoryInstructionCanBeWidened(&I, VF)) { 7159 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7160 int ConsecutiveStride = 7161 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7162 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7163 "Expected consecutive stride."); 7164 InstWidening Decision = 7165 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7166 setWideningDecision(&I, VF, Decision, Cost); 7167 continue; 7168 } 7169 7170 // Choose between Interleaving, Gather/Scatter or Scalarization. 7171 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7172 unsigned NumAccesses = 1; 7173 if (isAccessInterleaved(&I)) { 7174 auto Group = getInterleavedAccessGroup(&I); 7175 assert(Group && "Fail to get an interleaved access group."); 7176 7177 // Make one decision for the whole group. 7178 if (getWideningDecision(&I, VF) != CM_Unknown) 7179 continue; 7180 7181 NumAccesses = Group->getNumMembers(); 7182 if (interleavedAccessCanBeWidened(&I, VF)) 7183 InterleaveCost = getInterleaveGroupCost(&I, VF); 7184 } 7185 7186 InstructionCost GatherScatterCost = 7187 isLegalGatherOrScatter(&I) 7188 ? getGatherScatterCost(&I, VF) * NumAccesses 7189 : InstructionCost::getInvalid(); 7190 7191 InstructionCost ScalarizationCost = 7192 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses 7193 : InstructionCost::getInvalid(); 7194 7195 // Choose better solution for the current VF, 7196 // write down this decision and use it during vectorization. 7197 InstructionCost Cost; 7198 InstWidening Decision; 7199 if (InterleaveCost <= GatherScatterCost && 7200 InterleaveCost < ScalarizationCost) { 7201 Decision = CM_Interleave; 7202 Cost = InterleaveCost; 7203 } else if (GatherScatterCost < ScalarizationCost) { 7204 Decision = CM_GatherScatter; 7205 Cost = GatherScatterCost; 7206 } else { 7207 assert(!VF.isScalable() && 7208 "We cannot yet scalarise for scalable vectors"); 7209 Decision = CM_Scalarize; 7210 Cost = ScalarizationCost; 7211 } 7212 // If the instructions belongs to an interleave group, the whole group 7213 // receives the same decision. 
The whole group receives the cost, but 7214 // the cost will actually be assigned to one instruction. 7215 if (auto Group = getInterleavedAccessGroup(&I)) 7216 setWideningDecision(Group, VF, Decision, Cost); 7217 else 7218 setWideningDecision(&I, VF, Decision, Cost); 7219 } 7220 } 7221 7222 // Make sure that any load of address and any other address computation 7223 // remains scalar unless there is gather/scatter support. This avoids 7224 // inevitable extracts into address registers, and also has the benefit of 7225 // activating LSR more, since that pass can't optimize vectorized 7226 // addresses. 7227 if (TTI.prefersVectorizedAddressing()) 7228 return; 7229 7230 // Start with all scalar pointer uses. 7231 SmallPtrSet<Instruction *, 8> AddrDefs; 7232 for (BasicBlock *BB : TheLoop->blocks()) 7233 for (Instruction &I : *BB) { 7234 Instruction *PtrDef = 7235 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7236 if (PtrDef && TheLoop->contains(PtrDef) && 7237 getWideningDecision(&I, VF) != CM_GatherScatter) 7238 AddrDefs.insert(PtrDef); 7239 } 7240 7241 // Add all instructions used to generate the addresses. 7242 SmallVector<Instruction *, 4> Worklist; 7243 append_range(Worklist, AddrDefs); 7244 while (!Worklist.empty()) { 7245 Instruction *I = Worklist.pop_back_val(); 7246 for (auto &Op : I->operands()) 7247 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7248 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7249 AddrDefs.insert(InstOp).second) 7250 Worklist.push_back(InstOp); 7251 } 7252 7253 for (auto *I : AddrDefs) { 7254 if (isa<LoadInst>(I)) { 7255 // Setting the desired widening decision should ideally be handled in 7256 // by cost functions, but since this involves the task of finding out 7257 // if the loaded register is involved in an address computation, it is 7258 // instead changed here when we know this is the case. 7259 InstWidening Decision = getWideningDecision(I, VF); 7260 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7261 // Scalarize a widened load of address. 7262 setWideningDecision( 7263 I, VF, CM_Scalarize, 7264 (VF.getKnownMinValue() * 7265 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7266 else if (auto Group = getInterleavedAccessGroup(I)) { 7267 // Scalarize an interleave group of address loads. 7268 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7269 if (Instruction *Member = Group->getMember(I)) 7270 setWideningDecision( 7271 Member, VF, CM_Scalarize, 7272 (VF.getKnownMinValue() * 7273 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7274 } 7275 } 7276 } else 7277 // Make sure I gets scalarized and a cost estimate without 7278 // scalarization overhead. 7279 ForcedScalars[VF].insert(I); 7280 } 7281 } 7282 7283 InstructionCost 7284 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7285 Type *&VectorTy) { 7286 Type *RetTy = I->getType(); 7287 if (canTruncateToMinimalBitwidth(I, VF)) 7288 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7289 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7290 auto SE = PSE.getSE(); 7291 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7292 7293 // TODO: We need to estimate the cost of intrinsic calls. 7294 switch (I->getOpcode()) { 7295 case Instruction::GetElementPtr: 7296 // We mark this instruction as zero-cost because the cost of GEPs in 7297 // vectorized code depends on whether the corresponding memory instruction 7298 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7299 // instruction cost. 7300 return 0; 7301 case Instruction::Br: { 7302 // In cases of scalarized and predicated instructions, there will be VF 7303 // predicated blocks in the vectorized loop. Each branch around these 7304 // blocks requires also an extract of its vector compare i1 element. 7305 bool ScalarPredicatedBB = false; 7306 BranchInst *BI = cast<BranchInst>(I); 7307 if (VF.isVector() && BI->isConditional() && 7308 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7309 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7310 ScalarPredicatedBB = true; 7311 7312 if (ScalarPredicatedBB) { 7313 // Return cost for branches around scalarized and predicated blocks. 7314 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7315 auto *Vec_i1Ty = 7316 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7317 return (TTI.getScalarizationOverhead( 7318 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7319 false, true) + 7320 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7321 VF.getKnownMinValue())); 7322 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7323 // The back-edge branch will remain, as will all scalar branches. 7324 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7325 else 7326 // This branch will be eliminated by if-conversion. 7327 return 0; 7328 // Note: We currently assume zero cost for an unconditional branch inside 7329 // a predicated block since it will become a fall-through, although we 7330 // may decide in the future to call TTI for all branches. 7331 } 7332 case Instruction::PHI: { 7333 auto *Phi = cast<PHINode>(I); 7334 7335 // First-order recurrences are replaced by vector shuffles inside the loop. 7336 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7337 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7338 return TTI.getShuffleCost( 7339 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7340 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7341 7342 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7343 // converted into select instructions. We require N - 1 selects per phi 7344 // node, where N is the number of incoming values. 7345 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7346 return (Phi->getNumIncomingValues() - 1) * 7347 TTI.getCmpSelInstrCost( 7348 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7349 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7350 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7351 7352 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7353 } 7354 case Instruction::UDiv: 7355 case Instruction::SDiv: 7356 case Instruction::URem: 7357 case Instruction::SRem: 7358 // If we have a predicated instruction, it may not be executed for each 7359 // vector lane. Get the scalarization cost and scale this amount by the 7360 // probability of executing the predicated block. If the instruction is not 7361 // predicated, we fall through to the next case. 7362 if (VF.isVector() && isScalarWithPredication(I)) { 7363 InstructionCost Cost = 0; 7364 7365 // These instructions have a non-void type, so account for the phi nodes 7366 // that we will create. This cost is likely to be zero. The phi node 7367 // cost, if any, should be scaled by the block probability because it 7368 // models a copy at the end of each predicated block. 
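      // Rough illustration (assuming getReciprocalPredBlockProb() == 2, i.e.
      // the predicated block is assumed to execute with probability 1/2): for
      // VF = 4 the estimate built below is roughly
      //   (4 * phi + 4 * div/rem + scalarization overhead) / 2.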
7369 Cost += VF.getKnownMinValue() * 7370 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7371 7372 // The cost of the non-predicated instruction. 7373 Cost += VF.getKnownMinValue() * 7374 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7375 7376 // The cost of insertelement and extractelement instructions needed for 7377 // scalarization. 7378 Cost += getScalarizationOverhead(I, VF); 7379 7380 // Scale the cost by the probability of executing the predicated blocks. 7381 // This assumes the predicated block for each vector lane is equally 7382 // likely. 7383 return Cost / getReciprocalPredBlockProb(); 7384 } 7385 LLVM_FALLTHROUGH; 7386 case Instruction::Add: 7387 case Instruction::FAdd: 7388 case Instruction::Sub: 7389 case Instruction::FSub: 7390 case Instruction::Mul: 7391 case Instruction::FMul: 7392 case Instruction::FDiv: 7393 case Instruction::FRem: 7394 case Instruction::Shl: 7395 case Instruction::LShr: 7396 case Instruction::AShr: 7397 case Instruction::And: 7398 case Instruction::Or: 7399 case Instruction::Xor: { 7400 // Since we will replace the stride by 1 the multiplication should go away. 7401 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7402 return 0; 7403 7404 // Detect reduction patterns 7405 InstructionCost RedCost; 7406 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7407 .isValid()) 7408 return RedCost; 7409 7410 // Certain instructions can be cheaper to vectorize if they have a constant 7411 // second vector operand. One example of this are shifts on x86. 7412 Value *Op2 = I->getOperand(1); 7413 TargetTransformInfo::OperandValueProperties Op2VP; 7414 TargetTransformInfo::OperandValueKind Op2VK = 7415 TTI.getOperandInfo(Op2, Op2VP); 7416 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7417 Op2VK = TargetTransformInfo::OK_UniformValue; 7418 7419 SmallVector<const Value *, 4> Operands(I->operand_values()); 7420 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7421 return N * TTI.getArithmeticInstrCost( 7422 I->getOpcode(), VectorTy, CostKind, 7423 TargetTransformInfo::OK_AnyValue, 7424 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7425 } 7426 case Instruction::FNeg: { 7427 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7428 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7429 return N * TTI.getArithmeticInstrCost( 7430 I->getOpcode(), VectorTy, CostKind, 7431 TargetTransformInfo::OK_AnyValue, 7432 TargetTransformInfo::OK_AnyValue, 7433 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7434 I->getOperand(0), I); 7435 } 7436 case Instruction::Select: { 7437 SelectInst *SI = cast<SelectInst>(I); 7438 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7439 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7440 Type *CondTy = SI->getCondition()->getType(); 7441 if (!ScalarCond) 7442 CondTy = VectorType::get(CondTy, VF); 7443 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7444 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7445 } 7446 case Instruction::ICmp: 7447 case Instruction::FCmp: { 7448 Type *ValTy = I->getOperand(0)->getType(); 7449 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7450 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7451 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7452 VectorTy = ToVectorTy(ValTy, VF); 7453 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7454 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7455 } 7456 case Instruction::Store: 7457 case Instruction::Load: { 7458 ElementCount Width = VF; 7459 if (Width.isVector()) { 7460 InstWidening Decision = getWideningDecision(I, Width); 7461 assert(Decision != CM_Unknown && 7462 "CM decision should be taken at this point"); 7463 if (Decision == CM_Scalarize) 7464 Width = ElementCount::getFixed(1); 7465 } 7466 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7467 return getMemoryInstructionCost(I, VF); 7468 } 7469 case Instruction::ZExt: 7470 case Instruction::SExt: 7471 case Instruction::FPToUI: 7472 case Instruction::FPToSI: 7473 case Instruction::FPExt: 7474 case Instruction::PtrToInt: 7475 case Instruction::IntToPtr: 7476 case Instruction::SIToFP: 7477 case Instruction::UIToFP: 7478 case Instruction::Trunc: 7479 case Instruction::FPTrunc: 7480 case Instruction::BitCast: { 7481 // Computes the CastContextHint from a Load/Store instruction. 7482 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7483 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7484 "Expected a load or a store!"); 7485 7486 if (VF.isScalar() || !TheLoop->contains(I)) 7487 return TTI::CastContextHint::Normal; 7488 7489 switch (getWideningDecision(I, VF)) { 7490 case LoopVectorizationCostModel::CM_GatherScatter: 7491 return TTI::CastContextHint::GatherScatter; 7492 case LoopVectorizationCostModel::CM_Interleave: 7493 return TTI::CastContextHint::Interleave; 7494 case LoopVectorizationCostModel::CM_Scalarize: 7495 case LoopVectorizationCostModel::CM_Widen: 7496 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7497 : TTI::CastContextHint::Normal; 7498 case LoopVectorizationCostModel::CM_Widen_Reverse: 7499 return TTI::CastContextHint::Reversed; 7500 case LoopVectorizationCostModel::CM_Unknown: 7501 llvm_unreachable("Instr did not go through cost modelling?"); 7502 } 7503 7504 llvm_unreachable("Unhandled case!"); 7505 }; 7506 7507 unsigned Opcode = I->getOpcode(); 7508 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7509 // For Trunc, the context is the only user, which must be a StoreInst. 
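    // Illustrative IR for the two cases handled below (hypothetical, not from
    // a test case):
    //   %t = trunc i32 %x to i8            ; CCH comes from the store user
    //   store i8 %t, i8* %p
    //   %l = load i16, i16* %q
    //   %e = zext i16 %l to i32            ; CCH comes from the load operand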
7510 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7511 if (I->hasOneUse()) 7512 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7513 CCH = ComputeCCH(Store); 7514 } 7515 // For Z/Sext, the context is the operand, which must be a LoadInst. 7516 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7517 Opcode == Instruction::FPExt) { 7518 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7519 CCH = ComputeCCH(Load); 7520 } 7521 7522 // We optimize the truncation of induction variables having constant 7523 // integer steps. The cost of these truncations is the same as the scalar 7524 // operation. 7525 if (isOptimizableIVTruncate(I, VF)) { 7526 auto *Trunc = cast<TruncInst>(I); 7527 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7528 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7529 } 7530 7531 // Detect reduction patterns 7532 InstructionCost RedCost; 7533 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7534 .isValid()) 7535 return RedCost; 7536 7537 Type *SrcScalarTy = I->getOperand(0)->getType(); 7538 Type *SrcVecTy = 7539 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7540 if (canTruncateToMinimalBitwidth(I, VF)) { 7541 // This cast is going to be shrunk. This may remove the cast or it might 7542 // turn it into slightly different cast. For example, if MinBW == 16, 7543 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7544 // 7545 // Calculate the modified src and dest types. 7546 Type *MinVecTy = VectorTy; 7547 if (Opcode == Instruction::Trunc) { 7548 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7549 VectorTy = 7550 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7551 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7552 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7553 VectorTy = 7554 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7555 } 7556 } 7557 7558 unsigned N; 7559 if (isScalarAfterVectorization(I, VF)) { 7560 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7561 N = VF.getKnownMinValue(); 7562 } else 7563 N = 1; 7564 return N * 7565 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7566 } 7567 case Instruction::Call: { 7568 bool NeedToScalarize; 7569 CallInst *CI = cast<CallInst>(I); 7570 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7571 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7572 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7573 return std::min(CallCost, IntrinsicCost); 7574 } 7575 return CallCost; 7576 } 7577 case Instruction::ExtractValue: 7578 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7579 default: 7580 // The cost of executing VF copies of the scalar instruction. This opcode 7581 // is unknown. Assume that it is the same as 'mul'. 7582 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7583 Instruction::Mul, VectorTy, CostKind) + 7584 getScalarizationOverhead(I, VF); 7585 } // end of switch. 
7586 } 7587 7588 char LoopVectorize::ID = 0; 7589 7590 static const char lv_name[] = "Loop Vectorization"; 7591 7592 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7593 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7594 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7595 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7596 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7597 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7598 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7599 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7600 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7601 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7602 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7603 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7604 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7605 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7606 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7607 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7608 7609 namespace llvm { 7610 7611 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7612 7613 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7614 bool VectorizeOnlyWhenForced) { 7615 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7616 } 7617 7618 } // end namespace llvm 7619 7620 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7621 // Check if the pointer operand of a load or store instruction is 7622 // consecutive. 7623 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7624 return Legal->isConsecutivePtr(Ptr); 7625 return false; 7626 } 7627 7628 void LoopVectorizationCostModel::collectValuesToIgnore() { 7629 // Ignore ephemeral values. 7630 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7631 7632 // Ignore type-promoting instructions we identified during reduction 7633 // detection. 7634 for (auto &Reduction : Legal->getReductionVars()) { 7635 RecurrenceDescriptor &RedDes = Reduction.second; 7636 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7637 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7638 } 7639 // Ignore type-casting instructions we identified during induction 7640 // detection. 7641 for (auto &Induction : Legal->getInductionVars()) { 7642 InductionDescriptor &IndDes = Induction.second; 7643 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7644 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7645 } 7646 } 7647 7648 void LoopVectorizationCostModel::collectInLoopReductions() { 7649 for (auto &Reduction : Legal->getReductionVars()) { 7650 PHINode *Phi = Reduction.first; 7651 RecurrenceDescriptor &RdxDesc = Reduction.second; 7652 7653 // We don't collect reductions that are type promoted (yet). 7654 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7655 continue; 7656 7657 // If the target would prefer this reduction to happen "in-loop", then we 7658 // want to record it as such. 7659 unsigned Opcode = RdxDesc.getOpcode(); 7660 if (!PreferInLoopReductions && 7661 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7662 TargetTransformInfo::ReductionFlags())) 7663 continue; 7664 7665 // Check that we can correctly put the reductions into the loop, by 7666 // finding the chain of operations that leads from the phi to the loop 7667 // exit value. 
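    // Illustrative example (hypothetical IR): for an integer add reduction
    //   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
    //   %sum.next = add i32 %sum, %val
    // getReductionOpChain() returns { %sum.next }, and the loop below records
    // %sum.next -> %sum in InLoopReductionImmediateChains for the cost model.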
7668 SmallVector<Instruction *, 4> ReductionOperations = 7669 RdxDesc.getReductionOpChain(Phi, TheLoop); 7670 bool InLoop = !ReductionOperations.empty(); 7671 if (InLoop) { 7672 InLoopReductionChains[Phi] = ReductionOperations; 7673 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7674 Instruction *LastChain = Phi; 7675 for (auto *I : ReductionOperations) { 7676 InLoopReductionImmediateChains[I] = LastChain; 7677 LastChain = I; 7678 } 7679 } 7680 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7681 << " reduction for phi: " << *Phi << "\n"); 7682 } 7683 } 7684 7685 // TODO: we could return a pair of values that specify the max VF and 7686 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7687 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7688 // doesn't have a cost model that can choose which plan to execute if 7689 // more than one is generated. 7690 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7691 LoopVectorizationCostModel &CM) { 7692 unsigned WidestType; 7693 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7694 return WidestVectorRegBits / WidestType; 7695 } 7696 7697 VectorizationFactor 7698 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7699 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7700 ElementCount VF = UserVF; 7701 // Outer loop handling: They may require CFG and instruction level 7702 // transformations before even evaluating whether vectorization is profitable. 7703 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7704 // the vectorization pipeline. 7705 if (!OrigLoop->isInnermost()) { 7706 // If the user doesn't provide a vectorization factor, determine a 7707 // reasonable one. 7708 if (UserVF.isZero()) { 7709 VF = ElementCount::getFixed(determineVPlanVF( 7710 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7711 .getFixedSize(), 7712 CM)); 7713 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7714 7715 // Make sure we have a VF > 1 for stress testing. 7716 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7717 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7718 << "overriding computed VF.\n"); 7719 VF = ElementCount::getFixed(4); 7720 } 7721 } 7722 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7723 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7724 "VF needs to be a power of two"); 7725 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7726 << "VF " << VF << " to build VPlans.\n"); 7727 buildVPlans(VF, VF); 7728 7729 // For VPlan build stress testing, we bail out after VPlan construction. 7730 if (VPlanBuildStressTest) 7731 return VectorizationFactor::Disabled(); 7732 7733 return {VF, 0 /*Cost*/}; 7734 } 7735 7736 LLVM_DEBUG( 7737 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7738 "VPlan-native path.\n"); 7739 return VectorizationFactor::Disabled(); 7740 } 7741 7742 Optional<VectorizationFactor> 7743 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7744 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7745 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7746 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7747 return None; 7748 7749 // Invalidate interleave groups if all blocks of loop will be predicated. 
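  // Rationale (restating the debug message below): when the tail is folded by
  // masking, every access in the loop becomes a masked access, and interleave
  // groups can only be kept if the target supports masked interleaved
  // accesses.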
7750 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7751 !useMaskedInterleavedAccesses(*TTI)) { 7752 LLVM_DEBUG( 7753 dbgs() 7754 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7755 "which requires masked-interleaved support.\n"); 7756 if (CM.InterleaveInfo.invalidateGroups()) 7757 // Invalidating interleave groups also requires invalidating all decisions 7758 // based on them, which includes widening decisions and uniform and scalar 7759 // values. 7760 CM.invalidateCostModelingDecisions(); 7761 } 7762 7763 ElementCount MaxVF = MaybeMaxVF.getValue(); 7764 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7765 7766 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7767 if (!UserVF.isZero() && 7768 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7769 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7770 // VFs here, this should be reverted to only use legal UserVFs once the 7771 // loop below supports scalable VFs. 7772 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7773 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7774 << " VF " << VF << ".\n"); 7775 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7776 "VF needs to be a power of two"); 7777 // Collect the instructions (and their associated costs) that will be more 7778 // profitable to scalarize. 7779 CM.selectUserVectorizationFactor(VF); 7780 CM.collectInLoopReductions(); 7781 buildVPlansWithVPRecipes(VF, VF); 7782 LLVM_DEBUG(printPlans(dbgs())); 7783 return {{VF, 0}}; 7784 } 7785 7786 assert(!MaxVF.isScalable() && 7787 "Scalable vectors not yet supported beyond this point"); 7788 7789 for (ElementCount VF = ElementCount::getFixed(1); 7790 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7791 // Collect Uniform and Scalar instructions after vectorization with VF. 7792 CM.collectUniformsAndScalars(VF); 7793 7794 // Collect the instructions (and their associated costs) that will be more 7795 // profitable to scalarize. 7796 if (VF.isVector()) 7797 CM.collectInstsToScalarize(VF); 7798 } 7799 7800 CM.collectInLoopReductions(); 7801 7802 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7803 LLVM_DEBUG(printPlans(dbgs())); 7804 if (MaxVF.isScalar()) 7805 return VectorizationFactor::Disabled(); 7806 7807 // Select the optimal vectorization factor. 7808 auto SelectedVF = CM.selectVectorizationFactor(MaxVF); 7809 7810 // Check if it is profitable to vectorize with runtime checks. 
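  // Illustrative example only (the actual limits are cl::opt-controlled): if
  // the chosen VF needed, say, hundreds of runtime pointer-aliasing checks,
  // that would exceed the memory-check threshold and, unless a pragma/hint
  // allows reordering, vectorization is given up below.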
7811 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7812 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7813 bool PragmaThresholdReached = 7814 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7815 bool ThresholdReached = 7816 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7817 if ((ThresholdReached && !Hints.allowReordering()) || 7818 PragmaThresholdReached) { 7819 ORE->emit([&]() { 7820 return OptimizationRemarkAnalysisAliasing( 7821 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7822 OrigLoop->getHeader()) 7823 << "loop not vectorized: cannot prove it is safe to reorder " 7824 "memory operations"; 7825 }); 7826 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7827 Hints.emitRemarkWithHints(); 7828 return VectorizationFactor::Disabled(); 7829 } 7830 } 7831 return SelectedVF; 7832 } 7833 7834 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7835 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7836 << '\n'); 7837 BestVF = VF; 7838 BestUF = UF; 7839 7840 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7841 return !Plan->hasVF(VF); 7842 }); 7843 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7844 } 7845 7846 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7847 DominatorTree *DT) { 7848 // Perform the actual loop transformation. 7849 7850 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7851 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7852 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7853 7854 VPTransformState State{ 7855 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 7856 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7857 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7858 State.CanonicalIV = ILV.Induction; 7859 7860 ILV.printDebugTracesAtStart(); 7861 7862 //===------------------------------------------------===// 7863 // 7864 // Notice: any optimization or new instruction that go 7865 // into the code below should also be implemented in 7866 // the cost-model. 7867 // 7868 //===------------------------------------------------===// 7869 7870 // 2. Copy and widen instructions from the old loop into the new loop. 7871 VPlans.front()->execute(&State); 7872 7873 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7874 // predication, updating analyses. 
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control flow for the vectorized loop, so the original exit
  // condition will be dead after vectorization if it is only used by the
  // terminator.
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop.
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // An operand of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general.
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
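  // Illustrative example (hypothetical values): for an integer induction with
  // Val == %iv, StartIdx == 2 and Step == %step, the unrolled part receives
  // the scalar %iv + 2 * %step (named "induction" below).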
7954 Type *Ty = Val->getType(); 7955 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7956 7957 if (Ty->isFloatingPointTy()) { 7958 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7959 7960 // Floating-point operations inherit FMF via the builder's flags. 7961 Value *MulOp = Builder.CreateFMul(C, Step); 7962 return Builder.CreateBinOp(BinOp, Val, MulOp); 7963 } 7964 Constant *C = ConstantInt::get(Ty, StartIdx); 7965 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7966 } 7967 7968 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7969 SmallVector<Metadata *, 4> MDs; 7970 // Reserve first location for self reference to the LoopID metadata node. 7971 MDs.push_back(nullptr); 7972 bool IsUnrollMetadata = false; 7973 MDNode *LoopID = L->getLoopID(); 7974 if (LoopID) { 7975 // First find existing loop unrolling disable metadata. 7976 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7977 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7978 if (MD) { 7979 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7980 IsUnrollMetadata = 7981 S && S->getString().startswith("llvm.loop.unroll.disable"); 7982 } 7983 MDs.push_back(LoopID->getOperand(i)); 7984 } 7985 } 7986 7987 if (!IsUnrollMetadata) { 7988 // Add runtime unroll disable metadata. 7989 LLVMContext &Context = L->getHeader()->getContext(); 7990 SmallVector<Metadata *, 1> DisableOperands; 7991 DisableOperands.push_back( 7992 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7993 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7994 MDs.push_back(DisableNode); 7995 MDNode *NewLoopID = MDNode::get(Context, MDs); 7996 // Set operand 0 to refer to the loop id itself. 7997 NewLoopID->replaceOperandWith(0, NewLoopID); 7998 L->setLoopID(NewLoopID); 7999 } 8000 } 8001 8002 //===--------------------------------------------------------------------===// 8003 // EpilogueVectorizerMainLoop 8004 //===--------------------------------------------------------------------===// 8005 8006 /// This function is partially responsible for generating the control flow 8007 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8008 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8009 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8010 Loop *Lp = createVectorLoopSkeleton(""); 8011 8012 // Generate the code to check the minimum iteration count of the vector 8013 // epilogue (see below). 8014 EPI.EpilogueIterationCountCheck = 8015 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8016 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8017 8018 // Generate the code to check any assumptions that we've made for SCEV 8019 // expressions. 8020 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8021 8022 // Generate the code that checks at runtime if arrays overlap. We put the 8023 // checks into a separate block to make the more common case of few elements 8024 // faster. 8025 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8026 8027 // Generate the iteration count check for the main loop, *after* the check 8028 // for the epilogue loop, so that the path-length is shorter for the case 8029 // that goes directly through the vector epilogue. The longer-path length for 8030 // the main loop is compensated for, by the gain from vectorizing the larger 8031 // trip count. Note: the branch will get updated later on when we vectorize 8032 // the epilogue. 
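  // Rough sketch of the intended check ordering once both passes have run
  // (block names as set in this file): "iter.check" guards the epilogue VF,
  // "vector.main.loop.iter.check" guards the main VF, and when the latter
  // fails control eventually reaches "vec.epilog.iter.check"/"vec.epilog.ph",
  // which are created by the second pass. See the Vectorizers documentation
  // link above for the full picture.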
8033 EPI.MainLoopIterationCountCheck = 8034 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8035 8036 // Generate the induction variable. 8037 OldInduction = Legal->getPrimaryInduction(); 8038 Type *IdxTy = Legal->getWidestInductionType(); 8039 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8040 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8041 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8042 EPI.VectorTripCount = CountRoundDown; 8043 Induction = 8044 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8045 getDebugLocFromInstOrOperands(OldInduction)); 8046 8047 // Skip induction resume value creation here because they will be created in 8048 // the second pass. If we created them here, they wouldn't be used anyway, 8049 // because the vplan in the second pass still contains the inductions from the 8050 // original loop. 8051 8052 return completeLoopSkeleton(Lp, OrigLoopID); 8053 } 8054 8055 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8056 LLVM_DEBUG({ 8057 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8058 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8059 << ", Main Loop UF:" << EPI.MainLoopUF 8060 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8061 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8062 }); 8063 } 8064 8065 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8066 DEBUG_WITH_TYPE(VerboseDebug, { 8067 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8068 }); 8069 } 8070 8071 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8072 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8073 assert(L && "Expected valid Loop."); 8074 assert(Bypass && "Expected valid bypass basic block."); 8075 unsigned VFactor = 8076 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8077 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8078 Value *Count = getOrCreateTripCount(L); 8079 // Reuse existing vector loop preheader for TC checks. 8080 // Note that new preheader block is generated for vector loop. 8081 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8082 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8083 8084 // Generate code to check if the loop's trip count is less than VF * UF of the 8085 // main vector loop. 8086 auto P = 8087 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8088 8089 Value *CheckMinIters = Builder.CreateICmp( 8090 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8091 "min.iters.check"); 8092 8093 if (!ForEpilogue) 8094 TCCheckBlock->setName("vector.main.loop.iter.check"); 8095 8096 // Create new preheader for vector loop. 8097 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8098 DT, LI, nullptr, "vector.ph"); 8099 8100 if (ForEpilogue) { 8101 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8102 DT->getNode(Bypass)->getIDom()) && 8103 "TC check is expected to dominate Bypass"); 8104 8105 // Update dominator for Bypass & LoopExit. 8106 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8107 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8108 8109 LoopBypassBlocks.push_back(TCCheckBlock); 8110 8111 // Save the trip count so we don't have to regenerate it in the 8112 // vec.epilog.iter.check. This is safe to do because the trip count 8113 // generated here dominates the vector epilog iter check. 
8114 EPI.TripCount = Count; 8115 } 8116 8117 ReplaceInstWithInst( 8118 TCCheckBlock->getTerminator(), 8119 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8120 8121 return TCCheckBlock; 8122 } 8123 8124 //===--------------------------------------------------------------------===// 8125 // EpilogueVectorizerEpilogueLoop 8126 //===--------------------------------------------------------------------===// 8127 8128 /// This function is partially responsible for generating the control flow 8129 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8130 BasicBlock * 8131 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8132 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8133 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8134 8135 // Now, compare the remaining count and if there aren't enough iterations to 8136 // execute the vectorized epilogue skip to the scalar part. 8137 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8138 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8139 LoopVectorPreHeader = 8140 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8141 LI, nullptr, "vec.epilog.ph"); 8142 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8143 VecEpilogueIterationCountCheck); 8144 8145 // Adjust the control flow taking the state info from the main loop 8146 // vectorization into account. 8147 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8148 "expected this to be saved from the previous pass."); 8149 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8150 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8151 8152 DT->changeImmediateDominator(LoopVectorPreHeader, 8153 EPI.MainLoopIterationCountCheck); 8154 8155 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8156 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8157 8158 if (EPI.SCEVSafetyCheck) 8159 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8160 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8161 if (EPI.MemSafetyCheck) 8162 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8163 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8164 8165 DT->changeImmediateDominator( 8166 VecEpilogueIterationCountCheck, 8167 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8168 8169 DT->changeImmediateDominator(LoopScalarPreHeader, 8170 EPI.EpilogueIterationCountCheck); 8171 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8172 8173 // Keep track of bypass blocks, as they feed start values to the induction 8174 // phis in the scalar loop preheader. 8175 if (EPI.SCEVSafetyCheck) 8176 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8177 if (EPI.MemSafetyCheck) 8178 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8179 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8180 8181 // Generate a resume induction for the vector epilogue and put it in the 8182 // vector epilogue preheader 8183 Type *IdxTy = Legal->getWidestInductionType(); 8184 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8185 LoopVectorPreHeader->getFirstNonPHI()); 8186 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8187 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8188 EPI.MainLoopIterationCountCheck); 8189 8190 // Generate the induction variable. 
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from the
  // trip count of the main vector loop, hence the AdditionalBypass argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P =
      Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8232 8233 Value *CheckMinIters = Builder.CreateICmp( 8234 P, Count, 8235 ConstantInt::get(Count->getType(), 8236 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8237 "min.epilog.iters.check"); 8238 8239 ReplaceInstWithInst( 8240 Insert->getTerminator(), 8241 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8242 8243 LoopBypassBlocks.push_back(Insert); 8244 return Insert; 8245 } 8246 8247 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8248 LLVM_DEBUG({ 8249 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8250 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8251 << ", Main Loop UF:" << EPI.MainLoopUF 8252 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8253 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8254 }); 8255 } 8256 8257 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8258 DEBUG_WITH_TYPE(VerboseDebug, { 8259 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8260 }); 8261 } 8262 8263 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8264 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8265 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8266 bool PredicateAtRangeStart = Predicate(Range.Start); 8267 8268 for (ElementCount TmpVF = Range.Start * 2; 8269 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8270 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8271 Range.End = TmpVF; 8272 break; 8273 } 8274 8275 return PredicateAtRangeStart; 8276 } 8277 8278 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8279 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8280 /// of VF's starting at a given VF and extending it as much as possible. Each 8281 /// vectorization decision can potentially shorten this sub-range during 8282 /// buildVPlan(). 8283 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8284 ElementCount MaxVF) { 8285 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8286 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8287 VFRange SubRange = {VF, MaxVFPlusOne}; 8288 VPlans.push_back(buildVPlan(SubRange)); 8289 VF = SubRange.End; 8290 } 8291 } 8292 8293 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8294 VPlanPtr &Plan) { 8295 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8296 8297 // Look for cached value. 8298 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8299 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8300 if (ECEntryIt != EdgeMaskCache.end()) 8301 return ECEntryIt->second; 8302 8303 VPValue *SrcMask = createBlockInMask(Src, Plan); 8304 8305 // The terminator has to be a branch inst! 8306 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8307 assert(BI && "Unexpected terminator found"); 8308 8309 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8310 return EdgeMaskCache[Edge] = SrcMask; 8311 8312 // If source is an exiting block, we know the exit edge is dynamically dead 8313 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8314 // adding uses of an otherwise potentially dead instruction. 
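  // Illustrative example (hypothetical IR): for an exiting block terminated by
  //   br i1 %exit.cond, label %loop.exit, label %next.bb
  // the edge into %next.bb simply keeps the source block's mask, since the
  // vector trip count guarantees the early exit is not taken from within a
  // vector iteration.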
8315 if (OrigLoop->isLoopExiting(Src)) 8316 return EdgeMaskCache[Edge] = SrcMask; 8317 8318 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8319 assert(EdgeMask && "No Edge Mask found for condition"); 8320 8321 if (BI->getSuccessor(0) != Dst) 8322 EdgeMask = Builder.createNot(EdgeMask); 8323 8324 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8325 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8326 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8327 // The select version does not introduce new UB if SrcMask is false and 8328 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8329 VPValue *False = Plan->getOrAddVPValue( 8330 ConstantInt::getFalse(BI->getCondition()->getType())); 8331 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8332 } 8333 8334 return EdgeMaskCache[Edge] = EdgeMask; 8335 } 8336 8337 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8338 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8339 8340 // Look for cached value. 8341 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8342 if (BCEntryIt != BlockMaskCache.end()) 8343 return BCEntryIt->second; 8344 8345 // All-one mask is modelled as no-mask following the convention for masked 8346 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8347 VPValue *BlockMask = nullptr; 8348 8349 if (OrigLoop->getHeader() == BB) { 8350 if (!CM.blockNeedsPredication(BB)) 8351 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8352 8353 // Create the block in mask as the first non-phi instruction in the block. 8354 VPBuilder::InsertPointGuard Guard(Builder); 8355 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8356 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8357 8358 // Introduce the early-exit compare IV <= BTC to form header block mask. 8359 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8360 // Start by constructing the desired canonical IV. 8361 VPValue *IV = nullptr; 8362 if (Legal->getPrimaryInduction()) 8363 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8364 else { 8365 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8366 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8367 IV = IVRecipe->getVPValue(); 8368 } 8369 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8370 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8371 8372 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8373 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8374 // as a second argument, we only pass the IV here and extract the 8375 // tripcount from the transform state where codegen of the VP instructions 8376 // happen. 8377 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8378 } else { 8379 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8380 } 8381 return BlockMaskCache[BB] = BlockMask; 8382 } 8383 8384 // This is the block mask. We OR all incoming edges. 8385 for (auto *Predecessor : predecessors(BB)) { 8386 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8387 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8388 return BlockMaskCache[BB] = EdgeMask; 8389 8390 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8391 BlockMask = EdgeMask; 8392 continue; 8393 } 8394 8395 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8396 } 8397 8398 return BlockMaskCache[BB] = BlockMask; 8399 } 8400 8401 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8402 ArrayRef<VPValue *> Operands, 8403 VFRange &Range, 8404 VPlanPtr &Plan) { 8405 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8406 "Must be called with either a load or store"); 8407 8408 auto willWiden = [&](ElementCount VF) -> bool { 8409 if (VF.isScalar()) 8410 return false; 8411 LoopVectorizationCostModel::InstWidening Decision = 8412 CM.getWideningDecision(I, VF); 8413 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8414 "CM decision should be taken at this point."); 8415 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8416 return true; 8417 if (CM.isScalarAfterVectorization(I, VF) || 8418 CM.isProfitableToScalarize(I, VF)) 8419 return false; 8420 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8421 }; 8422 8423 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8424 return nullptr; 8425 8426 VPValue *Mask = nullptr; 8427 if (Legal->isMaskRequired(I)) 8428 Mask = createBlockInMask(I->getParent(), Plan); 8429 8430 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8431 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8432 8433 StoreInst *Store = cast<StoreInst>(I); 8434 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8435 Mask); 8436 } 8437 8438 VPWidenIntOrFpInductionRecipe * 8439 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8440 ArrayRef<VPValue *> Operands) const { 8441 // Check if this is an integer or fp induction. If so, build the recipe that 8442 // produces its scalar and vector values. 8443 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8444 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8445 II.getKind() == InductionDescriptor::IK_FpInduction) { 8446 assert(II.getStartValue() == 8447 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8448 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8449 return new VPWidenIntOrFpInductionRecipe( 8450 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8451 } 8452 8453 return nullptr; 8454 } 8455 8456 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8457 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8458 VPlan &Plan) const { 8459 // Optimize the special case where the source is a constant integer 8460 // induction variable. Notice that we can only optimize the 'trunc' case 8461 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8462 // (c) other casts depend on pointer size. 8463 8464 // Determine whether \p K is a truncation based on an induction variable that 8465 // can be optimized. 
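// (Illustrative example, names hypothetical: for a primary induction
//   %iv = phi i64 ...
//   %t  = trunc i64 %iv to i32
// the truncation can be folded into the recipe, producing a widened i32
// induction directly instead of widening the i64 induction and truncating
// each vector lane.)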
8466 auto isOptimizableIVTruncate = 8467 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8468 return [=](ElementCount VF) -> bool { 8469 return CM.isOptimizableIVTruncate(K, VF); 8470 }; 8471 }; 8472 8473 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8474 isOptimizableIVTruncate(I), Range)) { 8475 8476 InductionDescriptor II = 8477 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8478 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8479 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8480 Start, nullptr, I); 8481 } 8482 return nullptr; 8483 } 8484 8485 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8486 ArrayRef<VPValue *> Operands, 8487 VPlanPtr &Plan) { 8488 // If all incoming values are equal, the incoming VPValue can be used directly 8489 // instead of creating a new VPBlendRecipe. 8490 VPValue *FirstIncoming = Operands[0]; 8491 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8492 return FirstIncoming == Inc; 8493 })) { 8494 return Operands[0]; 8495 } 8496 8497 // We know that all PHIs in non-header blocks are converted into selects, so 8498 // we don't have to worry about the insertion order and we can just use the 8499 // builder. At this point we generate the predication tree. There may be 8500 // duplications since this is a simple recursive scan, but future 8501 // optimizations will clean it up. 8502 SmallVector<VPValue *, 2> OperandsWithMask; 8503 unsigned NumIncoming = Phi->getNumIncomingValues(); 8504 8505 for (unsigned In = 0; In < NumIncoming; In++) { 8506 VPValue *EdgeMask = 8507 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8508 assert((EdgeMask || NumIncoming == 1) && 8509 "Multiple predecessors with one having a full mask"); 8510 OperandsWithMask.push_back(Operands[In]); 8511 if (EdgeMask) 8512 OperandsWithMask.push_back(EdgeMask); 8513 } 8514 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8515 } 8516 8517 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8518 ArrayRef<VPValue *> Operands, 8519 VFRange &Range) const { 8520 8521 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8522 [this, CI](ElementCount VF) { 8523 return CM.isScalarWithPredication(CI, VF); 8524 }, 8525 Range); 8526 8527 if (IsPredicated) 8528 return nullptr; 8529 8530 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8531 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8532 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8533 ID == Intrinsic::pseudoprobe || 8534 ID == Intrinsic::experimental_noalias_scope_decl)) 8535 return nullptr; 8536 8537 auto willWiden = [&](ElementCount VF) -> bool { 8538 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8539 // The following case may be scalarized depending on the VF. 8540 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8541 // version of the instruction. 8542 // Is it beneficial to perform intrinsic call compared to lib call? 8543 bool NeedToScalarize = false; 8544 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8545 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8546 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8547 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 8548 "Either the intrinsic cost or vector call cost must be valid"); 8549 return UseVectorIntrinsic || !NeedToScalarize; 8550 }; 8551 8552 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8553 return nullptr; 8554 8555 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8556 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8557 } 8558 8559 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8560 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8561 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8562 // Instruction should be widened, unless it is scalar after vectorization, 8563 // scalarization is profitable or it is predicated. 8564 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8565 return CM.isScalarAfterVectorization(I, VF) || 8566 CM.isProfitableToScalarize(I, VF) || 8567 CM.isScalarWithPredication(I, VF); 8568 }; 8569 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8570 Range); 8571 } 8572 8573 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8574 ArrayRef<VPValue *> Operands) const { 8575 auto IsVectorizableOpcode = [](unsigned Opcode) { 8576 switch (Opcode) { 8577 case Instruction::Add: 8578 case Instruction::And: 8579 case Instruction::AShr: 8580 case Instruction::BitCast: 8581 case Instruction::FAdd: 8582 case Instruction::FCmp: 8583 case Instruction::FDiv: 8584 case Instruction::FMul: 8585 case Instruction::FNeg: 8586 case Instruction::FPExt: 8587 case Instruction::FPToSI: 8588 case Instruction::FPToUI: 8589 case Instruction::FPTrunc: 8590 case Instruction::FRem: 8591 case Instruction::FSub: 8592 case Instruction::ICmp: 8593 case Instruction::IntToPtr: 8594 case Instruction::LShr: 8595 case Instruction::Mul: 8596 case Instruction::Or: 8597 case Instruction::PtrToInt: 8598 case Instruction::SDiv: 8599 case Instruction::Select: 8600 case Instruction::SExt: 8601 case Instruction::Shl: 8602 case Instruction::SIToFP: 8603 case Instruction::SRem: 8604 case Instruction::Sub: 8605 case Instruction::Trunc: 8606 case Instruction::UDiv: 8607 case Instruction::UIToFP: 8608 case Instruction::URem: 8609 case Instruction::Xor: 8610 case Instruction::ZExt: 8611 return true; 8612 } 8613 return false; 8614 }; 8615 8616 if (!IsVectorizableOpcode(I->getOpcode())) 8617 return nullptr; 8618 8619 // Success: widen this instruction. 8620 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8621 } 8622 8623 VPBasicBlock *VPRecipeBuilder::handleReplication( 8624 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8625 VPlanPtr &Plan) { 8626 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8627 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8628 Range); 8629 8630 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8631 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8632 Range); 8633 8634 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8635 IsUniform, IsPredicated); 8636 setRecipe(I, Recipe); 8637 Plan->addVPValue(I, Recipe); 8638 8639 // Find if I uses a predicated instruction. If so, it will use its scalar 8640 // value. 
Avoid hoisting the insert-element which packs the scalar value into 8641 // a vector value, as that happens iff all users use the vector value. 8642 for (VPValue *Op : Recipe->operands()) { 8643 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8644 if (!PredR) 8645 continue; 8646 auto *RepR = 8647 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8648 assert(RepR->isPredicated() && 8649 "expected Replicate recipe to be predicated"); 8650 RepR->setAlsoPack(false); 8651 } 8652 8653 // Finalize the recipe for Instr, first if it is not predicated. 8654 if (!IsPredicated) { 8655 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8656 VPBB->appendRecipe(Recipe); 8657 return VPBB; 8658 } 8659 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8660 assert(VPBB->getSuccessors().empty() && 8661 "VPBB has successors when handling predicated replication."); 8662 // Record predicated instructions for above packing optimizations. 8663 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8664 VPBlockUtils::insertBlockAfter(Region, VPBB); 8665 auto *RegSucc = new VPBasicBlock(); 8666 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8667 return RegSucc; 8668 } 8669 8670 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8671 VPRecipeBase *PredRecipe, 8672 VPlanPtr &Plan) { 8673 // Instructions marked for predication are replicated and placed under an 8674 // if-then construct to prevent side-effects. 8675 8676 // Generate recipes to compute the block mask for this region. 8677 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8678 8679 // Build the triangular if-then region. 8680 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8681 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8682 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8683 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8684 auto *PHIRecipe = Instr->getType()->isVoidTy() 8685 ? nullptr 8686 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8687 if (PHIRecipe) { 8688 Plan->removeVPValueFor(Instr); 8689 Plan->addVPValue(Instr, PHIRecipe); 8690 } 8691 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8692 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8693 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8694 8695 // Note: first set Entry as region entry and then connect successors starting 8696 // from it in order, to propagate the "parent" of each VPBasicBlock. 8697 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8698 VPBlockUtils::connectBlocks(Pred, Exit); 8699 8700 return Region; 8701 } 8702 8703 VPRecipeOrVPValueTy 8704 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8705 ArrayRef<VPValue *> Operands, 8706 VFRange &Range, VPlanPtr &Plan) { 8707 // First, check for specific widening recipes that deal with calls, memory 8708 // operations, inductions and Phi nodes. 
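// (Overview of the dispatch order below: calls, then loads/stores, then phis
// (blends for non-header phis; induction, reduction or plain widened phis for
// header phis), then truncates of inductions, and finally the generic
// widen-vs-replicate decision.)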
8709 if (auto *CI = dyn_cast<CallInst>(Instr))
8710 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8711
8712 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8713 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8714
8715 VPRecipeBase *Recipe;
8716 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8717 if (Phi->getParent() != OrigLoop->getHeader())
8718 return tryToBlend(Phi, Operands, Plan);
8719 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
8720 return toVPRecipeResult(Recipe);
8721
8722 if (Legal->isReductionVariable(Phi)) {
8723 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8724 assert(RdxDesc.getRecurrenceStartValue() ==
8725 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8726 VPValue *StartV = Operands[0];
8727 return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
8728 }
8729
8730 return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
8731 }
8732
8733 if (isa<TruncInst>(Instr) &&
8734 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8735 Range, *Plan)))
8736 return toVPRecipeResult(Recipe);
8737
8738 if (!shouldWiden(Instr, Range))
8739 return nullptr;
8740
8741 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8742 return toVPRecipeResult(new VPWidenGEPRecipe(
8743 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8744
8745 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8746 bool InvariantCond =
8747 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8748 return toVPRecipeResult(new VPWidenSelectRecipe(
8749 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8750 }
8751
8752 return toVPRecipeResult(tryToWiden(Instr, Operands));
8753 }
8754
8755 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8756 ElementCount MaxVF) {
8757 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8758
8759 // Collect instructions from the original loop that will become trivially dead
8760 // in the vectorized loop. We don't need to vectorize these instructions. For
8761 // example, original induction update instructions can become dead because we
8762 // separately emit induction "steps" when generating code for the new loop.
8763 // Similarly, we create a new latch condition when setting up the structure
8764 // of the new loop, so the old one can become dead.
8765 SmallPtrSet<Instruction *, 4> DeadInstructions;
8766 collectTriviallyDeadInstructions(DeadInstructions);
8767
8768 // Add assume instructions we need to drop to DeadInstructions, to prevent
8769 // them from being added to the VPlan.
8770 // TODO: We only need to drop assumes in blocks that get flattened. If the
8771 // control flow is preserved, we should keep them.
8772 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8773 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8774
8775 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8776 // Dead instructions do not need sinking. Remove them from SinkAfter.
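// (Note, sketched from how Legality populates the map: SinkAfter entries
// typically come from first-order recurrences, where a use of the recurrence
// phi has to be sunk after the instruction that defines the recurrence's next
// value.)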
8777 for (Instruction *I : DeadInstructions) 8778 SinkAfter.erase(I); 8779 8780 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8781 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8782 VFRange SubRange = {VF, MaxVFPlusOne}; 8783 VPlans.push_back( 8784 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8785 VF = SubRange.End; 8786 } 8787 } 8788 8789 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8790 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8791 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 8792 8793 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8794 8795 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8796 8797 // --------------------------------------------------------------------------- 8798 // Pre-construction: record ingredients whose recipes we'll need to further 8799 // process after constructing the initial VPlan. 8800 // --------------------------------------------------------------------------- 8801 8802 // Mark instructions we'll need to sink later and their targets as 8803 // ingredients whose recipe we'll need to record. 8804 for (auto &Entry : SinkAfter) { 8805 RecipeBuilder.recordRecipeOf(Entry.first); 8806 RecipeBuilder.recordRecipeOf(Entry.second); 8807 } 8808 for (auto &Reduction : CM.getInLoopReductionChains()) { 8809 PHINode *Phi = Reduction.first; 8810 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 8811 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8812 8813 RecipeBuilder.recordRecipeOf(Phi); 8814 for (auto &R : ReductionOperations) { 8815 RecipeBuilder.recordRecipeOf(R); 8816 // For min/max reducitons, where we have a pair of icmp/select, we also 8817 // need to record the ICmp recipe, so it can be removed later. 8818 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8819 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8820 } 8821 } 8822 8823 // For each interleave group which is relevant for this (possibly trimmed) 8824 // Range, add it to the set of groups to be later applied to the VPlan and add 8825 // placeholders for its members' Recipes which we'll be replacing with a 8826 // single VPInterleaveRecipe. 8827 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8828 auto applyIG = [IG, this](ElementCount VF) -> bool { 8829 return (VF.isVector() && // Query is illegal for VF == 1 8830 CM.getWideningDecision(IG->getInsertPos(), VF) == 8831 LoopVectorizationCostModel::CM_Interleave); 8832 }; 8833 if (!getDecisionAndClampRange(applyIG, Range)) 8834 continue; 8835 InterleaveGroups.insert(IG); 8836 for (unsigned i = 0; i < IG->getFactor(); i++) 8837 if (Instruction *Member = IG->getMember(i)) 8838 RecipeBuilder.recordRecipeOf(Member); 8839 }; 8840 8841 // --------------------------------------------------------------------------- 8842 // Build initial VPlan: Scan the body of the loop in a topological order to 8843 // visit each basic block after having visited its predecessor basic blocks. 8844 // --------------------------------------------------------------------------- 8845 8846 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 8847 auto Plan = std::make_unique<VPlan>(); 8848 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 8849 Plan->setEntry(VPBB); 8850 8851 // Scan the body of the loop in a topological order to visit each basic block 8852 // after having visited its predecessor basic blocks. 
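// (Illustration: for a body shaped header -> if.then -> latch, the reverse
// post-order below visits header, if.then, latch, so recipes for an
// instruction's operands defined in earlier blocks already exist when the
// instruction itself is reached.)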
8853 LoopBlocksDFS DFS(OrigLoop); 8854 DFS.perform(LI); 8855 8856 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8857 // Relevant instructions from basic block BB will be grouped into VPRecipe 8858 // ingredients and fill a new VPBasicBlock. 8859 unsigned VPBBsForBB = 0; 8860 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 8861 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 8862 VPBB = FirstVPBBForBB; 8863 Builder.setInsertPoint(VPBB); 8864 8865 // Introduce each ingredient into VPlan. 8866 // TODO: Model and preserve debug instrinsics in VPlan. 8867 for (Instruction &I : BB->instructionsWithoutDebug()) { 8868 Instruction *Instr = &I; 8869 8870 // First filter out irrelevant instructions, to ensure no recipes are 8871 // built for them. 8872 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8873 continue; 8874 8875 SmallVector<VPValue *, 4> Operands; 8876 auto *Phi = dyn_cast<PHINode>(Instr); 8877 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8878 Operands.push_back(Plan->getOrAddVPValue( 8879 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8880 } else { 8881 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8882 Operands = {OpRange.begin(), OpRange.end()}; 8883 } 8884 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8885 Instr, Operands, Range, Plan)) { 8886 // If Instr can be simplified to an existing VPValue, use it. 8887 if (RecipeOrValue.is<VPValue *>()) { 8888 Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>()); 8889 continue; 8890 } 8891 // Otherwise, add the new recipe. 8892 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8893 for (auto *Def : Recipe->definedValues()) { 8894 auto *UV = Def->getUnderlyingValue(); 8895 Plan->addVPValue(UV, Def); 8896 } 8897 8898 RecipeBuilder.setRecipe(Instr, Recipe); 8899 VPBB->appendRecipe(Recipe); 8900 continue; 8901 } 8902 8903 // Otherwise, if all widening options failed, Instruction is to be 8904 // replicated. This may create a successor for VPBB. 8905 VPBasicBlock *NextVPBB = 8906 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8907 if (NextVPBB != VPBB) { 8908 VPBB = NextVPBB; 8909 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8910 : ""); 8911 } 8912 } 8913 } 8914 8915 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 8916 // may also be empty, such as the last one VPBB, reflecting original 8917 // basic-blocks with no recipes. 8918 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 8919 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 8920 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 8921 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 8922 delete PreEntry; 8923 8924 // --------------------------------------------------------------------------- 8925 // Transform initial VPlan: Apply previously taken decisions, in order, to 8926 // bring the VPlan to its final state. 8927 // --------------------------------------------------------------------------- 8928 8929 // Apply Sink-After legal constraints. 8930 for (auto &Entry : SinkAfter) { 8931 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8932 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8933 // If the target is in a replication region, make sure to move Sink to the 8934 // block after it, not into the replication region itself. 
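// (Rationale, sketched: a replicate region only executes under its mask, so
// moving Sink into it would make Sink conditional; placing Sink in the
// region's single successor keeps it unconditional while still satisfying the
// sink-after ordering.)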
8935 if (auto *Region = 8936 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8937 if (Region->isReplicator()) { 8938 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8939 VPBasicBlock *NextBlock = 8940 cast<VPBasicBlock>(Region->getSuccessors().front()); 8941 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8942 continue; 8943 } 8944 } 8945 Sink->moveAfter(Target); 8946 } 8947 8948 // Interleave memory: for each Interleave Group we marked earlier as relevant 8949 // for this VPlan, replace the Recipes widening its memory instructions with a 8950 // single VPInterleaveRecipe at its insertion point. 8951 for (auto IG : InterleaveGroups) { 8952 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8953 RecipeBuilder.getRecipe(IG->getInsertPos())); 8954 SmallVector<VPValue *, 4> StoredValues; 8955 for (unsigned i = 0; i < IG->getFactor(); ++i) 8956 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8957 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8958 8959 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8960 Recipe->getMask()); 8961 VPIG->insertBefore(Recipe); 8962 unsigned J = 0; 8963 for (unsigned i = 0; i < IG->getFactor(); ++i) 8964 if (Instruction *Member = IG->getMember(i)) { 8965 if (!Member->getType()->isVoidTy()) { 8966 VPValue *OriginalV = Plan->getVPValue(Member); 8967 Plan->removeVPValueFor(Member); 8968 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8969 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8970 J++; 8971 } 8972 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8973 } 8974 } 8975 8976 // Adjust the recipes for any inloop reductions. 8977 if (Range.Start.isVector()) 8978 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8979 8980 // Finally, if tail is folded by masking, introduce selects between the phi 8981 // and the live-out instruction of each reduction, at the end of the latch. 8982 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8983 Builder.setInsertPoint(VPBB); 8984 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8985 for (auto &Reduction : Legal->getReductionVars()) { 8986 if (CM.isInLoopReduction(Reduction.first)) 8987 continue; 8988 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8989 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8990 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8991 } 8992 } 8993 8994 std::string PlanName; 8995 raw_string_ostream RSO(PlanName); 8996 ElementCount VF = Range.Start; 8997 Plan->addVF(VF); 8998 RSO << "Initial VPlan for VF={" << VF; 8999 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9000 Plan->addVF(VF); 9001 RSO << "," << VF; 9002 } 9003 RSO << "},UF>=1"; 9004 RSO.flush(); 9005 Plan->setName(PlanName); 9006 9007 return Plan; 9008 } 9009 9010 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9011 // Outer loop handling: They may require CFG and instruction level 9012 // transformations before even evaluating whether vectorization is profitable. 9013 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9014 // the vectorization pipeline. 
9015 assert(!OrigLoop->isInnermost());
9016 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9017
9018 // Create new empty VPlan
9019 auto Plan = std::make_unique<VPlan>();
9020
9021 // Build hierarchical CFG
9022 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9023 HCFGBuilder.buildHierarchicalCFG();
9024
9025 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9026 VF *= 2)
9027 Plan->addVF(VF);
9028
9029 if (EnableVPlanPredication) {
9030 VPlanPredicator VPP(*Plan);
9031 VPP.predicate();
9032
9033 // Avoid running the transformation to recipes until masked code generation
9034 // in the VPlan-native path is in place.
9035 return Plan;
9036 }
9037
9038 SmallPtrSet<Instruction *, 1> DeadInstructions;
9039 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9040 Legal->getInductionVars(),
9041 DeadInstructions, *PSE.getSE());
9042 return Plan;
9043 }
9044
9045 // Adjust the recipes for any inloop reductions. The chain of instructions
9046 // leading from the loop exit instr to the phi needs to be converted to
9047 // reductions, with one operand being vector and the other being the scalar
9048 // reduction chain.
9049 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
9050 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
9051 for (auto &Reduction : CM.getInLoopReductionChains()) {
9052 PHINode *Phi = Reduction.first;
9053 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9054 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9055
9056 // ReductionOperations are ordered top-down from the phi's use to the
9057 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9058 // which of the two operands will remain scalar and which will be reduced.
9059 // For minmax the chain will be the select instructions.
9060 Instruction *Chain = Phi;
9061 for (Instruction *R : ReductionOperations) {
9062 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9063 RecurKind Kind = RdxDesc.getRecurrenceKind();
9064
9065 VPValue *ChainOp = Plan->getVPValue(Chain);
9066 unsigned FirstOpId;
9067 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9068 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9069 "Expected to replace a VPWidenSelectSC");
9070 FirstOpId = 1;
9071 } else {
9072 assert(isa<VPWidenRecipe>(WidenRecipe) &&
9073 "Expected to replace a VPWidenSC");
9074 FirstOpId = 0;
9075 }
9076 unsigned VecOpId =
9077 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9078 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9079
9080 auto *CondOp = CM.foldTailByMasking()
9081 ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9082 : nullptr; 9083 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9084 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9085 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9086 Plan->removeVPValueFor(R); 9087 Plan->addVPValue(R, RedRecipe); 9088 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9089 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9090 WidenRecipe->eraseFromParent(); 9091 9092 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9093 VPRecipeBase *CompareRecipe = 9094 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9095 assert(isa<VPWidenRecipe>(CompareRecipe) && 9096 "Expected to replace a VPWidenSC"); 9097 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9098 "Expected no remaining users"); 9099 CompareRecipe->eraseFromParent(); 9100 } 9101 Chain = R; 9102 } 9103 } 9104 } 9105 9106 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9107 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9108 VPSlotTracker &SlotTracker) const { 9109 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9110 IG->getInsertPos()->printAsOperand(O, false); 9111 O << ", "; 9112 getAddr()->printAsOperand(O, SlotTracker); 9113 VPValue *Mask = getMask(); 9114 if (Mask) { 9115 O << ", "; 9116 Mask->printAsOperand(O, SlotTracker); 9117 } 9118 for (unsigned i = 0; i < IG->getFactor(); ++i) 9119 if (Instruction *I = IG->getMember(i)) 9120 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9121 } 9122 #endif 9123 9124 void VPWidenCallRecipe::execute(VPTransformState &State) { 9125 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9126 *this, State); 9127 } 9128 9129 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9130 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9131 this, *this, InvariantCond, State); 9132 } 9133 9134 void VPWidenRecipe::execute(VPTransformState &State) { 9135 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9136 } 9137 9138 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9139 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9140 *this, State.UF, State.VF, IsPtrLoopInvariant, 9141 IsIndexLoopInvariant, State); 9142 } 9143 9144 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9145 assert(!State.Instance && "Int or FP induction being replicated."); 9146 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9147 getTruncInst(), getVPValue(0), 9148 getCastValue(), State); 9149 } 9150 9151 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9152 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9153 getStartValue(), this, State); 9154 } 9155 9156 void VPBlendRecipe::execute(VPTransformState &State) { 9157 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9158 // We know that all PHIs in non-header blocks are converted into 9159 // selects, so we don't have to worry about the insertion order and we 9160 // can just use the builder. 9161 // At this point we generate the predication tree. There may be 9162 // duplications since this is a simple recursive scan, but future 9163 // optimizations will clean it up. 
9164 9165 unsigned NumIncoming = getNumIncomingValues(); 9166 9167 // Generate a sequence of selects of the form: 9168 // SELECT(Mask3, In3, 9169 // SELECT(Mask2, In2, 9170 // SELECT(Mask1, In1, 9171 // In0))) 9172 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9173 // are essentially undef are taken from In0. 9174 InnerLoopVectorizer::VectorParts Entry(State.UF); 9175 for (unsigned In = 0; In < NumIncoming; ++In) { 9176 for (unsigned Part = 0; Part < State.UF; ++Part) { 9177 // We might have single edge PHIs (blocks) - use an identity 9178 // 'select' for the first PHI operand. 9179 Value *In0 = State.get(getIncomingValue(In), Part); 9180 if (In == 0) 9181 Entry[Part] = In0; // Initialize with the first incoming value. 9182 else { 9183 // Select between the current value and the previous incoming edge 9184 // based on the incoming mask. 9185 Value *Cond = State.get(getMask(In), Part); 9186 Entry[Part] = 9187 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9188 } 9189 } 9190 } 9191 for (unsigned Part = 0; Part < State.UF; ++Part) 9192 State.set(this, Entry[Part], Part); 9193 } 9194 9195 void VPInterleaveRecipe::execute(VPTransformState &State) { 9196 assert(!State.Instance && "Interleave group being replicated."); 9197 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9198 getStoredValues(), getMask()); 9199 } 9200 9201 void VPReductionRecipe::execute(VPTransformState &State) { 9202 assert(!State.Instance && "Reduction being replicated."); 9203 for (unsigned Part = 0; Part < State.UF; ++Part) { 9204 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9205 Value *NewVecOp = State.get(getVecOp(), Part); 9206 if (VPValue *Cond = getCondOp()) { 9207 Value *NewCond = State.get(Cond, Part); 9208 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9209 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9210 Kind, VecTy->getElementType()); 9211 Constant *IdenVec = 9212 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9213 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9214 NewVecOp = Select; 9215 } 9216 Value *NewRed = 9217 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9218 Value *PrevInChain = State.get(getChainOp(), Part); 9219 Value *NextInChain; 9220 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9221 NextInChain = 9222 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9223 NewRed, PrevInChain); 9224 } else { 9225 NextInChain = State.Builder.CreateBinOp( 9226 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9227 PrevInChain); 9228 } 9229 State.set(this, NextInChain, Part); 9230 } 9231 } 9232 9233 void VPReplicateRecipe::execute(VPTransformState &State) { 9234 if (State.Instance) { // Generate a single instance. 9235 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9236 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9237 *State.Instance, IsPredicated, State); 9238 // Insert scalar instance packing it into a vector. 9239 if (AlsoPack && State.VF.isVector()) { 9240 // If we're constructing lane 0, initialize to start from poison. 
9241 if (State.Instance->Lane.isFirstLane()) {
9242 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9243 Value *Poison = PoisonValue::get(
9244 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9245 State.set(this, Poison, State.Instance->Part);
9246 }
9247 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9248 }
9249 return;
9250 }
9251
9252 // Generate scalar instances for all VF lanes of all UF parts, unless the
9253 // instruction is uniform, in which case generate only the first lane for each
9254 // of the UF parts.
9255 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9256 assert((!State.VF.isScalable() || IsUniform) &&
9257 "Can't scalarize a scalable vector");
9258 for (unsigned Part = 0; Part < State.UF; ++Part)
9259 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9260 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9261 VPIteration(Part, Lane), IsPredicated,
9262 State);
9263 }
9264
9265 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9266 assert(State.Instance && "Branch on Mask works only on single instance.");
9267
9268 unsigned Part = State.Instance->Part;
9269 unsigned Lane = State.Instance->Lane.getKnownLane();
9270
9271 Value *ConditionBit = nullptr;
9272 VPValue *BlockInMask = getMask();
9273 if (BlockInMask) {
9274 ConditionBit = State.get(BlockInMask, Part);
9275 if (ConditionBit->getType()->isVectorTy())
9276 ConditionBit = State.Builder.CreateExtractElement(
9277 ConditionBit, State.Builder.getInt32(Lane));
9278 } else // Block in mask is all-one.
9279 ConditionBit = State.Builder.getTrue();
9280
9281 // Replace the temporary unreachable terminator with a new conditional branch,
9282 // whose two destinations will be set later when they are created.
9283 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9284 assert(isa<UnreachableInst>(CurrentTerminator) &&
9285 "Expected to replace unreachable terminator with conditional branch.");
9286 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9287 CondBr->setSuccessor(0, nullptr);
9288 ReplaceInstWithInst(CurrentTerminator, CondBr);
9289 }
9290
9291 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9292 assert(State.Instance && "Predicated instruction PHI works per instance.");
9293 Instruction *ScalarPredInst =
9294 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9295 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9296 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9297 assert(PredicatingBB && "Predicated block has no single predecessor.");
9298 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9299 "operand must be VPReplicateRecipe");
9300
9301 // By current pack/unpack logic we need to generate only a single phi node: if
9302 // a vector value for the predicated instruction exists at this point it means
9303 // the instruction has vector users only, and a phi for the vector value is
9304 // needed. In this case the recipe of the predicated instruction is marked to
9305 // also do that packing, thereby "hoisting" the insert-element sequence.
9306 // Otherwise, a phi node for the scalar value is needed.
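// (Illustrative IR for the two cases, value names hypothetical:
//  vector case:  %vphi = phi <VF x ty> [ %vec.before.insert, %predicating.bb ],
//                                      [ %vec.with.new.lane, %predicated.bb ]
//  scalar case:  %phi  = phi ty [ poison, %predicating.bb ],
//                               [ %scalar.inst, %predicated.bb ])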
9307 unsigned Part = State.Instance->Part; 9308 if (State.hasVectorValue(getOperand(0), Part)) { 9309 Value *VectorValue = State.get(getOperand(0), Part); 9310 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9311 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9312 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9313 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9314 if (State.hasVectorValue(this, Part)) 9315 State.reset(this, VPhi, Part); 9316 else 9317 State.set(this, VPhi, Part); 9318 // NOTE: Currently we need to update the value of the operand, so the next 9319 // predicated iteration inserts its generated value in the correct vector. 9320 State.reset(getOperand(0), VPhi, Part); 9321 } else { 9322 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9323 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9324 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9325 PredicatingBB); 9326 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9327 if (State.hasScalarValue(this, *State.Instance)) 9328 State.reset(this, Phi, *State.Instance); 9329 else 9330 State.set(this, Phi, *State.Instance); 9331 // NOTE: Currently we need to update the value of the operand, so the next 9332 // predicated iteration inserts its generated value in the correct vector. 9333 State.reset(getOperand(0), Phi, *State.Instance); 9334 } 9335 } 9336 9337 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9338 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9339 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9340 StoredValue ? nullptr : getVPValue(), 9341 getAddr(), StoredValue, getMask()); 9342 } 9343 9344 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9345 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9346 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9347 // for predication. 9348 static ScalarEpilogueLowering getScalarEpilogueLowering( 9349 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9350 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9351 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9352 LoopVectorizationLegality &LVL) { 9353 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9354 // don't look at hints or options, and don't request a scalar epilogue. 9355 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9356 // LoopAccessInfo (due to code dependency and not being able to reliably get 9357 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9358 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9359 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9360 // back to the old way and vectorize with versioning when forced. See D81345.) 
9361 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9362 PGSOQueryType::IRPass) && 9363 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9364 return CM_ScalarEpilogueNotAllowedOptSize; 9365 9366 // 2) If set, obey the directives 9367 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9368 switch (PreferPredicateOverEpilogue) { 9369 case PreferPredicateTy::ScalarEpilogue: 9370 return CM_ScalarEpilogueAllowed; 9371 case PreferPredicateTy::PredicateElseScalarEpilogue: 9372 return CM_ScalarEpilogueNotNeededUsePredicate; 9373 case PreferPredicateTy::PredicateOrDontVectorize: 9374 return CM_ScalarEpilogueNotAllowedUsePredicate; 9375 }; 9376 } 9377 9378 // 3) If set, obey the hints 9379 switch (Hints.getPredicate()) { 9380 case LoopVectorizeHints::FK_Enabled: 9381 return CM_ScalarEpilogueNotNeededUsePredicate; 9382 case LoopVectorizeHints::FK_Disabled: 9383 return CM_ScalarEpilogueAllowed; 9384 }; 9385 9386 // 4) if the TTI hook indicates this is profitable, request predication. 9387 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9388 LVL.getLAI())) 9389 return CM_ScalarEpilogueNotNeededUsePredicate; 9390 9391 return CM_ScalarEpilogueAllowed; 9392 } 9393 9394 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9395 // If Values have been set for this Def return the one relevant for \p Part. 9396 if (hasVectorValue(Def, Part)) 9397 return Data.PerPartOutput[Def][Part]; 9398 9399 if (!hasScalarValue(Def, {Part, 0})) { 9400 Value *IRV = Def->getLiveInIRValue(); 9401 Value *B = ILV->getBroadcastInstrs(IRV); 9402 set(Def, B, Part); 9403 return B; 9404 } 9405 9406 Value *ScalarValue = get(Def, {Part, 0}); 9407 // If we aren't vectorizing, we can just copy the scalar map values over 9408 // to the vector map. 9409 if (VF.isScalar()) { 9410 set(Def, ScalarValue, Part); 9411 return ScalarValue; 9412 } 9413 9414 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9415 bool IsUniform = RepR && RepR->isUniform(); 9416 9417 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9418 // Check if there is a scalar value for the selected lane. 9419 if (!hasScalarValue(Def, {Part, LastLane})) { 9420 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9421 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9422 "unexpected recipe found to be invariant"); 9423 IsUniform = true; 9424 LastLane = 0; 9425 } 9426 9427 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9428 9429 // Set the insert point after the last scalarized instruction. This 9430 // ensures the insertelement sequence will directly follow the scalar 9431 // definitions. 9432 auto OldIP = Builder.saveIP(); 9433 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9434 Builder.SetInsertPoint(&*NewIP); 9435 9436 // However, if we are vectorizing, we need to construct the vector values. 9437 // If the value is known to be uniform after vectorization, we can just 9438 // broadcast the scalar value corresponding to lane zero for each unroll 9439 // iteration. Otherwise, we construct the vector values using 9440 // insertelement instructions. Since the resulting vectors are stored in 9441 // State, we will only generate the insertelements once. 9442 Value *VectorValue = nullptr; 9443 if (IsUniform) { 9444 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9445 set(Def, VectorValue, Part); 9446 } else { 9447 // Initialize packing with insertelements to start from undef. 
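// (Illustration for VF = 4, names hypothetical: starting from a poison
// vector, the packing loop below emits
//   %p0 = insertelement <4 x ty> poison, ty %s0, i32 0
//   %p1 = insertelement <4 x ty> %p0, ty %s1, i32 1
// and so on for the remaining lanes.)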
9448 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9449 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9450 set(Def, Undef, Part); 9451 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9452 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9453 VectorValue = get(Def, Part); 9454 } 9455 Builder.restoreIP(OldIP); 9456 return VectorValue; 9457 } 9458 9459 // Process the loop in the VPlan-native vectorization path. This path builds 9460 // VPlan upfront in the vectorization pipeline, which allows to apply 9461 // VPlan-to-VPlan transformations from the very beginning without modifying the 9462 // input LLVM IR. 9463 static bool processLoopInVPlanNativePath( 9464 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9465 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9466 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9467 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9468 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9469 LoopVectorizationRequirements &Requirements) { 9470 9471 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9472 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9473 return false; 9474 } 9475 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9476 Function *F = L->getHeader()->getParent(); 9477 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9478 9479 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9480 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9481 9482 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9483 &Hints, IAI); 9484 // Use the planner for outer loop vectorization. 9485 // TODO: CM is not used at this point inside the planner. Turn CM into an 9486 // optional argument if we don't need it in the future. 9487 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 9488 Requirements, ORE); 9489 9490 // Get user vectorization factor. 9491 ElementCount UserVF = Hints.getWidth(); 9492 9493 // Plan how to best vectorize, return the best VF and its cost. 9494 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9495 9496 // If we are stress testing VPlan builds, do not attempt to generate vector 9497 // code. Masked vector code generation support will follow soon. 9498 // Also, do not attempt to vectorize if no vector code will be produced. 9499 if (VPlanBuildStressTest || EnableVPlanPredication || 9500 VectorizationFactor::Disabled() == VF) 9501 return false; 9502 9503 LVP.setBestPlan(VF.Width, 1); 9504 9505 { 9506 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9507 F->getParent()->getDataLayout()); 9508 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9509 &CM, BFI, PSI, Checks); 9510 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9511 << L->getHeader()->getParent()->getName() << "\"\n"); 9512 LVP.executePlan(LB, DT); 9513 } 9514 9515 // Mark the loop as already vectorized to avoid vectorizing again. 9516 Hints.setAlreadyVectorized(); 9517 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9518 return true; 9519 } 9520 9521 // Emit a remark if there are stores to floats that required a floating point 9522 // extension. If the vectorized loop was generated with floating point there 9523 // will be a performance penalty from the conversion overhead and the change in 9524 // the vector width. 
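// (Hypothetical source pattern that triggers the remark below:
//   float *A, *B;  B[i] = A[i] * 1.0;   // 1.0 is a double constant
// The fpext of A[i] to double forces <N x double> arithmetic, halving the
// effective vector width and adding fpext/fptrunc conversions around the
// float loads and stores.)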
9525 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9526 SmallVector<Instruction *, 4> Worklist; 9527 for (BasicBlock *BB : L->getBlocks()) { 9528 for (Instruction &Inst : *BB) { 9529 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9530 if (S->getValueOperand()->getType()->isFloatTy()) 9531 Worklist.push_back(S); 9532 } 9533 } 9534 } 9535 9536 // Traverse the floating point stores upwards searching, for floating point 9537 // conversions. 9538 SmallPtrSet<const Instruction *, 4> Visited; 9539 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9540 while (!Worklist.empty()) { 9541 auto *I = Worklist.pop_back_val(); 9542 if (!L->contains(I)) 9543 continue; 9544 if (!Visited.insert(I).second) 9545 continue; 9546 9547 // Emit a remark if the floating point store required a floating 9548 // point conversion. 9549 // TODO: More work could be done to identify the root cause such as a 9550 // constant or a function return type and point the user to it. 9551 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9552 ORE->emit([&]() { 9553 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9554 I->getDebugLoc(), L->getHeader()) 9555 << "floating point conversion changes vector width. " 9556 << "Mixed floating point precision requires an up/down " 9557 << "cast that will negatively impact performance."; 9558 }); 9559 9560 for (Use &Op : I->operands()) 9561 if (auto *OpI = dyn_cast<Instruction>(Op)) 9562 Worklist.push_back(OpI); 9563 } 9564 } 9565 9566 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9567 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9568 !EnableLoopInterleaving), 9569 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9570 !EnableLoopVectorization) {} 9571 9572 bool LoopVectorizePass::processLoop(Loop *L) { 9573 assert((EnableVPlanNativePath || L->isInnermost()) && 9574 "VPlan-native path is not enabled. Only process inner loops."); 9575 9576 #ifndef NDEBUG 9577 const std::string DebugLocStr = getDebugLocString(L); 9578 #endif /* NDEBUG */ 9579 9580 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9581 << L->getHeader()->getParent()->getName() << "\" from " 9582 << DebugLocStr << "\n"); 9583 9584 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9585 9586 LLVM_DEBUG( 9587 dbgs() << "LV: Loop hints:" 9588 << " force=" 9589 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9590 ? "disabled" 9591 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9592 ? "enabled" 9593 : "?")) 9594 << " width=" << Hints.getWidth() 9595 << " unroll=" << Hints.getInterleave() << "\n"); 9596 9597 // Function containing loop 9598 Function *F = L->getHeader()->getParent(); 9599 9600 // Looking at the diagnostic output is the only way to determine if a loop 9601 // was vectorized (other than looking at the IR or machine code), so it 9602 // is important to generate an optimization remark for each loop. Most of 9603 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9604 // generated as OptimizationRemark and OptimizationRemarkMissed are 9605 // less verbose reporting vectorized loops and unvectorized loops that may 9606 // benefit from vectorization, respectively. 9607 9608 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9609 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9610 return false; 9611 } 9612 9613 PredicatedScalarEvolution PSE(*SE, *L); 9614 9615 // Check if it is legal to vectorize the loop. 
9616 LoopVectorizationRequirements Requirements; 9617 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9618 &Requirements, &Hints, DB, AC, BFI, PSI); 9619 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9620 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9621 Hints.emitRemarkWithHints(); 9622 return false; 9623 } 9624 9625 // Check the function attributes and profiles to find out if this function 9626 // should be optimized for size. 9627 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9628 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9629 9630 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9631 // here. They may require CFG and instruction level transformations before 9632 // even evaluating whether vectorization is profitable. Since we cannot modify 9633 // the incoming IR, we need to build VPlan upfront in the vectorization 9634 // pipeline. 9635 if (!L->isInnermost()) 9636 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9637 ORE, BFI, PSI, Hints, Requirements); 9638 9639 assert(L->isInnermost() && "Inner loop expected."); 9640 9641 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9642 // count by optimizing for size, to minimize overheads. 9643 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9644 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9645 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9646 << "This loop is worth vectorizing only if no scalar " 9647 << "iteration overheads are incurred."); 9648 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9649 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9650 else { 9651 LLVM_DEBUG(dbgs() << "\n"); 9652 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9653 } 9654 } 9655 9656 // Check the function attributes to see if implicit floats are allowed. 9657 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9658 // an integer loop and the vector instructions selected are purely integer 9659 // vector instructions? 9660 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9661 reportVectorizationFailure( 9662 "Can't vectorize when the NoImplicitFloat attribute is used", 9663 "loop not vectorized due to NoImplicitFloat attribute", 9664 "NoImplicitFloat", ORE, L); 9665 Hints.emitRemarkWithHints(); 9666 return false; 9667 } 9668 9669 // Check if the target supports potentially unsafe FP vectorization. 9670 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9671 // for the target we're vectorizing for, to make sure none of the 9672 // additional fp-math flags can help. 
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!Requirements.canVectorizeFPMath(Hints)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
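      //
      // Illustrative note (not part of the original source): in this mode the
      // loop body stays scalar, but IC consecutive iterations are emitted per
      // trip through the loop (vectorization factor 1, unroll factor IC),
      // which can expose additional instruction-level parallelism without
      // widening any values.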
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is both legal and profitable to vectorize the
      // loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // The second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
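    //
    // Illustrative example (not part of the original source): this is done by
    // attaching loop metadata roughly of the form
    //
    //   !llvm.loop !0
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.isvectorized", i32 1}
    //
    // so that later runs of the vectorizer skip this loop.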
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/DominatorTree analyses with outer
  // loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
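
// Usage sketch (not part of the original source): the new pass manager entry
// point above can be exercised directly with, for example,
//
//   opt -passes=loop-vectorize -S input.ll
//
// and is also run as part of the standard -O2/-O3 optimization pipelines.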