//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC - two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and it lists the available
// options. I.e., the vectorizer will try to fold the tail-loop (epilogue)
// into the vector body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. "
             "Mostly useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

// FIXME: When loop hints are passed which allow reordering of FP operations,
// we still choose to use strict reductions with this flag. We should instead
// use the default behaviour of vectorizing with unordered reductions if
// reordering is allowed.
cl::opt<bool> EnableStrictReductions(
    "enable-strict-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis.
// Currently we assume divergent non-backedge branches when this switch is
// true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
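
// For example: if SCEV knows the exact trip count (say 7), that value is
// returned directly; failing that, a profile-based estimate from branch
// weights (say ~100 iterations) is used; failing that, SCEV's small constant
// maximum trip count serves as an upper-bound estimate, and None is returned
// only when all three sources are unavailable. (The numbers here are purely
// illustrative.)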

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step.
  /// If \p EntryVal is a truncate instruction, instead of widening the
  /// original IV, we widen a version of the IV truncated to \p EntryVal's
  /// type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};
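
/// InnerLoopUnroller drives the InnerLoopVectorizer machinery with a fixed
/// vectorization factor of one (note the ElementCount::getFixed(1) passed to
/// the base constructor below), so only the unroll factor UF takes effect:
/// each value from the original loop is represented by UF scalar copies
/// rather than by wide vectors.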
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is a debug location attached to the instruction, use it;
    // otherwise fall back to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
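/// For example, with Step = 2 and a fixed VF of 4 this returns the constant 8,
/// while for a scalable VF with a known minimum of 4 it returns 8 * vscale
/// (materialized via IRBuilder::CreateVScale).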
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way; the
  /// form of an instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      RecurrenceDescriptor RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1519 bool isScalarWithPredication(Instruction *I) const; 1520 1521 // Returns true if \p I is an instruction that will be predicated either 1522 // through scalar predication or masked load/store or masked gather/scatter. 1523 // Superset of instructions that return true for isScalarWithPredication. 1524 bool isPredicatedInst(Instruction *I) { 1525 if (!blockNeedsPredication(I->getParent())) 1526 return false; 1527 // Loads and stores that need some form of masked operation are predicated 1528 // instructions. 1529 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1530 return Legal->isMaskRequired(I); 1531 return isScalarWithPredication(I); 1532 } 1533 1534 /// Returns true if \p I is a memory instruction with consecutive memory 1535 /// access that can be widened. 1536 bool 1537 memoryInstructionCanBeWidened(Instruction *I, 1538 ElementCount VF = ElementCount::getFixed(1)); 1539 1540 /// Returns true if \p I is a memory instruction in an interleaved-group 1541 /// of memory accesses that can be vectorized with wide vector loads/stores 1542 /// and shuffles. 1543 bool 1544 interleavedAccessCanBeWidened(Instruction *I, 1545 ElementCount VF = ElementCount::getFixed(1)); 1546 1547 /// Check if \p Instr belongs to any interleaved access group. 1548 bool isAccessInterleaved(Instruction *Instr) { 1549 return InterleaveInfo.isInterleaved(Instr); 1550 } 1551 1552 /// Get the interleaved access group that \p Instr belongs to. 1553 const InterleaveGroup<Instruction> * 1554 getInterleavedAccessGroup(Instruction *Instr) { 1555 return InterleaveInfo.getInterleaveGroup(Instr); 1556 } 1557 1558 /// Returns true if we're required to use a scalar epilogue for at least 1559 /// the final iteration of the original loop. 1560 bool requiresScalarEpilogue() const { 1561 if (!isScalarEpilogueAllowed()) 1562 return false; 1563 // If we might exit from anywhere but the latch, must run the exiting 1564 // iteration in scalar form. 1565 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1566 return true; 1567 return InterleaveInfo.requiresScalarEpilogue(); 1568 } 1569 1570 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1571 /// loop hint annotation. 1572 bool isScalarEpilogueAllowed() const { 1573 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1574 } 1575 1576 /// Returns true if all loop blocks should be masked to fold tail loop. 1577 bool foldTailByMasking() const { return FoldTailByMasking; } 1578 1579 bool blockNeedsPredication(BasicBlock *BB) const { 1580 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1581 } 1582 1583 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1584 /// nodes to the chain of instructions representing the reductions. Uses a 1585 /// MapVector to ensure deterministic iteration order. 1586 using ReductionChainMap = 1587 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1588 1589 /// Return the chain of instructions representing an inloop reduction. 1590 const ReductionChainMap &getInLoopReductionChains() const { 1591 return InLoopReductionChains; 1592 } 1593 1594 /// Returns true if the Phi is part of an inloop reduction. 1595 bool isInLoopReduction(PHINode *Phi) const { 1596 return InLoopReductionChains.count(Phi); 1597 } 1598 1599 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1600 /// with factor VF. Return the cost of the instruction, including 1601 /// scalarization overhead if it's needed. 
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
                                          Type *VectorTy,
                                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for a scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for an interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for a Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors.
The 1744 /// presence of a cost for an instruction in the mapping indicates that the 1745 /// instruction will be scalarized when vectorizing with the associated 1746 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1747 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1748 1749 /// Holds the instructions known to be uniform after vectorization. 1750 /// The data is collected per VF. 1751 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1752 1753 /// Holds the instructions known to be scalar after vectorization. 1754 /// The data is collected per VF. 1755 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1756 1757 /// Holds the instructions (address computations) that are forced to be 1758 /// scalarized. 1759 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1760 1761 /// PHINodes of the reductions that should be expanded in-loop along with 1762 /// their associated chains of reduction operations, in program order from top 1763 /// (PHI) to bottom 1764 ReductionChainMap InLoopReductionChains; 1765 1766 /// A Map of inloop reduction operations and their immediate chain operand. 1767 /// FIXME: This can be removed once reductions can be costed correctly in 1768 /// vplan. This was added to allow quick lookup to the inloop operations, 1769 /// without having to loop through InLoopReductionChains. 1770 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1771 1772 /// Returns the expected difference in cost from scalarizing the expression 1773 /// feeding a predicated instruction \p PredInst. The instructions to 1774 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1775 /// non-negative return value implies the expression will be scalarized. 1776 /// Currently, only single-use chains are considered for scalarization. 1777 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1778 ElementCount VF); 1779 1780 /// Collect the instructions that are uniform after vectorization. An 1781 /// instruction is uniform if we represent it with a single scalar value in 1782 /// the vectorized loop corresponding to each vector iteration. Examples of 1783 /// uniform instructions include pointer operands of consecutive or 1784 /// interleaved memory accesses. Note that although uniformity implies an 1785 /// instruction will be scalar, the reverse is not true. In general, a 1786 /// scalarized instruction will be represented by VF scalar values in the 1787 /// vectorized loop, each corresponding to an iteration of the original 1788 /// scalar loop. 1789 void collectLoopUniforms(ElementCount VF); 1790 1791 /// Collect the instructions that are scalar after vectorization. An 1792 /// instruction is scalar if it is known to be uniform or will be scalarized 1793 /// during vectorization. Non-uniform scalarized instructions will be 1794 /// represented by VF values in the vectorized loop, each corresponding to an 1795 /// iteration of the original scalar loop. 1796 void collectLoopScalars(ElementCount VF); 1797 1798 /// Keeps cost model vectorization decision and cost for instructions. 1799 /// Right now it is used for memory instructions only. 1800 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1801 std::pair<InstWidening, InstructionCost>>; 1802 1803 DecisionList WideningDecisions; 1804 1805 /// Returns true if \p V is expected to be vectorized and it needs to be 1806 /// extracted. 
1807 bool needsExtract(Value *V, ElementCount VF) const { 1808 Instruction *I = dyn_cast<Instruction>(V); 1809 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1810 TheLoop->isLoopInvariant(I)) 1811 return false; 1812 1813 // Assume we can vectorize V (and hence we need extraction) if the 1814 // scalars are not computed yet. This can happen, because it is called 1815 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1816 // the scalars are collected. That should be a safe assumption in most 1817 // cases, because we check if the operands have vectorizable types 1818 // beforehand in LoopVectorizationLegality. 1819 return Scalars.find(VF) == Scalars.end() || 1820 !isScalarAfterVectorization(I, VF); 1821 }; 1822 1823 /// Returns a range containing only operands needing to be extracted. 1824 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1825 ElementCount VF) const { 1826 return SmallVector<Value *, 4>(make_filter_range( 1827 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1828 } 1829 1830 /// Determines if we have the infrastructure to vectorize loop \p L and its 1831 /// epilogue, assuming the main loop is vectorized by \p VF. 1832 bool isCandidateForEpilogueVectorization(const Loop &L, 1833 const ElementCount VF) const; 1834 1835 /// Returns true if epilogue vectorization is considered profitable, and 1836 /// false otherwise. 1837 /// \p VF is the vectorization factor chosen for the original loop. 1838 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1839 1840 public: 1841 /// The loop that we evaluate. 1842 Loop *TheLoop; 1843 1844 /// Predicated scalar evolution analysis. 1845 PredicatedScalarEvolution &PSE; 1846 1847 /// Loop Info analysis. 1848 LoopInfo *LI; 1849 1850 /// Vectorization legality. 1851 LoopVectorizationLegality *Legal; 1852 1853 /// Vector target information. 1854 const TargetTransformInfo &TTI; 1855 1856 /// Target Library Info. 1857 const TargetLibraryInfo *TLI; 1858 1859 /// Demanded bits analysis. 1860 DemandedBits *DB; 1861 1862 /// Assumption cache. 1863 AssumptionCache *AC; 1864 1865 /// Interface to emit optimization remarks. 1866 OptimizationRemarkEmitter *ORE; 1867 1868 const Function *TheFunction; 1869 1870 /// Loop Vectorize Hint. 1871 const LoopVectorizeHints *Hints; 1872 1873 /// The interleave access information contains groups of interleaved accesses 1874 /// with the same stride and close to each other. 1875 InterleavedAccessInfo &InterleaveInfo; 1876 1877 /// Values to ignore in the cost model. 1878 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1879 1880 /// Values to ignore in the cost model when VF > 1. 1881 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1882 1883 /// Profitable vector factors. 1884 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1885 }; 1886 } // end namespace llvm 1887 1888 /// Helper struct to manage generating runtime checks for vectorization. 1889 /// 1890 /// The runtime checks are created up-front in temporary blocks to allow better 1891 /// estimating the cost and un-linked from the existing IR. After deciding to 1892 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1893 /// temporary blocks are completely removed. 1894 class GeneratedRTChecks { 1895 /// Basic block which contains the generated SCEV checks, if any. 1896 BasicBlock *SCEVCheckBlock = nullptr; 1897 1898 /// The value representing the result of the generated SCEV checks. 
If it is 1899 /// nullptr, either no SCEV checks have been generated or they have been used. 1900 Value *SCEVCheckCond = nullptr; 1901 1902 /// Basic block which contains the generated memory runtime checks, if any. 1903 BasicBlock *MemCheckBlock = nullptr; 1904 1905 /// The value representing the result of the generated memory runtime checks. 1906 /// If it is nullptr, either no memory runtime checks have been generated or 1907 /// they have been used. 1908 Instruction *MemRuntimeCheckCond = nullptr; 1909 1910 DominatorTree *DT; 1911 LoopInfo *LI; 1912 1913 SCEVExpander SCEVExp; 1914 SCEVExpander MemCheckExp; 1915 1916 public: 1917 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1918 const DataLayout &DL) 1919 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1920 MemCheckExp(SE, DL, "scev.check") {} 1921 1922 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1923 /// accurately estimate the cost of the runtime checks. The blocks are 1924 /// un-linked from the IR and is added back during vector code generation. If 1925 /// there is no vector code generation, the check blocks are removed 1926 /// completely. 1927 void Create(Loop *L, const LoopAccessInfo &LAI, 1928 const SCEVUnionPredicate &UnionPred) { 1929 1930 BasicBlock *LoopHeader = L->getHeader(); 1931 BasicBlock *Preheader = L->getLoopPreheader(); 1932 1933 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1934 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1935 // may be used by SCEVExpander. The blocks will be un-linked from their 1936 // predecessors and removed from LI & DT at the end of the function. 1937 if (!UnionPred.isAlwaysTrue()) { 1938 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1939 nullptr, "vector.scevcheck"); 1940 1941 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1942 &UnionPred, SCEVCheckBlock->getTerminator()); 1943 } 1944 1945 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1946 if (RtPtrChecking.Need) { 1947 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1948 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1949 "vector.memcheck"); 1950 1951 std::tie(std::ignore, MemRuntimeCheckCond) = 1952 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1953 RtPtrChecking.getChecks(), MemCheckExp); 1954 assert(MemRuntimeCheckCond && 1955 "no RT checks generated although RtPtrChecking " 1956 "claimed checks are required"); 1957 } 1958 1959 if (!MemCheckBlock && !SCEVCheckBlock) 1960 return; 1961 1962 // Unhook the temporary block with the checks, update various places 1963 // accordingly. 
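    // Illustrative sketch (block names taken from the splits above, shape of
    // the CFG assumed for illustration): right after Create() has split the
    // preheader, the temporary layout for a loop needing both kinds of checks
    // is
    //   preheader -> vector.scevcheck -> vector.memcheck -> loop header
    // The code below re-routes all edges back to the preheader, terminates the
    // check blocks with 'unreachable', and drops them from the DominatorTree
    // and LoopInfo, so they survive only as detached blocks whose instructions
    // can still be fed to the cost model.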
1964 if (SCEVCheckBlock) 1965 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1966 if (MemCheckBlock) 1967 MemCheckBlock->replaceAllUsesWith(Preheader); 1968 1969 if (SCEVCheckBlock) { 1970 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1971 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1972 Preheader->getTerminator()->eraseFromParent(); 1973 } 1974 if (MemCheckBlock) { 1975 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1976 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1977 Preheader->getTerminator()->eraseFromParent(); 1978 } 1979 1980 DT->changeImmediateDominator(LoopHeader, Preheader); 1981 if (MemCheckBlock) { 1982 DT->eraseNode(MemCheckBlock); 1983 LI->removeBlock(MemCheckBlock); 1984 } 1985 if (SCEVCheckBlock) { 1986 DT->eraseNode(SCEVCheckBlock); 1987 LI->removeBlock(SCEVCheckBlock); 1988 } 1989 } 1990 1991 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1992 /// unused. 1993 ~GeneratedRTChecks() { 1994 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1995 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1996 if (!SCEVCheckCond) 1997 SCEVCleaner.markResultUsed(); 1998 1999 if (!MemRuntimeCheckCond) 2000 MemCheckCleaner.markResultUsed(); 2001 2002 if (MemRuntimeCheckCond) { 2003 auto &SE = *MemCheckExp.getSE(); 2004 // Memory runtime check generation creates compares that use expanded 2005 // values. Remove them before running the SCEVExpanderCleaners. 2006 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2007 if (MemCheckExp.isInsertedInstruction(&I)) 2008 continue; 2009 SE.forgetValue(&I); 2010 SE.eraseValueFromMap(&I); 2011 I.eraseFromParent(); 2012 } 2013 } 2014 MemCheckCleaner.cleanup(); 2015 SCEVCleaner.cleanup(); 2016 2017 if (SCEVCheckCond) 2018 SCEVCheckBlock->eraseFromParent(); 2019 if (MemRuntimeCheckCond) 2020 MemCheckBlock->eraseFromParent(); 2021 } 2022 2023 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2024 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2025 /// depending on the generated condition. 2026 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2027 BasicBlock *LoopVectorPreHeader, 2028 BasicBlock *LoopExitBlock) { 2029 if (!SCEVCheckCond) 2030 return nullptr; 2031 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2032 if (C->isZero()) 2033 return nullptr; 2034 2035 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2036 2037 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2038 // Create new preheader for vector loop. 2039 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2040 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2041 2042 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2043 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2044 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2045 SCEVCheckBlock); 2046 2047 DT->addNewBlock(SCEVCheckBlock, Pred); 2048 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2049 2050 ReplaceInstWithInst( 2051 SCEVCheckBlock->getTerminator(), 2052 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2053 // Mark the check as used, to prevent it from being removed during cleanup. 2054 SCEVCheckCond = nullptr; 2055 return SCEVCheckBlock; 2056 } 2057 2058 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2059 /// the branches to branch to the vector preheader or \p Bypass, depending on 2060 /// the generated condition. 
2061 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2062 BasicBlock *LoopVectorPreHeader) { 2063 // Check if we generated code that checks in runtime if arrays overlap. 2064 if (!MemRuntimeCheckCond) 2065 return nullptr; 2066 2067 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2068 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2069 MemCheckBlock); 2070 2071 DT->addNewBlock(MemCheckBlock, Pred); 2072 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2073 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2074 2075 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2076 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2077 2078 ReplaceInstWithInst( 2079 MemCheckBlock->getTerminator(), 2080 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2081 MemCheckBlock->getTerminator()->setDebugLoc( 2082 Pred->getTerminator()->getDebugLoc()); 2083 2084 // Mark the check as used, to prevent it from being removed during cleanup. 2085 MemRuntimeCheckCond = nullptr; 2086 return MemCheckBlock; 2087 } 2088 }; 2089 2090 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2091 // vectorization. The loop needs to be annotated with #pragma omp simd 2092 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2093 // vector length information is not provided, vectorization is not considered 2094 // explicit. Interleave hints are not allowed either. These limitations will be 2095 // relaxed in the future. 2096 // Please, note that we are currently forced to abuse the pragma 'clang 2097 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2098 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2099 // provides *explicit vectorization hints* (LV can bypass legal checks and 2100 // assume that vectorization is legal). However, both hints are implemented 2101 // using the same metadata (llvm.loop.vectorize, processed by 2102 // LoopVectorizeHints). This will be fixed in the future when the native IR 2103 // representation for pragma 'omp simd' is introduced. 2104 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2105 OptimizationRemarkEmitter *ORE) { 2106 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2107 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2108 2109 // Only outer loops with an explicit vectorization hint are supported. 2110 // Unannotated outer loops are ignored. 2111 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2112 return false; 2113 2114 Function *Fn = OuterLp->getHeader()->getParent(); 2115 if (!Hints.allowVectorization(Fn, OuterLp, 2116 true /*VectorizeOnlyWhenForced*/)) { 2117 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2118 return false; 2119 } 2120 2121 if (Hints.getInterleave() > 1) { 2122 // TODO: Interleave support is future work. 2123 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2124 "outer loops.\n"); 2125 Hints.emitRemarkWithHints(); 2126 return false; 2127 } 2128 2129 return true; 2130 } 2131 2132 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2133 OptimizationRemarkEmitter *ORE, 2134 SmallVectorImpl<Loop *> &V) { 2135 // Collect inner loops and outer loops without irreducible control flow. For 2136 // now, only collect outer loops that have explicit vectorization hints. If we 2137 // are stress testing the VPlan H-CFG construction, we collect the outermost 2138 // loop of every loop nest. 
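  // Illustrative example (the annotation and loop shape are assumed, not taken
  // from the code below): given a nest such as
  //
  //   #pragma clang loop vectorize(enable) vectorize_width(4)
  //   for (i = 0; i < N; ++i)     // explicitly annotated outer loop
  //     for (j = 0; j < M; ++j)   // inner loop
  //       ...
  //
  // the VPlan-native path collects only the annotated outer loop, whereas the
  // default path would instead recurse and collect the innermost j-loop.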
2139 if (L.isInnermost() || VPlanBuildStressTest || 2140 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2141 LoopBlocksRPO RPOT(&L); 2142 RPOT.perform(LI); 2143 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2144 V.push_back(&L); 2145 // TODO: Collect inner loops inside marked outer loops in case 2146 // vectorization fails for the outer loop. Do not invoke 2147 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2148 // already known to be reducible. We can use an inherited attribute for 2149 // that. 2150 return; 2151 } 2152 } 2153 for (Loop *InnerL : L) 2154 collectSupportedLoops(*InnerL, LI, ORE, V); 2155 } 2156 2157 namespace { 2158 2159 /// The LoopVectorize Pass. 2160 struct LoopVectorize : public FunctionPass { 2161 /// Pass identification, replacement for typeid 2162 static char ID; 2163 2164 LoopVectorizePass Impl; 2165 2166 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2167 bool VectorizeOnlyWhenForced = false) 2168 : FunctionPass(ID), 2169 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2170 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2171 } 2172 2173 bool runOnFunction(Function &F) override { 2174 if (skipFunction(F)) 2175 return false; 2176 2177 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2178 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2179 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2180 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2181 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2182 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2183 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2184 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2185 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2186 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2187 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2188 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2189 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2190 2191 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2192 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2193 2194 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2195 GetLAA, *ORE, PSI).MadeAnyChange; 2196 } 2197 2198 void getAnalysisUsage(AnalysisUsage &AU) const override { 2199 AU.addRequired<AssumptionCacheTracker>(); 2200 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2201 AU.addRequired<DominatorTreeWrapperPass>(); 2202 AU.addRequired<LoopInfoWrapperPass>(); 2203 AU.addRequired<ScalarEvolutionWrapperPass>(); 2204 AU.addRequired<TargetTransformInfoWrapperPass>(); 2205 AU.addRequired<AAResultsWrapperPass>(); 2206 AU.addRequired<LoopAccessLegacyAnalysis>(); 2207 AU.addRequired<DemandedBitsWrapperPass>(); 2208 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2209 AU.addRequired<InjectTLIMappingsLegacy>(); 2210 2211 // We currently do not preserve loopinfo/dominator analyses with outer loop 2212 // vectorization. Until this is addressed, mark these analyses as preserved 2213 // only for non-VPlan-native path. 2214 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2215 if (!EnableVPlanNativePath) { 2216 AU.addPreserved<LoopInfoWrapperPass>(); 2217 AU.addPreserved<DominatorTreeWrapperPass>(); 2218 } 2219 2220 AU.addPreserved<BasicAAWrapperPass>(); 2221 AU.addPreserved<GlobalsAAWrapperPass>(); 2222 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2223 } 2224 }; 2225 2226 } // end anonymous namespace 2227 2228 //===----------------------------------------------------------------------===// 2229 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2230 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2231 //===----------------------------------------------------------------------===// 2232 2233 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2234 // We need to place the broadcast of invariant variables outside the loop, 2235 // but only if it's proven safe to do so. Else, broadcast will be inside 2236 // vector loop body. 2237 Instruction *Instr = dyn_cast<Instruction>(V); 2238 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2239 (!Instr || 2240 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2241 // Place the code for broadcasting invariant variables in the new preheader. 2242 IRBuilder<>::InsertPointGuard Guard(Builder); 2243 if (SafeToHoist) 2244 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2245 2246 // Broadcast the scalar into all locations in the vector. 2247 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2248 2249 return Shuf; 2250 } 2251 2252 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2253 const InductionDescriptor &II, Value *Step, Value *Start, 2254 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2255 VPTransformState &State) { 2256 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2257 "Expected either an induction phi-node or a truncate of it!"); 2258 2259 // Construct the initial value of the vector IV in the vector loop preheader 2260 auto CurrIP = Builder.saveIP(); 2261 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2262 if (isa<TruncInst>(EntryVal)) { 2263 assert(Start->getType()->isIntegerTy() && 2264 "Truncation requires an integer type"); 2265 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2266 Step = Builder.CreateTrunc(Step, TruncType); 2267 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2268 } 2269 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2270 Value *SteppedStart = 2271 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2272 2273 // We create vector phi nodes for both integer and floating-point induction 2274 // variables. Here, we determine the kind of arithmetic we will perform. 2275 Instruction::BinaryOps AddOp; 2276 Instruction::BinaryOps MulOp; 2277 if (Step->getType()->isIntegerTy()) { 2278 AddOp = Instruction::Add; 2279 MulOp = Instruction::Mul; 2280 } else { 2281 AddOp = II.getInductionOpcode(); 2282 MulOp = Instruction::FMul; 2283 } 2284 2285 // Multiply the vectorization factor by the step using integer or 2286 // floating-point arithmetic as appropriate. 2287 Type *StepType = Step->getType(); 2288 if (Step->getType()->isFloatingPointTy()) 2289 StepType = IntegerType::get(StepType->getContext(), 2290 StepType->getScalarSizeInBits()); 2291 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2292 if (Step->getType()->isFloatingPointTy()) 2293 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2294 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2295 2296 // Create a vector splat to use in the induction update. 
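  // Small sketch of how the splat is used (UF and VF values assumed for
  // illustration): with UF == 2 and a fixed VF of 4, Mul is 4 * Step, so each
  // unrolled part advances the vector IV by the same splat:
  //   %step.add     = add <4 x i32> %vec.ind,   splat(4 * Step) ; value of part 1
  //   %vec.ind.next = add <4 x i32> %step.add,  splat(4 * Step) ; feeds the phi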
2297 // 2298 // FIXME: If the step is non-constant, we create the vector splat with 2299 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2300 // handle a constant vector splat. 2301 Value *SplatVF = isa<Constant>(Mul) 2302 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2303 : Builder.CreateVectorSplat(VF, Mul); 2304 Builder.restoreIP(CurrIP); 2305 2306 // We may need to add the step a number of times, depending on the unroll 2307 // factor. The last of those goes into the PHI. 2308 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2309 &*LoopVectorBody->getFirstInsertionPt()); 2310 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2311 Instruction *LastInduction = VecInd; 2312 for (unsigned Part = 0; Part < UF; ++Part) { 2313 State.set(Def, LastInduction, Part); 2314 2315 if (isa<TruncInst>(EntryVal)) 2316 addMetadata(LastInduction, EntryVal); 2317 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2318 State, Part); 2319 2320 LastInduction = cast<Instruction>( 2321 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2322 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2323 } 2324 2325 // Move the last step to the end of the latch block. This ensures consistent 2326 // placement of all induction updates. 2327 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2328 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2329 auto *ICmp = cast<Instruction>(Br->getCondition()); 2330 LastInduction->moveBefore(ICmp); 2331 LastInduction->setName("vec.ind.next"); 2332 2333 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2334 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2335 } 2336 2337 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2338 return Cost->isScalarAfterVectorization(I, VF) || 2339 Cost->isProfitableToScalarize(I, VF); 2340 } 2341 2342 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2343 if (shouldScalarizeInstruction(IV)) 2344 return true; 2345 auto isScalarInst = [&](User *U) -> bool { 2346 auto *I = cast<Instruction>(U); 2347 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2348 }; 2349 return llvm::any_of(IV->users(), isScalarInst); 2350 } 2351 2352 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2353 const InductionDescriptor &ID, const Instruction *EntryVal, 2354 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2355 unsigned Part, unsigned Lane) { 2356 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2357 "Expected either an induction phi-node or a truncate of it!"); 2358 2359 // This induction variable is not the phi from the original loop but the 2360 // newly-created IV based on the proof that casted Phi is equal to the 2361 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2362 // re-uses the same InductionDescriptor that original IV uses but we don't 2363 // have to do any recording in this case - that is done when original IV is 2364 // processed. 2365 if (isa<TruncInst>(EntryVal)) 2366 return; 2367 2368 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2369 if (Casts.empty()) 2370 return; 2371 // Only the first Cast instruction in the Casts vector is of interest. 2372 // The rest of the Casts (if exist) have no uses outside the 2373 // induction update chain itself. 
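  // Illustrative example (types chosen arbitrarily): for an induction such as
  //   %iv   = phi i32 [ 0, %ph ], [ %iv.next, %latch ]
  //   %cast = sext i32 %iv to i64   ; proven equal to the IV, possibly under a
  //                                 ; runtime guard
  // the value just computed for the widened/unrolled IV is recorded for
  // %cast's VPValue as well, so users of the cast pick up the new IV directly.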
2374 if (Lane < UINT_MAX) 2375 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2376 else 2377 State.set(CastDef, VectorLoopVal, Part); 2378 } 2379 2380 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2381 TruncInst *Trunc, VPValue *Def, 2382 VPValue *CastDef, 2383 VPTransformState &State) { 2384 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2385 "Primary induction variable must have an integer type"); 2386 2387 auto II = Legal->getInductionVars().find(IV); 2388 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2389 2390 auto ID = II->second; 2391 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2392 2393 // The value from the original loop to which we are mapping the new induction 2394 // variable. 2395 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2396 2397 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2398 2399 // Generate code for the induction step. Note that induction steps are 2400 // required to be loop-invariant 2401 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2402 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2403 "Induction step should be loop invariant"); 2404 if (PSE.getSE()->isSCEVable(IV->getType())) { 2405 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2406 return Exp.expandCodeFor(Step, Step->getType(), 2407 LoopVectorPreHeader->getTerminator()); 2408 } 2409 return cast<SCEVUnknown>(Step)->getValue(); 2410 }; 2411 2412 // The scalar value to broadcast. This is derived from the canonical 2413 // induction variable. If a truncation type is given, truncate the canonical 2414 // induction variable and step. Otherwise, derive these values from the 2415 // induction descriptor. 2416 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2417 Value *ScalarIV = Induction; 2418 if (IV != OldInduction) { 2419 ScalarIV = IV->getType()->isIntegerTy() 2420 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2421 : Builder.CreateCast(Instruction::SIToFP, Induction, 2422 IV->getType()); 2423 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2424 ScalarIV->setName("offset.idx"); 2425 } 2426 if (Trunc) { 2427 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2428 assert(Step->getType()->isIntegerTy() && 2429 "Truncation requires an integer step"); 2430 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2431 Step = Builder.CreateTrunc(Step, TruncType); 2432 } 2433 return ScalarIV; 2434 }; 2435 2436 // Create the vector values from the scalar IV, in the absence of creating a 2437 // vector IV. 2438 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2439 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2440 for (unsigned Part = 0; Part < UF; ++Part) { 2441 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2442 Value *EntryPart = 2443 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2444 ID.getInductionOpcode()); 2445 State.set(Def, EntryPart, Part); 2446 if (Trunc) 2447 addMetadata(EntryPart, Trunc); 2448 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2449 State, Part); 2450 } 2451 }; 2452 2453 // Fast-math-flags propagate from the original induction instruction. 
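  // Illustrative sketch: if the original FP induction update is, say,
  //   %iv.next = fadd fast float %iv, %step
  // the guard below copies the 'fast' flags from that fadd onto the builder,
  // so the widened multiplies and adds emitted for this induction carry the
  // same fast-math flags.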
2454 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2455 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2456 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2457 2458 // Now do the actual transformations, and start with creating the step value. 2459 Value *Step = CreateStepValue(ID.getStep()); 2460 if (VF.isZero() || VF.isScalar()) { 2461 Value *ScalarIV = CreateScalarIV(Step); 2462 CreateSplatIV(ScalarIV, Step); 2463 return; 2464 } 2465 2466 // Determine if we want a scalar version of the induction variable. This is 2467 // true if the induction variable itself is not widened, or if it has at 2468 // least one user in the loop that is not widened. 2469 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2470 if (!NeedsScalarIV) { 2471 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2472 State); 2473 return; 2474 } 2475 2476 // Try to create a new independent vector induction variable. If we can't 2477 // create the phi node, we will splat the scalar induction variable in each 2478 // loop iteration. 2479 if (!shouldScalarizeInstruction(EntryVal)) { 2480 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2481 State); 2482 Value *ScalarIV = CreateScalarIV(Step); 2483 // Create scalar steps that can be used by instructions we will later 2484 // scalarize. Note that the addition of the scalar steps will not increase 2485 // the number of instructions in the loop in the common case prior to 2486 // InstCombine. We will be trading one vector extract for each scalar step. 2487 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2488 return; 2489 } 2490 2491 // All IV users are scalar instructions, so only emit a scalar IV, not a 2492 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2493 // predicate used by the masked loads/stores. 2494 Value *ScalarIV = CreateScalarIV(Step); 2495 if (!Cost->isScalarEpilogueAllowed()) 2496 CreateSplatIV(ScalarIV, Step); 2497 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2498 } 2499 2500 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2501 Instruction::BinaryOps BinOp) { 2502 // Create and check the types. 2503 auto *ValVTy = cast<VectorType>(Val->getType()); 2504 ElementCount VLen = ValVTy->getElementCount(); 2505 2506 Type *STy = Val->getType()->getScalarType(); 2507 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2508 "Induction Step must be an integer or FP"); 2509 assert(Step->getType() == STy && "Step has wrong type"); 2510 2511 SmallVector<Constant *, 8> Indices; 2512 2513 // Create a vector of consecutive numbers from zero to VF. 2514 VectorType *InitVecValVTy = ValVTy; 2515 Type *InitVecValSTy = STy; 2516 if (STy->isFloatingPointTy()) { 2517 InitVecValSTy = 2518 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2519 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2520 } 2521 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2522 2523 // Add on StartIdx 2524 Value *StartIdxSplat = Builder.CreateVectorSplat( 2525 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2526 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2527 2528 if (STy->isIntegerTy()) { 2529 Step = Builder.CreateVectorSplat(VLen, Step); 2530 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2531 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2532 // which can be found from the original scalar operations. 
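    // Worked example (a fixed VF of 4, StartIdx == 0 and an i32 step %s are
    // assumed for illustration); the instructions built around here are
    // roughly
    //   %initvec   = <i32 0, i32 1, i32 2, i32 3>   ; step vector (+ StartIdx)
    //   %offsets   = mul %initvec, splat(%s)
    //   %induction = add %val, %offsets
    // i.e. lane L of the result is Val[L] + L * %s.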
2533 Step = Builder.CreateMul(InitVec, Step); 2534 return Builder.CreateAdd(Val, Step, "induction"); 2535 } 2536 2537 // Floating point induction. 2538 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2539 "Binary Opcode should be specified for FP induction"); 2540 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2541 Step = Builder.CreateVectorSplat(VLen, Step); 2542 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2543 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2544 } 2545 2546 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2547 Instruction *EntryVal, 2548 const InductionDescriptor &ID, 2549 VPValue *Def, VPValue *CastDef, 2550 VPTransformState &State) { 2551 // We shouldn't have to build scalar steps if we aren't vectorizing. 2552 assert(VF.isVector() && "VF should be greater than one"); 2553 // Get the value type and ensure it and the step have the same integer type. 2554 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2555 assert(ScalarIVTy == Step->getType() && 2556 "Val and Step should have the same type"); 2557 2558 // We build scalar steps for both integer and floating-point induction 2559 // variables. Here, we determine the kind of arithmetic we will perform. 2560 Instruction::BinaryOps AddOp; 2561 Instruction::BinaryOps MulOp; 2562 if (ScalarIVTy->isIntegerTy()) { 2563 AddOp = Instruction::Add; 2564 MulOp = Instruction::Mul; 2565 } else { 2566 AddOp = ID.getInductionOpcode(); 2567 MulOp = Instruction::FMul; 2568 } 2569 2570 // Determine the number of scalars we need to generate for each unroll 2571 // iteration. If EntryVal is uniform, we only need to generate the first 2572 // lane. Otherwise, we generate all VF values. 2573 bool IsUniform = 2574 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2575 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2576 // Compute the scalar steps and save the results in State. 2577 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2578 ScalarIVTy->getScalarSizeInBits()); 2579 Type *VecIVTy = nullptr; 2580 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2581 if (!IsUniform && VF.isScalable()) { 2582 VecIVTy = VectorType::get(ScalarIVTy, VF); 2583 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2584 SplatStep = Builder.CreateVectorSplat(VF, Step); 2585 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2586 } 2587 2588 for (unsigned Part = 0; Part < UF; ++Part) { 2589 Value *StartIdx0 = 2590 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2591 2592 if (!IsUniform && VF.isScalable()) { 2593 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2594 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2595 if (ScalarIVTy->isFloatingPointTy()) 2596 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2597 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2598 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2599 State.set(Def, Add, Part); 2600 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2601 Part); 2602 // It's useful to record the lane values too for the known minimum number 2603 // of elements so we do those below. This improves the code quality when 2604 // trying to extract the first element, for example. 
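      // Worked example for the per-lane path below (a fixed VF of 4 and
      // UF == 2 are assumed for illustration): lane L of part P receives the
      // scalar value
      //   ScalarIV + (P * 4 + L) * Step
      // so part 0 yields ScalarIV + {0,1,2,3} * Step and part 1 yields
      // ScalarIV + {4,5,6,7} * Step, each as an individual scalar instruction.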
2605 } 2606 2607 if (ScalarIVTy->isFloatingPointTy()) 2608 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2609 2610 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2611 Value *StartIdx = Builder.CreateBinOp( 2612 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2613 // The step returned by `createStepForVF` is a runtime-evaluated value 2614 // when VF is scalable. Otherwise, it should be folded into a Constant. 2615 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2616 "Expected StartIdx to be folded to a constant when VF is not " 2617 "scalable"); 2618 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2619 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2620 State.set(Def, Add, VPIteration(Part, Lane)); 2621 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2622 Part, Lane); 2623 } 2624 } 2625 } 2626 2627 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2628 const VPIteration &Instance, 2629 VPTransformState &State) { 2630 Value *ScalarInst = State.get(Def, Instance); 2631 Value *VectorValue = State.get(Def, Instance.Part); 2632 VectorValue = Builder.CreateInsertElement( 2633 VectorValue, ScalarInst, 2634 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2635 State.set(Def, VectorValue, Instance.Part); 2636 } 2637 2638 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2639 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2640 return Builder.CreateVectorReverse(Vec, "reverse"); 2641 } 2642 2643 // Return whether we allow using masked interleave-groups (for dealing with 2644 // strided loads/stores that reside in predicated blocks, or for dealing 2645 // with gaps). 2646 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2647 // If an override option has been passed in for interleaved accesses, use it. 2648 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2649 return EnableMaskedInterleavedMemAccesses; 2650 2651 return TTI.enableMaskedInterleavedAccessVectorization(); 2652 } 2653 2654 // Try to vectorize the interleave group that \p Instr belongs to. 2655 // 2656 // E.g. Translate following interleaved load group (factor = 3): 2657 // for (i = 0; i < N; i+=3) { 2658 // R = Pic[i]; // Member of index 0 2659 // G = Pic[i+1]; // Member of index 1 2660 // B = Pic[i+2]; // Member of index 2 2661 // ... // do something to R, G, B 2662 // } 2663 // To: 2664 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2665 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2666 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2667 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2668 // 2669 // Or translate following interleaved store group (factor = 3): 2670 // for (i = 0; i < N; i+=3) { 2671 // ... 
do something to R, G, B 2672 // Pic[i] = R; // Member of index 0 2673 // Pic[i+1] = G; // Member of index 1 2674 // Pic[i+2] = B; // Member of index 2 2675 // } 2676 // To: 2677 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2678 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2679 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2680 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2681 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2682 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2683 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2684 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2685 VPValue *BlockInMask) { 2686 Instruction *Instr = Group->getInsertPos(); 2687 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2688 2689 // Prepare for the vector type of the interleaved load/store. 2690 Type *ScalarTy = getLoadStoreType(Instr); 2691 unsigned InterleaveFactor = Group->getFactor(); 2692 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2693 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2694 2695 // Prepare for the new pointers. 2696 SmallVector<Value *, 2> AddrParts; 2697 unsigned Index = Group->getIndex(Instr); 2698 2699 // TODO: extend the masked interleaved-group support to reversed access. 2700 assert((!BlockInMask || !Group->isReverse()) && 2701 "Reversed masked interleave-group not supported."); 2702 2703 // If the group is reverse, adjust the index to refer to the last vector lane 2704 // instead of the first. We adjust the index from the first vector lane, 2705 // rather than directly getting the pointer for lane VF - 1, because the 2706 // pointer operand of the interleaved access is supposed to be uniform. For 2707 // uniform instructions, we're only required to generate a value for the 2708 // first vector lane in each unroll iteration. 2709 if (Group->isReverse()) 2710 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2711 2712 for (unsigned Part = 0; Part < UF; Part++) { 2713 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2714 setDebugLocFromInst(Builder, AddrPart); 2715 2716 // Notice current instruction could be any index. Need to adjust the address 2717 // to the member of index 0. 2718 // 2719 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2720 // b = A[i]; // Member of index 0 2721 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2722 // 2723 // E.g. A[i+1] = a; // Member of index 1 2724 // A[i] = b; // Member of index 0 2725 // A[i+2] = c; // Member of index 2 (Current instruction) 2726 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2727 2728 bool InBounds = false; 2729 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2730 InBounds = gep->isInBounds(); 2731 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2732 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2733 2734 // Cast to the vector pointer type. 
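    // For illustration (assuming VF = 4, an interleave factor of 3 and i32
    // elements, as in the example above): the member pointer computed above
    // is bitcast below from i32* to <12 x i32>* so that a single wide access
    // covers the whole group.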
2735 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2736 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2737 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2738 } 2739 2740 setDebugLocFromInst(Builder, Instr); 2741 Value *PoisonVec = PoisonValue::get(VecTy); 2742 2743 Value *MaskForGaps = nullptr; 2744 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2745 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2746 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2747 } 2748 2749 // Vectorize the interleaved load group. 2750 if (isa<LoadInst>(Instr)) { 2751 // For each unroll part, create a wide load for the group. 2752 SmallVector<Value *, 2> NewLoads; 2753 for (unsigned Part = 0; Part < UF; Part++) { 2754 Instruction *NewLoad; 2755 if (BlockInMask || MaskForGaps) { 2756 assert(useMaskedInterleavedAccesses(*TTI) && 2757 "masked interleaved groups are not allowed."); 2758 Value *GroupMask = MaskForGaps; 2759 if (BlockInMask) { 2760 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2761 Value *ShuffledMask = Builder.CreateShuffleVector( 2762 BlockInMaskPart, 2763 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2764 "interleaved.mask"); 2765 GroupMask = MaskForGaps 2766 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2767 MaskForGaps) 2768 : ShuffledMask; 2769 } 2770 NewLoad = 2771 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2772 GroupMask, PoisonVec, "wide.masked.vec"); 2773 } 2774 else 2775 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2776 Group->getAlign(), "wide.vec"); 2777 Group->addMetadata(NewLoad); 2778 NewLoads.push_back(NewLoad); 2779 } 2780 2781 // For each member in the group, shuffle out the appropriate data from the 2782 // wide loads. 2783 unsigned J = 0; 2784 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2785 Instruction *Member = Group->getMember(I); 2786 2787 // Skip the gaps in the group. 2788 if (!Member) 2789 continue; 2790 2791 auto StrideMask = 2792 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2793 for (unsigned Part = 0; Part < UF; Part++) { 2794 Value *StridedVec = Builder.CreateShuffleVector( 2795 NewLoads[Part], StrideMask, "strided.vec"); 2796 2797 // If this member has different type, cast the result type. 2798 if (Member->getType() != ScalarTy) { 2799 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2800 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2801 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2802 } 2803 2804 if (Group->isReverse()) 2805 StridedVec = reverseVector(StridedVec); 2806 2807 State.set(VPDefs[J], StridedVec, Part); 2808 } 2809 ++J; 2810 } 2811 return; 2812 } 2813 2814 // The sub vector type for current instruction. 2815 auto *SubVT = VectorType::get(ScalarTy, VF); 2816 2817 // Vectorize the interleaved store group. 2818 for (unsigned Part = 0; Part < UF; Part++) { 2819 // Collect the stored vector from each member. 2820 SmallVector<Value *, 4> StoredVecs; 2821 for (unsigned i = 0; i < InterleaveFactor; i++) { 2822 // Interleaved store group doesn't allow a gap, so each index has a member 2823 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2824 2825 Value *StoredVec = State.get(StoredValues[i], Part); 2826 2827 if (Group->isReverse()) 2828 StoredVec = reverseVector(StoredVec); 2829 2830 // If this member has different type, cast it to a unified type. 
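      // (Illustrative note: e.g. a float member stored alongside i32 members
      // would be bitcast to <VF x i32> here, assuming equal element sizes,
      // which is all createBitOrPointerCast requires.)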
2831 2832 if (StoredVec->getType() != SubVT) 2833 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2834 2835 StoredVecs.push_back(StoredVec); 2836 } 2837 2838 // Concatenate all vectors into a wide vector. 2839 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2840 2841 // Interleave the elements in the wide vector. 2842 Value *IVec = Builder.CreateShuffleVector( 2843 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2844 "interleaved.vec"); 2845 2846 Instruction *NewStoreInstr; 2847 if (BlockInMask) { 2848 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2849 Value *ShuffledMask = Builder.CreateShuffleVector( 2850 BlockInMaskPart, 2851 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2852 "interleaved.mask"); 2853 NewStoreInstr = Builder.CreateMaskedStore( 2854 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2855 } 2856 else 2857 NewStoreInstr = 2858 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2859 2860 Group->addMetadata(NewStoreInstr); 2861 } 2862 } 2863 2864 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2865 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2866 VPValue *StoredValue, VPValue *BlockInMask) { 2867 // Attempt to issue a wide load. 2868 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2869 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2870 2871 assert((LI || SI) && "Invalid Load/Store instruction"); 2872 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2873 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2874 2875 LoopVectorizationCostModel::InstWidening Decision = 2876 Cost->getWideningDecision(Instr, VF); 2877 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2878 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2879 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2880 "CM decision is not to widen the memory instruction"); 2881 2882 Type *ScalarDataTy = getLoadStoreType(Instr); 2883 2884 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2885 const Align Alignment = getLoadStoreAlignment(Instr); 2886 2887 // Determine if the pointer operand of the access is either consecutive or 2888 // reverse consecutive. 2889 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2890 bool ConsecutiveStride = 2891 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2892 bool CreateGatherScatter = 2893 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2894 2895 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2896 // gather/scatter. Otherwise Decision should have been to Scalarize. 2897 assert((ConsecutiveStride || CreateGatherScatter) && 2898 "The instruction should be scalarized"); 2899 (void)ConsecutiveStride; 2900 2901 VectorParts BlockInMaskParts(UF); 2902 bool isMaskRequired = BlockInMask; 2903 if (isMaskRequired) 2904 for (unsigned Part = 0; Part < UF; ++Part) 2905 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2906 2907 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2908 // Calculate the pointer for the specific unroll-part. 2909 GetElementPtrInst *PartPtr = nullptr; 2910 2911 bool InBounds = false; 2912 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2913 InBounds = gep->isInBounds(); 2914 if (Reverse) { 2915 // If the address is consecutive but reversed, then the 2916 // wide store needs to start at the last vector element. 
2917 // RunTimeVF = VScale * VF.getKnownMinValue() 2918 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2919 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2920 // NumElt = -Part * RunTimeVF 2921 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2922 // LastLane = 1 - RunTimeVF 2923 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2924 PartPtr = 2925 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2926 PartPtr->setIsInBounds(InBounds); 2927 PartPtr = cast<GetElementPtrInst>( 2928 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2929 PartPtr->setIsInBounds(InBounds); 2930 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2931 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2932 } else { 2933 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2934 PartPtr = cast<GetElementPtrInst>( 2935 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2936 PartPtr->setIsInBounds(InBounds); 2937 } 2938 2939 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2940 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2941 }; 2942 2943 // Handle Stores: 2944 if (SI) { 2945 setDebugLocFromInst(Builder, SI); 2946 2947 for (unsigned Part = 0; Part < UF; ++Part) { 2948 Instruction *NewSI = nullptr; 2949 Value *StoredVal = State.get(StoredValue, Part); 2950 if (CreateGatherScatter) { 2951 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2952 Value *VectorGep = State.get(Addr, Part); 2953 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2954 MaskPart); 2955 } else { 2956 if (Reverse) { 2957 // If we store to reverse consecutive memory locations, then we need 2958 // to reverse the order of elements in the stored value. 2959 StoredVal = reverseVector(StoredVal); 2960 // We don't want to update the value in the map as it might be used in 2961 // another expression. So don't call resetVectorValue(StoredVal). 2962 } 2963 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2964 if (isMaskRequired) 2965 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2966 BlockInMaskParts[Part]); 2967 else 2968 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2969 } 2970 addMetadata(NewSI, SI); 2971 } 2972 return; 2973 } 2974 2975 // Handle loads. 2976 assert(LI && "Must have a load instruction"); 2977 setDebugLocFromInst(Builder, LI); 2978 for (unsigned Part = 0; Part < UF; ++Part) { 2979 Value *NewLI; 2980 if (CreateGatherScatter) { 2981 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2982 Value *VectorGep = State.get(Addr, Part); 2983 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2984 nullptr, "wide.masked.gather"); 2985 addMetadata(NewLI, LI); 2986 } else { 2987 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2988 if (isMaskRequired) 2989 NewLI = Builder.CreateMaskedLoad( 2990 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2991 "wide.masked.load"); 2992 else 2993 NewLI = 2994 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2995 2996 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
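      // Illustrative sketch (assuming VF = 4, UF = 1, i32 data): for a
      // reverse access the pointer produced by CreateVecPtr already points
      // VF - 1 elements back, so the wide load reads a[i-3..i] and the
      // reverse shuffle below restores the scalar iteration order
      // a[i], a[i-1], a[i-2], a[i-3].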
2997 addMetadata(NewLI, LI); 2998 if (Reverse) 2999 NewLI = reverseVector(NewLI); 3000 } 3001 3002 State.set(Def, NewLI, Part); 3003 } 3004 } 3005 3006 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3007 VPUser &User, 3008 const VPIteration &Instance, 3009 bool IfPredicateInstr, 3010 VPTransformState &State) { 3011 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3012 3013 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3014 // the first lane and part. 3015 if (isa<NoAliasScopeDeclInst>(Instr)) 3016 if (!Instance.isFirstIteration()) 3017 return; 3018 3019 setDebugLocFromInst(Builder, Instr); 3020 3021 // Does this instruction return a value ? 3022 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3023 3024 Instruction *Cloned = Instr->clone(); 3025 if (!IsVoidRetTy) 3026 Cloned->setName(Instr->getName() + ".cloned"); 3027 3028 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3029 Builder.GetInsertPoint()); 3030 // Replace the operands of the cloned instructions with their scalar 3031 // equivalents in the new loop. 3032 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3033 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3034 auto InputInstance = Instance; 3035 if (!Operand || !OrigLoop->contains(Operand) || 3036 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3037 InputInstance.Lane = VPLane::getFirstLane(); 3038 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3039 Cloned->setOperand(op, NewOp); 3040 } 3041 addNewMetadata(Cloned, Instr); 3042 3043 // Place the cloned scalar in the new loop. 3044 Builder.Insert(Cloned); 3045 3046 State.set(Def, Cloned, Instance); 3047 3048 // If we just cloned a new assumption, add it the assumption cache. 3049 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3050 AC->registerAssumption(II); 3051 3052 // End if-block. 3053 if (IfPredicateInstr) 3054 PredicatedInstructions.push_back(Cloned); 3055 } 3056 3057 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3058 Value *End, Value *Step, 3059 Instruction *DL) { 3060 BasicBlock *Header = L->getHeader(); 3061 BasicBlock *Latch = L->getLoopLatch(); 3062 // As we're just creating this loop, it's possible no latch exists 3063 // yet. If so, use the header as this will be a single block loop. 3064 if (!Latch) 3065 Latch = Header; 3066 3067 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3068 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3069 setDebugLocFromInst(Builder, OldInst); 3070 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3071 3072 Builder.SetInsertPoint(Latch->getTerminator()); 3073 setDebugLocFromInst(Builder, OldInst); 3074 3075 // Create i+1 and fill the PHINode. 3076 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3077 Induction->addIncoming(Start, L->getLoopPreheader()); 3078 Induction->addIncoming(Next, Latch); 3079 // Create the compare. 3080 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3081 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3082 3083 // Now we have two terminators. Remove the old one from the block. 
3084 Latch->getTerminator()->eraseFromParent(); 3085 3086 return Induction; 3087 } 3088 3089 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3090 if (TripCount) 3091 return TripCount; 3092 3093 assert(L && "Create Trip Count for null loop."); 3094 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3095 // Find the loop boundaries. 3096 ScalarEvolution *SE = PSE.getSE(); 3097 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3098 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3099 "Invalid loop count"); 3100 3101 Type *IdxTy = Legal->getWidestInductionType(); 3102 assert(IdxTy && "No type for induction"); 3103 3104 // The exit count might have the type of i64 while the phi is i32. This can 3105 // happen if we have an induction variable that is sign extended before the 3106 // compare. The only way that we get a backedge taken count is that the 3107 // induction variable was signed and as such will not overflow. In such a case 3108 // truncation is legal. 3109 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3110 IdxTy->getPrimitiveSizeInBits()) 3111 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3112 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3113 3114 // Get the total trip count from the count by adding 1. 3115 const SCEV *ExitCount = SE->getAddExpr( 3116 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3117 3118 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3119 3120 // Expand the trip count and place the new instructions in the preheader. 3121 // Notice that the pre-header does not change, only the loop body. 3122 SCEVExpander Exp(*SE, DL, "induction"); 3123 3124 // Count holds the overall loop count (N). 3125 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3126 L->getLoopPreheader()->getTerminator()); 3127 3128 if (TripCount->getType()->isPointerTy()) 3129 TripCount = 3130 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3131 L->getLoopPreheader()->getTerminator()); 3132 3133 return TripCount; 3134 } 3135 3136 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3137 if (VectorTripCount) 3138 return VectorTripCount; 3139 3140 Value *TC = getOrCreateTripCount(L); 3141 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3142 3143 Type *Ty = TC->getType(); 3144 // This is where we can make the step a runtime constant. 3145 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3146 3147 // If the tail is to be folded by masking, round the number of iterations N 3148 // up to a multiple of Step instead of rounding down. This is done by first 3149 // adding Step-1 and then rounding down. Note that it's ok if this addition 3150 // overflows: the vector induction variable will eventually wrap to zero given 3151 // that it starts at zero and its Step is a power of two; the loop will then 3152 // exit, with the last early-exit vector comparison also producing all-true. 3153 if (Cost->foldTailByMasking()) { 3154 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3155 "VF*UF must be a power of 2 when folding tail by masking"); 3156 assert(!VF.isScalable() && 3157 "Tail folding not yet supported for scalable vectors"); 3158 TC = Builder.CreateAdd( 3159 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3160 } 3161 3162 // Now we need to generate the expression for the part of the loop that the 3163 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3164 // iterations are not required for correctness, or N - Step, otherwise. Step 3165 // is equal to the vectorization factor (number of SIMD elements) times the 3166 // unroll factor (number of SIMD instructions). 3167 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3168 3169 // There are two cases where we need to ensure (at least) the last iteration 3170 // runs in the scalar remainder loop. Thus, if the step evenly divides 3171 // the trip count, we set the remainder to be equal to the step. If the step 3172 // does not evenly divide the trip count, no adjustment is necessary since 3173 // there will already be scalar iterations. Note that the minimum iterations 3174 // check ensures that N >= Step. The cases are: 3175 // 1) If there is a non-reversed interleaved group that may speculatively 3176 // access memory out-of-bounds. 3177 // 2) If any instruction may follow a conditionally taken exit. That is, if 3178 // the loop contains multiple exiting blocks, or a single exiting block 3179 // which is not the latch. 3180 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3181 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3182 R = Builder.CreateSelect(IsZero, Step, R); 3183 } 3184 3185 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3186 3187 return VectorTripCount; 3188 } 3189 3190 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3191 const DataLayout &DL) { 3192 // Verify that V is a vector type with same number of elements as DstVTy. 3193 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3194 unsigned VF = DstFVTy->getNumElements(); 3195 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3196 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3197 Type *SrcElemTy = SrcVecTy->getElementType(); 3198 Type *DstElemTy = DstFVTy->getElementType(); 3199 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3200 "Vector elements must have same size"); 3201 3202 // Do a direct cast if element types are castable. 3203 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3204 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3205 } 3206 // V cannot be directly casted to desired vector type. 3207 // May happen when V is a floating point vector but DstVTy is a vector of 3208 // pointers or vice-versa. Handle this using a two-step bitcast using an 3209 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3210 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3211 "Only one type should be a pointer type"); 3212 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3213 "Only one type should be a floating point type"); 3214 Type *IntTy = 3215 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3216 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3217 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3218 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3219 } 3220 3221 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3222 BasicBlock *Bypass) { 3223 Value *Count = getOrCreateTripCount(L); 3224 // Reuse existing vector loop preheader for TC checks. 3225 // Note that new preheader block is generated for vector loop. 
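  // Illustrative sketch (assuming VF = 4, UF = 2 and no tail folding): the
  // branch emitted below sends trip counts smaller than 8, or equal to 8 when
  // a scalar epilogue is required, straight to the scalar loop.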
3226 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3227 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3228 3229 // Generate code to check if the loop's trip count is less than VF * UF, or 3230 // equal to it in case a scalar epilogue is required; this implies that the 3231 // vector trip count is zero. This check also covers the case where adding one 3232 // to the backedge-taken count overflowed leading to an incorrect trip count 3233 // of zero. In this case we will also jump to the scalar loop. 3234 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3235 : ICmpInst::ICMP_ULT; 3236 3237 // If tail is to be folded, vector loop takes care of all iterations. 3238 Value *CheckMinIters = Builder.getFalse(); 3239 if (!Cost->foldTailByMasking()) { 3240 Value *Step = 3241 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3242 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3243 } 3244 // Create new preheader for vector loop. 3245 LoopVectorPreHeader = 3246 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3247 "vector.ph"); 3248 3249 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3250 DT->getNode(Bypass)->getIDom()) && 3251 "TC check is expected to dominate Bypass"); 3252 3253 // Update dominator for Bypass & LoopExit. 3254 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3255 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3256 3257 ReplaceInstWithInst( 3258 TCCheckBlock->getTerminator(), 3259 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3260 LoopBypassBlocks.push_back(TCCheckBlock); 3261 } 3262 3263 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3264 3265 BasicBlock *const SCEVCheckBlock = 3266 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3267 if (!SCEVCheckBlock) 3268 return nullptr; 3269 3270 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3271 (OptForSizeBasedOnProfile && 3272 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3273 "Cannot SCEV check stride or overflow when optimizing for size"); 3274 3275 3276 // Update dominator only if this is first RT check. 3277 if (LoopBypassBlocks.empty()) { 3278 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3279 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3280 } 3281 3282 LoopBypassBlocks.push_back(SCEVCheckBlock); 3283 AddedSafetyChecks = true; 3284 return SCEVCheckBlock; 3285 } 3286 3287 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3288 BasicBlock *Bypass) { 3289 // VPlan-native path does not do any analysis for runtime checks currently. 3290 if (EnableVPlanNativePath) 3291 return nullptr; 3292 3293 BasicBlock *const MemCheckBlock = 3294 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3295 3296 // Check if we generated code that checks in runtime if arrays overlap. We put 3297 // the checks into a separate block to make the more common case of few 3298 // elements faster. 
3299 if (!MemCheckBlock) 3300 return nullptr; 3301 3302 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3303 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3304 "Cannot emit memory checks when optimizing for size, unless forced " 3305 "to vectorize."); 3306 ORE->emit([&]() { 3307 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3308 L->getStartLoc(), L->getHeader()) 3309 << "Code-size may be reduced by not forcing " 3310 "vectorization, or by source-code modifications " 3311 "eliminating the need for runtime checks " 3312 "(e.g., adding 'restrict')."; 3313 }); 3314 } 3315 3316 LoopBypassBlocks.push_back(MemCheckBlock); 3317 3318 AddedSafetyChecks = true; 3319 3320 // We currently don't use LoopVersioning for the actual loop cloning but we 3321 // still use it to add the noalias metadata. 3322 LVer = std::make_unique<LoopVersioning>( 3323 *Legal->getLAI(), 3324 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3325 DT, PSE.getSE()); 3326 LVer->prepareNoAliasMetadata(); 3327 return MemCheckBlock; 3328 } 3329 3330 Value *InnerLoopVectorizer::emitTransformedIndex( 3331 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3332 const InductionDescriptor &ID) const { 3333 3334 SCEVExpander Exp(*SE, DL, "induction"); 3335 auto Step = ID.getStep(); 3336 auto StartValue = ID.getStartValue(); 3337 assert(Index->getType()->getScalarType() == Step->getType() && 3338 "Index scalar type does not match StepValue type"); 3339 3340 // Note: the IR at this point is broken. We cannot use SE to create any new 3341 // SCEV and then expand it, hoping that SCEV's simplification will give us 3342 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3343 // lead to various SCEV crashes. So all we can do is to use builder and rely 3344 // on InstCombine for future simplifications. Here we handle some trivial 3345 // cases only. 3346 auto CreateAdd = [&B](Value *X, Value *Y) { 3347 assert(X->getType() == Y->getType() && "Types don't match!"); 3348 if (auto *CX = dyn_cast<ConstantInt>(X)) 3349 if (CX->isZero()) 3350 return Y; 3351 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3352 if (CY->isZero()) 3353 return X; 3354 return B.CreateAdd(X, Y); 3355 }; 3356 3357 // We allow X to be a vector type, in which case Y will potentially be 3358 // splatted into a vector with the same element count. 3359 auto CreateMul = [&B](Value *X, Value *Y) { 3360 assert(X->getType()->getScalarType() == Y->getType() && 3361 "Types don't match!"); 3362 if (auto *CX = dyn_cast<ConstantInt>(X)) 3363 if (CX->isOne()) 3364 return Y; 3365 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3366 if (CY->isOne()) 3367 return X; 3368 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3369 if (XVTy && !isa<VectorType>(Y->getType())) 3370 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3371 return B.CreateMul(X, Y); 3372 }; 3373 3374 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3375 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3376 // the DomTree is not kept up-to-date for additional blocks generated in the 3377 // vector loop. By using the header as insertion point, we guarantee that the 3378 // expanded instructions dominate all their uses. 
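  // Illustrative examples of the cases handled below (start and step values
  // are assumed): an integer IV with start 7 and step 3 maps an index i to
  // 7 + i * 3; a pointer IV becomes a GEP of StartValue by i * Step; an FP IV
  // reuses the original FAdd/FSub as StartValue op (Step * i).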
3379 auto GetInsertPoint = [this, &B]() { 3380 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3381 if (InsertBB != LoopVectorBody && 3382 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3383 return LoopVectorBody->getTerminator(); 3384 return &*B.GetInsertPoint(); 3385 }; 3386 3387 switch (ID.getKind()) { 3388 case InductionDescriptor::IK_IntInduction: { 3389 assert(!isa<VectorType>(Index->getType()) && 3390 "Vector indices not supported for integer inductions yet"); 3391 assert(Index->getType() == StartValue->getType() && 3392 "Index type does not match StartValue type"); 3393 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3394 return B.CreateSub(StartValue, Index); 3395 auto *Offset = CreateMul( 3396 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3397 return CreateAdd(StartValue, Offset); 3398 } 3399 case InductionDescriptor::IK_PtrInduction: { 3400 assert(isa<SCEVConstant>(Step) && 3401 "Expected constant step for pointer induction"); 3402 return B.CreateGEP( 3403 StartValue->getType()->getPointerElementType(), StartValue, 3404 CreateMul(Index, 3405 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3406 GetInsertPoint()))); 3407 } 3408 case InductionDescriptor::IK_FpInduction: { 3409 assert(!isa<VectorType>(Index->getType()) && 3410 "Vector indices not supported for FP inductions yet"); 3411 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3412 auto InductionBinOp = ID.getInductionBinOp(); 3413 assert(InductionBinOp && 3414 (InductionBinOp->getOpcode() == Instruction::FAdd || 3415 InductionBinOp->getOpcode() == Instruction::FSub) && 3416 "Original bin op should be defined for FP induction"); 3417 3418 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3419 Value *MulExp = B.CreateFMul(StepValue, Index); 3420 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3421 "induction"); 3422 } 3423 case InductionDescriptor::IK_NoInduction: 3424 return nullptr; 3425 } 3426 llvm_unreachable("invalid enum"); 3427 } 3428 3429 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3430 LoopScalarBody = OrigLoop->getHeader(); 3431 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3432 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3433 assert(LoopExitBlock && "Must have an exit block"); 3434 assert(LoopVectorPreHeader && "Invalid loop structure"); 3435 3436 LoopMiddleBlock = 3437 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3438 LI, nullptr, Twine(Prefix) + "middle.block"); 3439 LoopScalarPreHeader = 3440 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3441 nullptr, Twine(Prefix) + "scalar.ph"); 3442 3443 // Set up branch from middle block to the exit and scalar preheader blocks. 3444 // completeLoopSkeleton will update the condition to use an iteration check, 3445 // if required to decide whether to execute the remainder. 3446 BranchInst *BrInst = 3447 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3448 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3449 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3450 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3451 3452 // We intentionally don't let SplitBlock to update LoopInfo since 3453 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3454 // LoopVectorBody is explicitly added to the correct place few lines later. 
3455 LoopVectorBody = 3456 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3457 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3458 3459 // Update dominator for loop exit. 3460 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3461 3462 // Create and register the new vector loop. 3463 Loop *Lp = LI->AllocateLoop(); 3464 Loop *ParentLoop = OrigLoop->getParentLoop(); 3465 3466 // Insert the new loop into the loop nest and register the new basic blocks 3467 // before calling any utilities such as SCEV that require valid LoopInfo. 3468 if (ParentLoop) { 3469 ParentLoop->addChildLoop(Lp); 3470 } else { 3471 LI->addTopLevelLoop(Lp); 3472 } 3473 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3474 return Lp; 3475 } 3476 3477 void InnerLoopVectorizer::createInductionResumeValues( 3478 Loop *L, Value *VectorTripCount, 3479 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3480 assert(VectorTripCount && L && "Expected valid arguments"); 3481 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3482 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3483 "Inconsistent information about additional bypass."); 3484 // We are going to resume the execution of the scalar loop. 3485 // Go over all of the induction variables that we found and fix the 3486 // PHIs that are left in the scalar version of the loop. 3487 // The starting values of PHI nodes depend on the counter of the last 3488 // iteration in the vectorized loop. 3489 // If we come from a bypass edge then we need to start from the original 3490 // start value. 3491 for (auto &InductionEntry : Legal->getInductionVars()) { 3492 PHINode *OrigPhi = InductionEntry.first; 3493 InductionDescriptor II = InductionEntry.second; 3494 3495 // Create phi nodes to merge from the backedge-taken check block. 3496 PHINode *BCResumeVal = 3497 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3498 LoopScalarPreHeader->getTerminator()); 3499 // Copy original phi DL over to the new one. 3500 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3501 Value *&EndValue = IVEndValues[OrigPhi]; 3502 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3503 if (OrigPhi == OldInduction) { 3504 // We know what the end value is. 3505 EndValue = VectorTripCount; 3506 } else { 3507 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3508 3509 // Fast-math-flags propagate from the original induction instruction. 3510 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3511 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3512 3513 Type *StepType = II.getStep()->getType(); 3514 Instruction::CastOps CastOp = 3515 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3516 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3517 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3518 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3519 EndValue->setName("ind.end"); 3520 3521 // Compute the end value for the additional bypass (if applicable). 
3522 if (AdditionalBypass.first) { 3523 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3524 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3525 StepType, true); 3526 CRD = 3527 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3528 EndValueFromAdditionalBypass = 3529 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3530 EndValueFromAdditionalBypass->setName("ind.end"); 3531 } 3532 } 3533 // The new PHI merges the original incoming value, in case of a bypass, 3534 // or the value at the end of the vectorized loop. 3535 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3536 3537 // Fix the scalar body counter (PHI node). 3538 // The old induction's phi node in the scalar body needs the truncated 3539 // value. 3540 for (BasicBlock *BB : LoopBypassBlocks) 3541 BCResumeVal->addIncoming(II.getStartValue(), BB); 3542 3543 if (AdditionalBypass.first) 3544 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3545 EndValueFromAdditionalBypass); 3546 3547 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3548 } 3549 } 3550 3551 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3552 MDNode *OrigLoopID) { 3553 assert(L && "Expected valid loop."); 3554 3555 // The trip counts should be cached by now. 3556 Value *Count = getOrCreateTripCount(L); 3557 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3558 3559 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3560 3561 // Add a check in the middle block to see if we have completed 3562 // all of the iterations in the first vector loop. 3563 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3564 // If tail is to be folded, we know we don't need to run the remainder. 3565 if (!Cost->foldTailByMasking()) { 3566 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3567 Count, VectorTripCount, "cmp.n", 3568 LoopMiddleBlock->getTerminator()); 3569 3570 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3571 // of the corresponding compare because they may have ended up with 3572 // different line numbers and we want to avoid awkward line stepping while 3573 // debugging. Eg. if the compare has got a line number inside the loop. 3574 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3575 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3576 } 3577 3578 // Get ready to start creating new instructions into the vectorized body. 3579 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3580 "Inconsistent vector loop preheader"); 3581 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3582 3583 Optional<MDNode *> VectorizedLoopID = 3584 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3585 LLVMLoopVectorizeFollowupVectorized}); 3586 if (VectorizedLoopID.hasValue()) { 3587 L->setLoopID(VectorizedLoopID.getValue()); 3588 3589 // Do not setAlreadyVectorized if loop attributes have been defined 3590 // explicitly. 3591 return LoopVectorPreHeader; 3592 } 3593 3594 // Keep all loop hints from the original loop on the vector loop (we'll 3595 // replace the vectorizer-specific hints below). 
3596 if (MDNode *LID = OrigLoop->getLoopID()) 3597 L->setLoopID(LID); 3598 3599 LoopVectorizeHints Hints(L, true, *ORE); 3600 Hints.setAlreadyVectorized(); 3601 3602 #ifdef EXPENSIVE_CHECKS 3603 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3604 LI->verify(*DT); 3605 #endif 3606 3607 return LoopVectorPreHeader; 3608 } 3609 3610 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3611 /* 3612 In this function we generate a new loop. The new loop will contain 3613 the vectorized instructions while the old loop will continue to run the 3614 scalar remainder. 3615 3616 [ ] <-- loop iteration number check. 3617 / | 3618 / v 3619 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3620 | / | 3621 | / v 3622 || [ ] <-- vector pre header. 3623 |/ | 3624 | v 3625 | [ ] \ 3626 | [ ]_| <-- vector loop. 3627 | | 3628 | v 3629 | -[ ] <--- middle-block. 3630 | / | 3631 | / v 3632 -|- >[ ] <--- new preheader. 3633 | | 3634 | v 3635 | [ ] \ 3636 | [ ]_| <-- old scalar loop to handle remainder. 3637 \ | 3638 \ v 3639 >[ ] <-- exit block. 3640 ... 3641 */ 3642 3643 // Get the metadata of the original loop before it gets modified. 3644 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3645 3646 // Workaround! Compute the trip count of the original loop and cache it 3647 // before we start modifying the CFG. This code has a systemic problem 3648 // wherein it tries to run analysis over partially constructed IR; this is 3649 // wrong, and not simply for SCEV. The trip count of the original loop 3650 // simply happens to be prone to hitting this in practice. In theory, we 3651 // can hit the same issue for any SCEV, or ValueTracking query done during 3652 // mutation. See PR49900. 3653 getOrCreateTripCount(OrigLoop); 3654 3655 // Create an empty vector loop, and prepare basic blocks for the runtime 3656 // checks. 3657 Loop *Lp = createVectorLoopSkeleton(""); 3658 3659 // Now, compare the new count to zero. If it is zero skip the vector loop and 3660 // jump to the scalar loop. This check also covers the case where the 3661 // backedge-taken count is uint##_max: adding one to it will overflow leading 3662 // to an incorrect trip count of zero. In this (rare) case we will also jump 3663 // to the scalar loop. 3664 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3665 3666 // Generate the code to check any assumptions that we've made for SCEV 3667 // expressions. 3668 emitSCEVChecks(Lp, LoopScalarPreHeader); 3669 3670 // Generate the code that checks in runtime if arrays overlap. We put the 3671 // checks into a separate block to make the more common case of few elements 3672 // faster. 3673 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3674 3675 // Some loops have a single integer induction variable, while other loops 3676 // don't. One example is c++ iterators that often have multiple pointer 3677 // induction variables. In the code below we also support a case where we 3678 // don't have a single induction variable. 3679 // 3680 // We try to obtain an induction variable from the original loop as hard 3681 // as possible. However if we don't find one that: 3682 // - is an integer 3683 // - counts from zero, stepping by one 3684 // - is the size of the widest induction variable type 3685 // then we create a new one. 
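  // Illustrative note (loop shapes assumed): a loop such as
  //   for (int64_t i = 0; i < n; ++i)
  // already provides such an induction variable, while a loop that only
  // advances pointers does not; in either case the vector loop below gets its
  // own counter running from 0 to the vector trip count in steps of VF * UF.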
3686 OldInduction = Legal->getPrimaryInduction(); 3687 Type *IdxTy = Legal->getWidestInductionType(); 3688 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3689 // The loop step is equal to the vectorization factor (num of SIMD elements) 3690 // times the unroll factor (num of SIMD instructions). 3691 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3692 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3693 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3694 Induction = 3695 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3696 getDebugLocFromInstOrOperands(OldInduction)); 3697 3698 // Emit phis for the new starting index of the scalar loop. 3699 createInductionResumeValues(Lp, CountRoundDown); 3700 3701 return completeLoopSkeleton(Lp, OrigLoopID); 3702 } 3703 3704 // Fix up external users of the induction variable. At this point, we are 3705 // in LCSSA form, with all external PHIs that use the IV having one input value, 3706 // coming from the remainder loop. We need those PHIs to also have a correct 3707 // value for the IV when arriving directly from the middle block. 3708 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3709 const InductionDescriptor &II, 3710 Value *CountRoundDown, Value *EndValue, 3711 BasicBlock *MiddleBlock) { 3712 // There are two kinds of external IV usages - those that use the value 3713 // computed in the last iteration (the PHI) and those that use the penultimate 3714 // value (the value that feeds into the phi from the loop latch). 3715 // We allow both, but they, obviously, have different values. 3716 3717 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3718 3719 DenseMap<Value *, Value *> MissingVals; 3720 3721 // An external user of the last iteration's value should see the value that 3722 // the remainder loop uses to initialize its own IV. 3723 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3724 for (User *U : PostInc->users()) { 3725 Instruction *UI = cast<Instruction>(U); 3726 if (!OrigLoop->contains(UI)) { 3727 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3728 MissingVals[UI] = EndValue; 3729 } 3730 } 3731 3732 // An external user of the penultimate value need to see EndValue - Step. 3733 // The simplest way to get this is to recompute it from the constituent SCEVs, 3734 // that is Start + (Step * (CRD - 1)). 3735 for (User *U : OrigPhi->users()) { 3736 auto *UI = cast<Instruction>(U); 3737 if (!OrigLoop->contains(UI)) { 3738 const DataLayout &DL = 3739 OrigLoop->getHeader()->getModule()->getDataLayout(); 3740 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3741 3742 IRBuilder<> B(MiddleBlock->getTerminator()); 3743 3744 // Fast-math-flags propagate from the original induction instruction. 3745 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3746 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3747 3748 Value *CountMinusOne = B.CreateSub( 3749 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3750 Value *CMO = 3751 !II.getStep()->getType()->isIntegerTy() 3752 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3753 II.getStep()->getType()) 3754 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3755 CMO->setName("cast.cmo"); 3756 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3757 Escape->setName("ind.escape"); 3758 MissingVals[UI] = Escape; 3759 } 3760 } 3761 3762 for (auto &I : MissingVals) { 3763 PHINode *PHI = cast<PHINode>(I.first); 3764 // One corner case we have to handle is two IVs "chasing" each-other, 3765 // that is %IV2 = phi [...], [ %IV1, %latch ] 3766 // In this case, if IV1 has an external use, we need to avoid adding both 3767 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3768 // don't already have an incoming value for the middle block. 3769 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3770 PHI->addIncoming(I.second, MiddleBlock); 3771 } 3772 } 3773 3774 namespace { 3775 3776 struct CSEDenseMapInfo { 3777 static bool canHandle(const Instruction *I) { 3778 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3779 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3780 } 3781 3782 static inline Instruction *getEmptyKey() { 3783 return DenseMapInfo<Instruction *>::getEmptyKey(); 3784 } 3785 3786 static inline Instruction *getTombstoneKey() { 3787 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3788 } 3789 3790 static unsigned getHashValue(const Instruction *I) { 3791 assert(canHandle(I) && "Unknown instruction!"); 3792 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3793 I->value_op_end())); 3794 } 3795 3796 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3797 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3798 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3799 return LHS == RHS; 3800 return LHS->isIdenticalTo(RHS); 3801 } 3802 }; 3803 3804 } // end anonymous namespace 3805 3806 ///Perform cse of induction variable instructions. 3807 static void cse(BasicBlock *BB) { 3808 // Perform simple cse. 3809 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3810 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3811 Instruction *In = &*I++; 3812 3813 if (!CSEDenseMapInfo::canHandle(In)) 3814 continue; 3815 3816 // Check if we can replace this instruction with any of the 3817 // visited instructions. 3818 if (Instruction *V = CSEMap.lookup(In)) { 3819 In->replaceAllUsesWith(V); 3820 In->eraseFromParent(); 3821 continue; 3822 } 3823 3824 CSEMap[In] = In; 3825 } 3826 } 3827 3828 InstructionCost 3829 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3830 bool &NeedToScalarize) const { 3831 Function *F = CI->getCalledFunction(); 3832 Type *ScalarRetTy = CI->getType(); 3833 SmallVector<Type *, 4> Tys, ScalarTys; 3834 for (auto &ArgOp : CI->arg_operands()) 3835 ScalarTys.push_back(ArgOp->getType()); 3836 3837 // Estimate cost of scalarized vector call. The source operands are assumed 3838 // to be vectors, so we need to extract individual elements from there, 3839 // execute VF scalar calls, and then gather the result into the vector return 3840 // value. 3841 InstructionCost ScalarCallCost = 3842 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3843 if (VF.isScalar()) 3844 return ScalarCallCost; 3845 3846 // Compute corresponding vector type for return value and arguments. 
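  // Sketch of the comparison performed below (the cost numbers themselves are
  // target-dependent): for VF = 4 the scalarized estimate is
  // 4 * ScalarCallCost + ScalarizationCost, and it is only replaced if the
  // VFDatabase provides a vector variant whose call cost is cheaper.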
3847 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3848 for (Type *ScalarTy : ScalarTys) 3849 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3850 3851 // Compute costs of unpacking argument values for the scalar calls and 3852 // packing the return values to a vector. 3853 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3854 3855 InstructionCost Cost = 3856 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3857 3858 // If we can't emit a vector call for this function, then the currently found 3859 // cost is the cost we need to return. 3860 NeedToScalarize = true; 3861 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3862 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3863 3864 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3865 return Cost; 3866 3867 // If the corresponding vector cost is cheaper, return its cost. 3868 InstructionCost VectorCallCost = 3869 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3870 if (VectorCallCost < Cost) { 3871 NeedToScalarize = false; 3872 Cost = VectorCallCost; 3873 } 3874 return Cost; 3875 } 3876 3877 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3878 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3879 return Elt; 3880 return VectorType::get(Elt, VF); 3881 } 3882 3883 InstructionCost 3884 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3885 ElementCount VF) const { 3886 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3887 assert(ID && "Expected intrinsic call!"); 3888 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3889 FastMathFlags FMF; 3890 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3891 FMF = FPMO->getFastMathFlags(); 3892 3893 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3894 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3895 SmallVector<Type *> ParamTys; 3896 std::transform(FTy->param_begin(), FTy->param_end(), 3897 std::back_inserter(ParamTys), 3898 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3899 3900 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3901 dyn_cast<IntrinsicInst>(CI)); 3902 return TTI.getIntrinsicInstrCost(CostAttrs, 3903 TargetTransformInfo::TCK_RecipThroughput); 3904 } 3905 3906 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3907 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3908 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3909 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3910 } 3911 3912 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3913 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3914 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3915 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3916 } 3917 3918 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3919 // For every instruction `I` in MinBWs, truncate the operands, create a 3920 // truncated version of `I` and reextend its result. InstCombine runs 3921 // later and will remove any ext/trunc pairs. 3922 SmallPtrSet<Value *, 4> Erased; 3923 for (const auto &KV : Cost->getMinimalBitwidths()) { 3924 // If the value wasn't vectorized, we must maintain the original scalar 3925 // type. The absence of the value from State indicates that it 3926 // wasn't vectorized. 
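    // Illustrative example (the bit width is assumed): if MinBWs records that
    // this value only needs 8 bits, an i32 binary operator is re-created
    // below on <VF x i8> operands and its result is zero-extended back to
    // <VF x i32>; InstCombine is expected to remove the leftover ext/trunc
    // pairs later.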
3927 VPValue *Def = State.Plan->getVPValue(KV.first); 3928 if (!State.hasAnyVectorValue(Def)) 3929 continue; 3930 for (unsigned Part = 0; Part < UF; ++Part) { 3931 Value *I = State.get(Def, Part); 3932 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3933 continue; 3934 Type *OriginalTy = I->getType(); 3935 Type *ScalarTruncatedTy = 3936 IntegerType::get(OriginalTy->getContext(), KV.second); 3937 auto *TruncatedTy = FixedVectorType::get( 3938 ScalarTruncatedTy, 3939 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3940 if (TruncatedTy == OriginalTy) 3941 continue; 3942 3943 IRBuilder<> B(cast<Instruction>(I)); 3944 auto ShrinkOperand = [&](Value *V) -> Value * { 3945 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3946 if (ZI->getSrcTy() == TruncatedTy) 3947 return ZI->getOperand(0); 3948 return B.CreateZExtOrTrunc(V, TruncatedTy); 3949 }; 3950 3951 // The actual instruction modification depends on the instruction type, 3952 // unfortunately. 3953 Value *NewI = nullptr; 3954 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3955 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3956 ShrinkOperand(BO->getOperand(1))); 3957 3958 // Any wrapping introduced by shrinking this operation shouldn't be 3959 // considered undefined behavior. So, we can't unconditionally copy 3960 // arithmetic wrapping flags to NewI. 3961 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3962 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3963 NewI = 3964 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3965 ShrinkOperand(CI->getOperand(1))); 3966 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3967 NewI = B.CreateSelect(SI->getCondition(), 3968 ShrinkOperand(SI->getTrueValue()), 3969 ShrinkOperand(SI->getFalseValue())); 3970 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3971 switch (CI->getOpcode()) { 3972 default: 3973 llvm_unreachable("Unhandled cast!"); 3974 case Instruction::Trunc: 3975 NewI = ShrinkOperand(CI->getOperand(0)); 3976 break; 3977 case Instruction::SExt: 3978 NewI = B.CreateSExtOrTrunc( 3979 CI->getOperand(0), 3980 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3981 break; 3982 case Instruction::ZExt: 3983 NewI = B.CreateZExtOrTrunc( 3984 CI->getOperand(0), 3985 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3986 break; 3987 } 3988 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3989 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3990 ->getNumElements(); 3991 auto *O0 = B.CreateZExtOrTrunc( 3992 SI->getOperand(0), 3993 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3994 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3995 ->getNumElements(); 3996 auto *O1 = B.CreateZExtOrTrunc( 3997 SI->getOperand(1), 3998 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3999 4000 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4001 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4002 // Don't do anything with the operands, just extend the result. 
4003 continue; 4004 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4005 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 4006 ->getNumElements(); 4007 auto *O0 = B.CreateZExtOrTrunc( 4008 IE->getOperand(0), 4009 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4010 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4011 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4012 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4013 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 4014 ->getNumElements(); 4015 auto *O0 = B.CreateZExtOrTrunc( 4016 EE->getOperand(0), 4017 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4018 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4019 } else { 4020 // If we don't know what to do, be conservative and don't do anything. 4021 continue; 4022 } 4023 4024 // Lastly, extend the result. 4025 NewI->takeName(cast<Instruction>(I)); 4026 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4027 I->replaceAllUsesWith(Res); 4028 cast<Instruction>(I)->eraseFromParent(); 4029 Erased.insert(I); 4030 State.reset(Def, Res, Part); 4031 } 4032 } 4033 4034 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4035 for (const auto &KV : Cost->getMinimalBitwidths()) { 4036 // If the value wasn't vectorized, we must maintain the original scalar 4037 // type. The absence of the value from State indicates that it 4038 // wasn't vectorized. 4039 VPValue *Def = State.Plan->getVPValue(KV.first); 4040 if (!State.hasAnyVectorValue(Def)) 4041 continue; 4042 for (unsigned Part = 0; Part < UF; ++Part) { 4043 Value *I = State.get(Def, Part); 4044 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4045 if (Inst && Inst->use_empty()) { 4046 Value *NewI = Inst->getOperand(0); 4047 Inst->eraseFromParent(); 4048 State.reset(Def, NewI, Part); 4049 } 4050 } 4051 } 4052 } 4053 4054 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4055 // Insert truncates and extends for any truncated instructions as hints to 4056 // InstCombine. 4057 if (VF.isVector()) 4058 truncateToMinimalBitwidths(State); 4059 4060 // Fix widened non-induction PHIs by setting up the PHI operands. 4061 if (OrigPHIsToFix.size()) { 4062 assert(EnableVPlanNativePath && 4063 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4064 fixNonInductionPHIs(State); 4065 } 4066 4067 // At this point every instruction in the original loop is widened to a 4068 // vector form. Now we need to fix the recurrences in the loop. These PHI 4069 // nodes are currently empty because we did not want to introduce cycles. 4070 // This is the second stage of vectorizing recurrences. 4071 fixCrossIterationPHIs(State); 4072 4073 // Forget the original basic block. 4074 PSE.getSE()->forgetLoop(OrigLoop); 4075 4076 // Fix-up external users of the induction variables. 4077 for (auto &Entry : Legal->getInductionVars()) 4078 fixupIVUsers(Entry.first, Entry.second, 4079 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4080 IVEndValues[Entry.first], LoopMiddleBlock); 4081 4082 fixLCSSAPHIs(State); 4083 for (Instruction *PI : PredicatedInstructions) 4084 sinkScalarOperands(&*PI); 4085 4086 // Remove redundant induction instructions. 4087 cse(LoopVectorBody); 4088 4089 // Set/update profile weights for the vector and remainder loops as original 4090 // loop iterations are now distributed among them. Note that original loop 4091 // represented by LoopScalarBody becomes remainder loop after vectorization. 
4092 //
4093 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4094 // end up with a slightly less accurate result, but that should be OK since
4095 // the profile is not inherently precise anyway. Note also that a possible
4096 // bypass of the vector code caused by legality checks is ignored, optimistically
4097 // assigning all the weight to the vector loop.
4098 //
4099 // For scalable vectorization we can't know at compile time how many iterations
4100 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4101 // vscale of '1'.
4102 setProfileInfoAfterUnrolling(
4103 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4104 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4105 }
4106
4107 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4108 // In order to support recurrences we need to be able to vectorize Phi nodes.
4109 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4110 // stage #2: We now need to fix the recurrences by adding incoming edges to
4111 // the currently empty PHI nodes. At this point every instruction in the
4112 // original loop is widened to a vector form so we can use them to construct
4113 // the incoming edges.
4114 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4115 for (VPRecipeBase &R : Header->phis()) {
4116 auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
4117 if (!PhiR)
4118 continue;
4119 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4120 if (PhiR->getRecurrenceDescriptor()) {
4121 fixReduction(PhiR, State);
4122 } else if (Legal->isFirstOrderRecurrence(OrigPhi))
4123 fixFirstOrderRecurrence(OrigPhi, State);
4124 }
4125 }
4126
4127 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4128 VPTransformState &State) {
4129 // This is the second phase of vectorizing first-order recurrences. An
4130 // overview of the transformation is described below. Suppose we have the
4131 // following loop.
4132 //
4133 // for (int i = 0; i < n; ++i)
4134 // b[i] = a[i] - a[i - 1];
4135 //
4136 // There is a first-order recurrence on "a". For this loop, the shorthand
4137 // scalar IR looks like:
4138 //
4139 // scalar.ph:
4140 // s_init = a[-1]
4141 // br scalar.body
4142 //
4143 // scalar.body:
4144 // i = phi [0, scalar.ph], [i+1, scalar.body]
4145 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4146 // s2 = a[i]
4147 // b[i] = s2 - s1
4148 // br cond, scalar.body, ...
4149 //
4150 // In this example, s1 is a recurrence because its value depends on the
4151 // previous iteration. In the first phase of vectorization, we created a
4152 // temporary value for s1. We now complete the vectorization and produce the
4153 // shorthand vector IR shown below (for VF = 4, UF = 1).
4154 //
4155 // vector.ph:
4156 // v_init = vector(..., ..., ..., a[-1])
4157 // br vector.body
4158 //
4159 // vector.body
4160 // i = phi [0, vector.ph], [i+4, vector.body]
4161 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4162 // v2 = a[i, i+1, i+2, i+3];
4163 // v3 = vector(v1(3), v2(0, 1, 2))
4164 // b[i, i+1, i+2, i+3] = v2 - v3
4165 // br cond, vector.body, middle.block
4166 //
4167 // middle.block:
4168 // x = v2(3)
4169 // br scalar.ph
4170 //
4171 // scalar.ph:
4172 // s_init = phi [x, middle.block], [a[-1], otherwise]
4173 // br scalar.body
4174 //
4175 // After the vector loop finishes executing, we extract the next value of
4176 // the recurrence (x) to use as the initial value in the scalar loop.
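//
// (A rough sketch, not a literal transcript of the generated IR: for UF > 1
// the same splice is applied once per unrolled part, and only the last part
// feeds the middle block. For a fixed VF of 4 the splice that forms v3 above
// is equivalent to the shuffle
//
//   v3 = shufflevector v1, v2, <3, 4, 5, 6>
//
// i.e. the last element of the previous vector followed by the first three
// elements of the current one.)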
4177 4178 // Get the original loop preheader and single loop latch. 4179 auto *Preheader = OrigLoop->getLoopPreheader(); 4180 auto *Latch = OrigLoop->getLoopLatch(); 4181 4182 // Get the initial and previous values of the scalar recurrence. 4183 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4184 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4185 4186 auto *IdxTy = Builder.getInt32Ty(); 4187 auto *One = ConstantInt::get(IdxTy, 1); 4188 4189 // Create a vector from the initial value. 4190 auto *VectorInit = ScalarInit; 4191 if (VF.isVector()) { 4192 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4193 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4194 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4195 VectorInit = Builder.CreateInsertElement( 4196 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), 4197 VectorInit, LastIdx, "vector.recur.init"); 4198 } 4199 4200 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4201 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4202 // We constructed a temporary phi node in the first phase of vectorization. 4203 // This phi node will eventually be deleted. 4204 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4205 4206 // Create a phi node for the new recurrence. The current value will either be 4207 // the initial value inserted into a vector or loop-varying vector value. 4208 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4209 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4210 4211 // Get the vectorized previous value of the last part UF - 1. It appears last 4212 // among all unrolled iterations, due to the order of their construction. 4213 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4214 4215 // Find and set the insertion point after the previous value if it is an 4216 // instruction. 4217 BasicBlock::iterator InsertPt; 4218 // Note that the previous value may have been constant-folded so it is not 4219 // guaranteed to be an instruction in the vector loop. 4220 // FIXME: Loop invariant values do not form recurrences. We should deal with 4221 // them earlier. 4222 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4223 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4224 else { 4225 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4226 if (isa<PHINode>(PreviousLastPart)) 4227 // If the previous value is a phi node, we should insert after all the phi 4228 // nodes in the block containing the PHI to avoid breaking basic block 4229 // verification. Note that the basic block may be different to 4230 // LoopVectorBody, in case we predicate the loop. 4231 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4232 else 4233 InsertPt = ++PreviousInst->getIterator(); 4234 } 4235 Builder.SetInsertPoint(&*InsertPt); 4236 4237 // The vector from which to take the initial value for the current iteration 4238 // (actual or unrolled). Initially, this is the vector phi node. 4239 Value *Incoming = VecPhi; 4240 4241 // Shuffle the current and previous vector and update the vector parts. 4242 for (unsigned Part = 0; Part < UF; ++Part) { 4243 Value *PreviousPart = State.get(PreviousDef, Part); 4244 Value *PhiPart = State.get(PhiDef, Part); 4245 auto *Shuffle = VF.isVector() 4246 ? 
Builder.CreateVectorSplice(Incoming, PreviousPart, -1) 4247 : Incoming; 4248 PhiPart->replaceAllUsesWith(Shuffle); 4249 cast<Instruction>(PhiPart)->eraseFromParent(); 4250 State.reset(PhiDef, Shuffle, Part); 4251 Incoming = PreviousPart; 4252 } 4253 4254 // Fix the latch value of the new recurrence in the vector loop. 4255 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4256 4257 // Extract the last vector element in the middle block. This will be the 4258 // initial value for the recurrence when jumping to the scalar loop. 4259 auto *ExtractForScalar = Incoming; 4260 if (VF.isVector()) { 4261 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4262 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4263 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4264 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4265 "vector.recur.extract"); 4266 } 4267 // Extract the second last element in the middle block if the 4268 // Phi is used outside the loop. We need to extract the phi itself 4269 // and not the last element (the phi update in the current iteration). This 4270 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4271 // when the scalar loop is not run at all. 4272 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4273 if (VF.isVector()) { 4274 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4275 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4276 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4277 Incoming, Idx, "vector.recur.extract.for.phi"); 4278 } else if (UF > 1) 4279 // When loop is unrolled without vectorizing, initialize 4280 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4281 // of `Incoming`. This is analogous to the vectorized case above: extracting 4282 // the second last element when VF > 1. 4283 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4284 4285 // Fix the initial value of the original recurrence in the scalar loop. 4286 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4287 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4288 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4289 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4290 Start->addIncoming(Incoming, BB); 4291 } 4292 4293 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4294 Phi->setName("scalar.recur"); 4295 4296 // Finally, fix users of the recurrence outside the loop. The users will need 4297 // either the last value of the scalar recurrence or the last value of the 4298 // vector recurrence we extracted in the middle block. Since the loop is in 4299 // LCSSA form, we just need to find all the phi nodes for the original scalar 4300 // recurrence in the exit block, and then add an edge for the middle block. 4301 // Note that LCSSA does not imply single entry when the original scalar loop 4302 // had multiple exiting edges (as we always run the last iteration in the 4303 // scalar epilogue); in that case, the exiting path through middle will be 4304 // dynamically dead and the value picked for the phi doesn't matter. 
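//
// For illustration (shorthand only, block and value names are made up), an
// exit-block phi such as
//
//   %s1.lcssa = phi [s1, scalar.body]
//
// simply gains an extra incoming edge:
//
//   %s1.lcssa = phi [s1, scalar.body], [x.prev, middle.block]
//
// where x.prev is the second-to-last element extracted above.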
4305 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4306 if (any_of(LCSSAPhi.incoming_values(),
4307 [Phi](Value *V) { return V == Phi; }))
4308 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4309 }
4310
4311 static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4312 return EnableStrictReductions && RdxDesc.isOrdered();
4313 }
4314
4315 void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
4316 VPTransformState &State) {
4317 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4318 // Get its reduction variable descriptor.
4319 assert(Legal->isReductionVariable(OrigPhi) &&
4320 "Unable to find the reduction variable");
4321 RecurrenceDescriptor RdxDesc = *PhiR->getRecurrenceDescriptor();
4322
4323 RecurKind RK = RdxDesc.getRecurrenceKind();
4324 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4325 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4326 setDebugLocFromInst(Builder, ReductionStartValue);
4327 bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi);
4328
4329 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4330 // This is the vector-clone of the value that leaves the loop.
4331 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4332
4333 // Wrap flags are in general invalid after vectorization, clear them.
4334 clearReductionWrapFlags(RdxDesc, State);
4335
4336 // Fix the vector-loop phi.
4337
4338 // Reductions do not have to start at zero. They can start with
4339 // any loop-invariant value.
4340 BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4341
4342 bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
4343 useOrderedReductions(RdxDesc);
4344
4345 for (unsigned Part = 0; Part < UF; ++Part) {
4346 if (IsOrdered && Part > 0)
4347 break;
4348 Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
4349 Value *Val = State.get(PhiR->getBackedgeValue(), Part);
4350 if (IsOrdered)
4351 Val = State.get(PhiR->getBackedgeValue(), UF - 1);
4352
4353 cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
4354 }
4355
4356 // Before each round, move the insertion point right between
4357 // the PHIs and the values we are going to write.
4358 // This allows us to write both PHINodes and the extractelement
4359 // instructions.
4360 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4361
4362 setDebugLocFromInst(Builder, LoopExitInst);
4363
4364 Type *PhiTy = OrigPhi->getType();
4365 // If tail is folded by masking, the vector value to leave the loop should be
4366 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4367 // instead of the former. For an inloop reduction the reduction will already
4368 // be predicated, and does not need to be handled here.
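//
// In shorthand, assuming an integer add reduction with VF = 4 (names are
// illustrative only):
//
//   %rdx.next = add <4 x i32> %vec.phi, %val
//   %rdx.sel  = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %vec.phi
//
// The select (%rdx.sel) is the value that must leave the loop and, if
// predicated reduction selects are preferred, also the value fed back into
// the vector phi from the latch.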
4369 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4370 for (unsigned Part = 0; Part < UF; ++Part) { 4371 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4372 Value *Sel = nullptr; 4373 for (User *U : VecLoopExitInst->users()) { 4374 if (isa<SelectInst>(U)) { 4375 assert(!Sel && "Reduction exit feeding two selects"); 4376 Sel = U; 4377 } else 4378 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4379 } 4380 assert(Sel && "Reduction exit feeds no select"); 4381 State.reset(LoopExitInstDef, Sel, Part); 4382 4383 // If the target can create a predicated operator for the reduction at no 4384 // extra cost in the loop (for example a predicated vadd), it can be 4385 // cheaper for the select to remain in the loop than be sunk out of it, 4386 // and so use the select value for the phi instead of the old 4387 // LoopExitValue. 4388 if (PreferPredicatedReductionSelect || 4389 TTI->preferPredicatedReductionSelect( 4390 RdxDesc.getOpcode(), PhiTy, 4391 TargetTransformInfo::ReductionFlags())) { 4392 auto *VecRdxPhi = 4393 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4394 VecRdxPhi->setIncomingValueForBlock( 4395 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4396 } 4397 } 4398 } 4399 4400 // If the vector reduction can be performed in a smaller type, we truncate 4401 // then extend the loop exit value to enable InstCombine to evaluate the 4402 // entire expression in the smaller type. 4403 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4404 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4405 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4406 Builder.SetInsertPoint( 4407 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4408 VectorParts RdxParts(UF); 4409 for (unsigned Part = 0; Part < UF; ++Part) { 4410 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4411 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4412 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4413 : Builder.CreateZExt(Trunc, VecTy); 4414 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4415 UI != RdxParts[Part]->user_end();) 4416 if (*UI != Trunc) { 4417 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4418 RdxParts[Part] = Extnd; 4419 } else { 4420 ++UI; 4421 } 4422 } 4423 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4424 for (unsigned Part = 0; Part < UF; ++Part) { 4425 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4426 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4427 } 4428 } 4429 4430 // Reduce all of the unrolled parts into a single vector. 4431 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4432 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4433 4434 // The middle block terminator has already been assigned a DebugLoc here (the 4435 // OrigLoop's single latch terminator). We want the whole middle block to 4436 // appear to execute on this line because: (a) it is all compiler generated, 4437 // (b) these instructions are always executed after evaluating the latch 4438 // conditional branch, and (c) other passes may add new predecessors which 4439 // terminate on this line. This is the easiest way to ensure we don't 4440 // accidentally cause an extra step back into the loop while debugging. 
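// As a rough sketch, for UF = 2 an integer add reduction combines its two
// parts in the middle block as
//
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
//
// before the final horizontal reduction (if any) is created below. Min/max
// recurrences use a cmp/select pair instead of a binary operator.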
4441 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4442 if (IsOrdered)
4443 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4444 else {
4445 // Floating-point operations should have some FMF to enable the reduction.
4446 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4447 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4448 for (unsigned Part = 1; Part < UF; ++Part) {
4449 Value *RdxPart = State.get(LoopExitInstDef, Part);
4450 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4451 ReducedPartRdx = Builder.CreateBinOp(
4452 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4453 } else {
4454 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4455 }
4456 }
4457 }
4458
4459 // Create the reduction after the loop. Note that inloop reductions create the
4460 // target reduction in the loop using a Reduction recipe.
4461 if (VF.isVector() && !IsInLoopReductionPhi) {
4462 ReducedPartRdx =
4463 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4464 // If the reduction can be performed in a smaller type, we need to extend
4465 // the reduction to the wider type before we branch to the original loop.
4466 if (PhiTy != RdxDesc.getRecurrenceType())
4467 ReducedPartRdx = RdxDesc.isSigned()
4468 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4469 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4470 }
4471
4472 // Create a phi node that merges control-flow from the backedge-taken check
4473 // block and the middle block.
4474 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4475 LoopScalarPreHeader->getTerminator());
4476 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4477 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4478 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4479
4480 // Now, we need to fix the users of the reduction variable
4481 // inside and outside of the scalar remainder loop.
4482
4483 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4484 // in the exit blocks. See comment on analogous loop in
4485 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4486 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4487 if (any_of(LCSSAPhi.incoming_values(),
4488 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4489 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4490
4491 // Fix the scalar loop reduction variable with the incoming reduction sum
4492 // from the vector body and from the backedge value.
4493 int IncomingEdgeBlockIdx =
4494 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4495 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4496 // Pick the other block.
4497 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 4498 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4499 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4500 } 4501 4502 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4503 VPTransformState &State) { 4504 RecurKind RK = RdxDesc.getRecurrenceKind(); 4505 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4506 return; 4507 4508 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4509 assert(LoopExitInstr && "null loop exit instruction"); 4510 SmallVector<Instruction *, 8> Worklist; 4511 SmallPtrSet<Instruction *, 8> Visited; 4512 Worklist.push_back(LoopExitInstr); 4513 Visited.insert(LoopExitInstr); 4514 4515 while (!Worklist.empty()) { 4516 Instruction *Cur = Worklist.pop_back_val(); 4517 if (isa<OverflowingBinaryOperator>(Cur)) 4518 for (unsigned Part = 0; Part < UF; ++Part) { 4519 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4520 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4521 } 4522 4523 for (User *U : Cur->users()) { 4524 Instruction *UI = cast<Instruction>(U); 4525 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4526 Visited.insert(UI).second) 4527 Worklist.push_back(UI); 4528 } 4529 } 4530 } 4531 4532 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4533 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4534 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4535 // Some phis were already hand updated by the reduction and recurrence 4536 // code above, leave them alone. 4537 continue; 4538 4539 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4540 // Non-instruction incoming values will have only one value. 4541 4542 VPLane Lane = VPLane::getFirstLane(); 4543 if (isa<Instruction>(IncomingValue) && 4544 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4545 VF)) 4546 Lane = VPLane::getLastLaneForVF(VF); 4547 4548 // Can be a loop invariant incoming value or the last scalar value to be 4549 // extracted from the vectorized loop. 4550 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4551 Value *lastIncomingValue = 4552 OrigLoop->isLoopInvariant(IncomingValue) 4553 ? IncomingValue 4554 : State.get(State.Plan->getVPValue(IncomingValue), 4555 VPIteration(UF - 1, Lane)); 4556 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4557 } 4558 } 4559 4560 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4561 // The basic block and loop containing the predicated instruction. 4562 auto *PredBB = PredInst->getParent(); 4563 auto *VectorLoop = LI->getLoopFor(PredBB); 4564 4565 // Initialize a worklist with the operands of the predicated instruction. 4566 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4567 4568 // Holds instructions that we need to analyze again. An instruction may be 4569 // reanalyzed if we don't yet know if we can sink it or not. 4570 SmallVector<Instruction *, 8> InstsToReanalyze; 4571 4572 // Returns true if a given use occurs in the predicated block. Phi nodes use 4573 // their operands in their corresponding predecessor blocks. 4574 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4575 auto *I = cast<Instruction>(U.getUser()); 4576 BasicBlock *BB = I->getParent(); 4577 if (auto *Phi = dyn_cast<PHINode>(I)) 4578 BB = Phi->getIncomingBlock( 4579 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4580 return BB == PredBB; 4581 }; 4582 4583 // Iteratively sink the scalarized operands of the predicated instruction 4584 // into the block we created for it. 
When an instruction is sunk, its
4585 // operands are then added to the worklist. The algorithm ends when a pass
4586 // through the worklist doesn't sink a single instruction.
4587 bool Changed;
4588 do {
4589 // Add the instructions that need to be reanalyzed to the worklist, and
4590 // reset the changed indicator.
4591 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4592 InstsToReanalyze.clear();
4593 Changed = false;
4594
4595 while (!Worklist.empty()) {
4596 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4597
4598 // We can't sink an instruction if it is a phi node, is not in the loop,
4599 // or may have side effects.
4600 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4601 I->mayHaveSideEffects())
4602 continue;
4603
4604 // If the instruction is already in PredBB, check if we can sink its
4605 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4606 // sinking the scalar instruction I, hence it appears in PredBB; but it
4607 // may have failed to sink I's operands (recursively), which we try
4608 // (again) here.
4609 if (I->getParent() == PredBB) {
4610 Worklist.insert(I->op_begin(), I->op_end());
4611 continue;
4612 }
4613
4614 // It's legal to sink the instruction if all its uses occur in the
4615 // predicated block. Otherwise, there's nothing to do yet, and we may
4616 // need to reanalyze the instruction.
4617 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4618 InstsToReanalyze.push_back(I);
4619 continue;
4620 }
4621
4622 // Move the instruction to the beginning of the predicated block, and add
4623 // its operands to the worklist.
4624 I->moveBefore(&*PredBB->getFirstInsertionPt());
4625 Worklist.insert(I->op_begin(), I->op_end());
4626
4627 // The sinking may have enabled other instructions to be sunk, so we will
4628 // need to iterate.
4629 Changed = true;
4630 }
4631 } while (Changed);
4632 }
4633
4634 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4635 for (PHINode *OrigPhi : OrigPHIsToFix) {
4636 VPWidenPHIRecipe *VPPhi =
4637 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4638 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4639 // Make sure the builder has a valid insert point.
4640 Builder.SetInsertPoint(NewPhi);
4641 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4642 VPValue *Inc = VPPhi->getIncomingValue(i);
4643 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4644 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4645 }
4646 }
4647 }
4648
4649 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4650 VPUser &Operands, unsigned UF,
4651 ElementCount VF, bool IsPtrLoopInvariant,
4652 SmallBitVector &IsIndexLoopInvariant,
4653 VPTransformState &State) {
4654 // Construct a vector GEP by widening the operands of the scalar GEP as
4655 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4656 // results in a vector of pointers when at least one operand of the GEP
4657 // is vector-typed. Thus, to keep the representation compact, we only use
4658 // vector-typed operands for loop-varying values.
4659
4660 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4661 // If we are vectorizing, but the GEP has only loop-invariant operands,
4662 // the GEP we build (by only using vector-typed operands for
4663 // loop-varying values) would be a scalar pointer.
Thus, to ensure we 4664 // produce a vector of pointers, we need to either arbitrarily pick an 4665 // operand to broadcast, or broadcast a clone of the original GEP. 4666 // Here, we broadcast a clone of the original. 4667 // 4668 // TODO: If at some point we decide to scalarize instructions having 4669 // loop-invariant operands, this special case will no longer be 4670 // required. We would add the scalarization decision to 4671 // collectLoopScalars() and teach getVectorValue() to broadcast 4672 // the lane-zero scalar value. 4673 auto *Clone = Builder.Insert(GEP->clone()); 4674 for (unsigned Part = 0; Part < UF; ++Part) { 4675 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4676 State.set(VPDef, EntryPart, Part); 4677 addMetadata(EntryPart, GEP); 4678 } 4679 } else { 4680 // If the GEP has at least one loop-varying operand, we are sure to 4681 // produce a vector of pointers. But if we are only unrolling, we want 4682 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4683 // produce with the code below will be scalar (if VF == 1) or vector 4684 // (otherwise). Note that for the unroll-only case, we still maintain 4685 // values in the vector mapping with initVector, as we do for other 4686 // instructions. 4687 for (unsigned Part = 0; Part < UF; ++Part) { 4688 // The pointer operand of the new GEP. If it's loop-invariant, we 4689 // won't broadcast it. 4690 auto *Ptr = IsPtrLoopInvariant 4691 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4692 : State.get(Operands.getOperand(0), Part); 4693 4694 // Collect all the indices for the new GEP. If any index is 4695 // loop-invariant, we won't broadcast it. 4696 SmallVector<Value *, 4> Indices; 4697 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4698 VPValue *Operand = Operands.getOperand(I); 4699 if (IsIndexLoopInvariant[I - 1]) 4700 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4701 else 4702 Indices.push_back(State.get(Operand, Part)); 4703 } 4704 4705 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4706 // but it should be a vector, otherwise. 4707 auto *NewGEP = 4708 GEP->isInBounds() 4709 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4710 Indices) 4711 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4712 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4713 "NewGEP is not a pointer vector"); 4714 State.set(VPDef, NewGEP, Part); 4715 addMetadata(NewGEP, GEP); 4716 } 4717 } 4718 } 4719 4720 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4721 RecurrenceDescriptor *RdxDesc, 4722 VPWidenPHIRecipe *PhiR, 4723 VPTransformState &State) { 4724 PHINode *P = cast<PHINode>(PN); 4725 if (EnableVPlanNativePath) { 4726 // Currently we enter here in the VPlan-native path for non-induction 4727 // PHIs where all control flow is uniform. We simply widen these PHIs. 4728 // Create a vector phi with no operands - the vector phi operands will be 4729 // set at the end of vector code generation. 4730 Type *VecTy = (State.VF.isScalar()) 4731 ? PN->getType() 4732 : VectorType::get(PN->getType(), State.VF); 4733 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4734 State.set(PhiR, VecPhi, 0); 4735 OrigPHIsToFix.push_back(P); 4736 4737 return; 4738 } 4739 4740 assert(PN->getParent() == OrigLoop->getHeader() && 4741 "Non-header phis should have been handled elsewhere"); 4742 4743 VPValue *StartVPV = PhiR->getStartValue(); 4744 Value *StartV = StartVPV ? 
StartVPV->getLiveInIRValue() : nullptr;
4745 // In order to support recurrences we need to be able to vectorize Phi nodes.
4746 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4747 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4748 // this value when we vectorize all of the instructions that use the PHI.
4749 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4750 Value *Iden = nullptr;
4751 bool ScalarPHI =
4752 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4753 Type *VecTy =
4754 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4755
4756 if (RdxDesc) {
4757 assert(Legal->isReductionVariable(P) && StartV &&
4758 "RdxDesc should only be set for reduction variables; in that case "
4759 "a StartV is also required");
4760 RecurKind RK = RdxDesc->getRecurrenceKind();
4761 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4762 // MinMax reductions have the start value as their identity.
4763 if (ScalarPHI) {
4764 Iden = StartV;
4765 } else {
4766 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4767 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4768 StartV = Iden =
4769 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4770 }
4771 } else {
4772 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4773 RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
4774 Iden = IdenC;
4775
4776 if (!ScalarPHI) {
4777 Iden = ConstantVector::getSplat(State.VF, IdenC);
4778 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4779 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4780 Constant *Zero = Builder.getInt32(0);
4781 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4782 }
4783 }
4784 }
4785
4786 bool IsOrdered = State.VF.isVector() &&
4787 Cost->isInLoopReduction(cast<PHINode>(PN)) &&
4788 useOrderedReductions(*RdxDesc);
4789
4790 for (unsigned Part = 0; Part < State.UF; ++Part) {
4791 // This is phase one of vectorizing PHIs.
4792 if (Part > 0 && IsOrdered)
4793 return;
4794 Value *EntryPart = PHINode::Create(
4795 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4796 State.set(PhiR, EntryPart, Part);
4797 if (StartV) {
4798 // Make sure to add the reduction start value only to the
4799 // first unroll part.
4800 Value *StartVal = (Part == 0) ? StartV : Iden;
4801 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4802 }
4803 }
4804 return;
4805 }
4806
4807 assert(!Legal->isReductionVariable(P) &&
4808 "reductions should be handled above");
4809
4810 setDebugLocFromInst(Builder, P);
4811
4812 // This PHINode must be an induction variable.
4813 // Make sure that we know about it.
4814 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4815
4816 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4817 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4818
4819 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4820 // which can be found from the original scalar operations.
4821 switch (II.getKind()) {
4822 case InductionDescriptor::IK_NoInduction:
4823 llvm_unreachable("Unknown induction");
4824 case InductionDescriptor::IK_IntInduction:
4825 case InductionDescriptor::IK_FpInduction:
4826 llvm_unreachable("Integer/fp induction is handled elsewhere.");
4827 case InductionDescriptor::IK_PtrInduction: {
4828 // Handle the pointer induction variable case.
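// As a rough sketch (names illustrative), the scalarized case below emits one
// transformed index and GEP per required lane and part, e.g.
//
//   %next.gep = getelementptr i8, i8* %start, i64 %idx
//
// whereas the non-scalarized case builds a single pointer phi plus one wide
// "vector.gep" per part.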
4829 assert(P->getType()->isPointerTy() && "Unexpected type."); 4830 4831 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4832 // This is the normalized GEP that starts counting at zero. 4833 Value *PtrInd = 4834 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4835 // Determine the number of scalars we need to generate for each unroll 4836 // iteration. If the instruction is uniform, we only need to generate the 4837 // first lane. Otherwise, we generate all VF values. 4838 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4839 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4840 4841 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4842 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4843 if (NeedsVectorIndex) { 4844 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4845 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4846 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4847 } 4848 4849 for (unsigned Part = 0; Part < UF; ++Part) { 4850 Value *PartStart = createStepForVF( 4851 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4852 4853 if (NeedsVectorIndex) { 4854 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4855 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4856 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4857 Value *SclrGep = 4858 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4859 SclrGep->setName("next.gep"); 4860 State.set(PhiR, SclrGep, Part); 4861 // We've cached the whole vector, which means we can support the 4862 // extraction of any lane. 4863 continue; 4864 } 4865 4866 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4867 Value *Idx = Builder.CreateAdd( 4868 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4869 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4870 Value *SclrGep = 4871 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4872 SclrGep->setName("next.gep"); 4873 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4874 } 4875 } 4876 return; 4877 } 4878 assert(isa<SCEVConstant>(II.getStep()) && 4879 "Induction step not a SCEV constant!"); 4880 Type *PhiType = II.getStep()->getType(); 4881 4882 // Build a pointer phi 4883 Value *ScalarStartValue = II.getStartValue(); 4884 Type *ScStValueType = ScalarStartValue->getType(); 4885 PHINode *NewPointerPhi = 4886 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4887 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4888 4889 // A pointer induction, performed by using a gep 4890 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4891 Instruction *InductionLoc = LoopLatch->getTerminator(); 4892 const SCEV *ScalarStep = II.getStep(); 4893 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4894 Value *ScalarStepValue = 4895 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4896 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4897 Value *NumUnrolledElems = 4898 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4899 Value *InductionGEP = GetElementPtrInst::Create( 4900 ScStValueType->getPointerElementType(), NewPointerPhi, 4901 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4902 InductionLoc); 4903 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4904 4905 // Create UF many actual address geps that use the pointer 4906 // phi as base and a vectorized version of the step value 4907 // (<step*0, ..., step*N>) as offset. 
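// For example, with a fixed VF of 4 and Part == 0 this emits roughly
// (illustrative shorthand; the element type comes from the phi's pointee
// type):
//
//   %off        = add <4 x i64> %part.start.splat, <i64 0, i64 1, i64 2, i64 3>
//   %vector.gep = getelementptr i8, i8* %pointer.phi,
//                               <4 x i64> mul (%off, %step.splat)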
4908 for (unsigned Part = 0; Part < State.UF; ++Part) { 4909 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4910 Value *StartOffsetScalar = 4911 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4912 Value *StartOffset = 4913 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4914 // Create a vector of consecutive numbers from zero to VF. 4915 StartOffset = 4916 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4917 4918 Value *GEP = Builder.CreateGEP( 4919 ScStValueType->getPointerElementType(), NewPointerPhi, 4920 Builder.CreateMul( 4921 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4922 "vector.gep")); 4923 State.set(PhiR, GEP, Part); 4924 } 4925 } 4926 } 4927 } 4928 4929 /// A helper function for checking whether an integer division-related 4930 /// instruction may divide by zero (in which case it must be predicated if 4931 /// executed conditionally in the scalar code). 4932 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4933 /// Non-zero divisors that are non compile-time constants will not be 4934 /// converted into multiplication, so we will still end up scalarizing 4935 /// the division, but can do so w/o predication. 4936 static bool mayDivideByZero(Instruction &I) { 4937 assert((I.getOpcode() == Instruction::UDiv || 4938 I.getOpcode() == Instruction::SDiv || 4939 I.getOpcode() == Instruction::URem || 4940 I.getOpcode() == Instruction::SRem) && 4941 "Unexpected instruction"); 4942 Value *Divisor = I.getOperand(1); 4943 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4944 return !CInt || CInt->isZero(); 4945 } 4946 4947 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4948 VPUser &User, 4949 VPTransformState &State) { 4950 switch (I.getOpcode()) { 4951 case Instruction::Call: 4952 case Instruction::Br: 4953 case Instruction::PHI: 4954 case Instruction::GetElementPtr: 4955 case Instruction::Select: 4956 llvm_unreachable("This instruction is handled by a different recipe."); 4957 case Instruction::UDiv: 4958 case Instruction::SDiv: 4959 case Instruction::SRem: 4960 case Instruction::URem: 4961 case Instruction::Add: 4962 case Instruction::FAdd: 4963 case Instruction::Sub: 4964 case Instruction::FSub: 4965 case Instruction::FNeg: 4966 case Instruction::Mul: 4967 case Instruction::FMul: 4968 case Instruction::FDiv: 4969 case Instruction::FRem: 4970 case Instruction::Shl: 4971 case Instruction::LShr: 4972 case Instruction::AShr: 4973 case Instruction::And: 4974 case Instruction::Or: 4975 case Instruction::Xor: { 4976 // Just widen unops and binops. 4977 setDebugLocFromInst(Builder, &I); 4978 4979 for (unsigned Part = 0; Part < UF; ++Part) { 4980 SmallVector<Value *, 2> Ops; 4981 for (VPValue *VPOp : User.operands()) 4982 Ops.push_back(State.get(VPOp, Part)); 4983 4984 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4985 4986 if (auto *VecOp = dyn_cast<Instruction>(V)) 4987 VecOp->copyIRFlags(&I); 4988 4989 // Use this vector value for all users of the original instruction. 4990 State.set(Def, V, Part); 4991 addMetadata(V, &I); 4992 } 4993 4994 break; 4995 } 4996 case Instruction::ICmp: 4997 case Instruction::FCmp: { 4998 // Widen compares. Generate vector compares. 
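// E.g. a scalar 'icmp slt i32 %a, %b' becomes, per unrolled part,
// 'icmp slt <VF x i32> %a.vec, %b.vec' (shorthand only).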
4999 bool FCmp = (I.getOpcode() == Instruction::FCmp); 5000 auto *Cmp = cast<CmpInst>(&I); 5001 setDebugLocFromInst(Builder, Cmp); 5002 for (unsigned Part = 0; Part < UF; ++Part) { 5003 Value *A = State.get(User.getOperand(0), Part); 5004 Value *B = State.get(User.getOperand(1), Part); 5005 Value *C = nullptr; 5006 if (FCmp) { 5007 // Propagate fast math flags. 5008 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 5009 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 5010 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 5011 } else { 5012 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 5013 } 5014 State.set(Def, C, Part); 5015 addMetadata(C, &I); 5016 } 5017 5018 break; 5019 } 5020 5021 case Instruction::ZExt: 5022 case Instruction::SExt: 5023 case Instruction::FPToUI: 5024 case Instruction::FPToSI: 5025 case Instruction::FPExt: 5026 case Instruction::PtrToInt: 5027 case Instruction::IntToPtr: 5028 case Instruction::SIToFP: 5029 case Instruction::UIToFP: 5030 case Instruction::Trunc: 5031 case Instruction::FPTrunc: 5032 case Instruction::BitCast: { 5033 auto *CI = cast<CastInst>(&I); 5034 setDebugLocFromInst(Builder, CI); 5035 5036 /// Vectorize casts. 5037 Type *DestTy = 5038 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 5039 5040 for (unsigned Part = 0; Part < UF; ++Part) { 5041 Value *A = State.get(User.getOperand(0), Part); 5042 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 5043 State.set(Def, Cast, Part); 5044 addMetadata(Cast, &I); 5045 } 5046 break; 5047 } 5048 default: 5049 // This instruction is not vectorized by simple widening. 5050 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 5051 llvm_unreachable("Unhandled instruction!"); 5052 } // end of switch. 5053 } 5054 5055 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 5056 VPUser &ArgOperands, 5057 VPTransformState &State) { 5058 assert(!isa<DbgInfoIntrinsic>(I) && 5059 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 5060 setDebugLocFromInst(Builder, &I); 5061 5062 Module *M = I.getParent()->getParent()->getParent(); 5063 auto *CI = cast<CallInst>(&I); 5064 5065 SmallVector<Type *, 4> Tys; 5066 for (Value *ArgOperand : CI->arg_operands()) 5067 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 5068 5069 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 5070 5071 // The flag shows whether we use Intrinsic or a usual Call for vectorized 5072 // version of the instruction. 5073 // Is it beneficial to perform intrinsic call compared to lib call? 5074 bool NeedToScalarize = false; 5075 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 5076 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 5077 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 5078 assert((UseVectorIntrinsic || !NeedToScalarize) && 5079 "Instruction should be scalarized elsewhere."); 5080 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 5081 "Either the intrinsic cost or vector call cost must be valid"); 5082 5083 for (unsigned Part = 0; Part < UF; ++Part) { 5084 SmallVector<Value *, 4> Args; 5085 for (auto &I : enumerate(ArgOperands.operands())) { 5086 // Some intrinsics have a scalar argument - don't replace it with a 5087 // vector. 
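// (For example, the integer exponent operand of llvm.powi stays scalar.)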
5088 Value *Arg; 5089 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5090 Arg = State.get(I.value(), Part); 5091 else 5092 Arg = State.get(I.value(), VPIteration(0, 0)); 5093 Args.push_back(Arg); 5094 } 5095 5096 Function *VectorF; 5097 if (UseVectorIntrinsic) { 5098 // Use vector version of the intrinsic. 5099 Type *TysForDecl[] = {CI->getType()}; 5100 if (VF.isVector()) 5101 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5102 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5103 assert(VectorF && "Can't retrieve vector intrinsic."); 5104 } else { 5105 // Use vector version of the function call. 5106 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5107 #ifndef NDEBUG 5108 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5109 "Can't create vector function."); 5110 #endif 5111 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5112 } 5113 SmallVector<OperandBundleDef, 1> OpBundles; 5114 CI->getOperandBundlesAsDefs(OpBundles); 5115 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5116 5117 if (isa<FPMathOperator>(V)) 5118 V->copyFastMathFlags(CI); 5119 5120 State.set(Def, V, Part); 5121 addMetadata(V, &I); 5122 } 5123 } 5124 5125 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5126 VPUser &Operands, 5127 bool InvariantCond, 5128 VPTransformState &State) { 5129 setDebugLocFromInst(Builder, &I); 5130 5131 // The condition can be loop invariant but still defined inside the 5132 // loop. This means that we can't just use the original 'cond' value. 5133 // We have to take the 'vectorized' value and pick the first lane. 5134 // Instcombine will make this a no-op. 5135 auto *InvarCond = InvariantCond 5136 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5137 : nullptr; 5138 5139 for (unsigned Part = 0; Part < UF; ++Part) { 5140 Value *Cond = 5141 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5142 Value *Op0 = State.get(Operands.getOperand(1), Part); 5143 Value *Op1 = State.get(Operands.getOperand(2), Part); 5144 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5145 State.set(VPDef, Sel, Part); 5146 addMetadata(Sel, &I); 5147 } 5148 } 5149 5150 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5151 // We should not collect Scalars more than once per VF. Right now, this 5152 // function is called from collectUniformsAndScalars(), which already does 5153 // this check. Collecting Scalars for VF=1 does not make any sense. 5154 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5155 "This function should not be visited twice for the same VF"); 5156 5157 SmallSetVector<Instruction *, 8> Worklist; 5158 5159 // These sets are used to seed the analysis with pointers used by memory 5160 // accesses that will remain scalar. 5161 SmallSetVector<Instruction *, 8> ScalarPtrs; 5162 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5163 auto *Latch = TheLoop->getLoopLatch(); 5164 5165 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5166 // The pointer operands of loads and stores will be scalar as long as the 5167 // memory access is not a gather or scatter operation. The value operand of a 5168 // store will remain scalar if the store is scalarized. 
5169 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5170 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5171 assert(WideningDecision != CM_Unknown &&
5172 "Widening decision should be ready at this moment");
5173 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5174 if (Ptr == Store->getValueOperand())
5175 return WideningDecision == CM_Scalarize;
5176 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5177 "Ptr is neither a value nor a pointer operand");
5178 return WideningDecision != CM_GatherScatter;
5179 };
5180
5181 // A helper that returns true if the given value is a bitcast or
5182 // getelementptr instruction contained in the loop.
5183 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5184 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5185 isa<GetElementPtrInst>(V)) &&
5186 !TheLoop->isLoopInvariant(V);
5187 };
5188
5189 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5190 if (!isa<PHINode>(Ptr) ||
5191 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5192 return false;
5193 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5194 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5195 return false;
5196 return isScalarUse(MemAccess, Ptr);
5197 };
5198
5199 // A helper that evaluates a memory access's use of a pointer. If the
5200 // pointer is actually the pointer induction of a loop, it is inserted
5201 // into Worklist. If the use will be a scalar use, and the
5202 // pointer is only used by memory accesses, we place the pointer in
5203 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5204 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5205 if (isScalarPtrInduction(MemAccess, Ptr)) {
5206 Worklist.insert(cast<Instruction>(Ptr));
5207 Instruction *Update = cast<Instruction>(
5208 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5209 Worklist.insert(Update);
5210 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5211 << "\n");
5212 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5213 << "\n");
5214 return;
5215 }
5216 // We only care about bitcast and getelementptr instructions contained in
5217 // the loop.
5218 if (!isLoopVaryingBitCastOrGEP(Ptr))
5219 return;
5220
5221 // If the pointer has already been identified as scalar (e.g., if it was
5222 // also identified as uniform), there's nothing to do.
5223 auto *I = cast<Instruction>(Ptr);
5224 if (Worklist.count(I))
5225 return;
5226
5227 // If the use of the pointer will be a scalar use, and all users of the
5228 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5229 // place the pointer in PossibleNonScalarPtrs.
5230 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5231 return isa<LoadInst>(U) || isa<StoreInst>(U);
5232 }))
5233 ScalarPtrs.insert(I);
5234 else
5235 PossibleNonScalarPtrs.insert(I);
5236 };
5237
5238 // We seed the scalars analysis with two classes of instructions: (1)
5239 // instructions marked uniform-after-vectorization and (2) bitcast,
5240 // getelementptr and (pointer) phi instructions used by memory accesses
5241 // requiring a scalar use.
5242 //
5243 // (1) Add to the worklist all instructions that have been identified as
5244 // uniform-after-vectorization.
5245 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5246 5247 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5248 // memory accesses requiring a scalar use. The pointer operands of loads and 5249 // stores will be scalar as long as the memory accesses is not a gather or 5250 // scatter operation. The value operand of a store will remain scalar if the 5251 // store is scalarized. 5252 for (auto *BB : TheLoop->blocks()) 5253 for (auto &I : *BB) { 5254 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5255 evaluatePtrUse(Load, Load->getPointerOperand()); 5256 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5257 evaluatePtrUse(Store, Store->getPointerOperand()); 5258 evaluatePtrUse(Store, Store->getValueOperand()); 5259 } 5260 } 5261 for (auto *I : ScalarPtrs) 5262 if (!PossibleNonScalarPtrs.count(I)) { 5263 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5264 Worklist.insert(I); 5265 } 5266 5267 // Insert the forced scalars. 5268 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5269 // induction variable when the PHI user is scalarized. 5270 auto ForcedScalar = ForcedScalars.find(VF); 5271 if (ForcedScalar != ForcedScalars.end()) 5272 for (auto *I : ForcedScalar->second) 5273 Worklist.insert(I); 5274 5275 // Expand the worklist by looking through any bitcasts and getelementptr 5276 // instructions we've already identified as scalar. This is similar to the 5277 // expansion step in collectLoopUniforms(); however, here we're only 5278 // expanding to include additional bitcasts and getelementptr instructions. 5279 unsigned Idx = 0; 5280 while (Idx != Worklist.size()) { 5281 Instruction *Dst = Worklist[Idx++]; 5282 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5283 continue; 5284 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5285 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5286 auto *J = cast<Instruction>(U); 5287 return !TheLoop->contains(J) || Worklist.count(J) || 5288 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5289 isScalarUse(J, Src)); 5290 })) { 5291 Worklist.insert(Src); 5292 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5293 } 5294 } 5295 5296 // An induction variable will remain scalar if all users of the induction 5297 // variable and induction variable update remain scalar. 5298 for (auto &Induction : Legal->getInductionVars()) { 5299 auto *Ind = Induction.first; 5300 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5301 5302 // If tail-folding is applied, the primary induction variable will be used 5303 // to feed a vector compare. 5304 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5305 continue; 5306 5307 // Determine if all users of the induction variable are scalar after 5308 // vectorization. 5309 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5310 auto *I = cast<Instruction>(U); 5311 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5312 }); 5313 if (!ScalarInd) 5314 continue; 5315 5316 // Determine if all users of the induction variable update instruction are 5317 // scalar after vectorization. 5318 auto ScalarIndUpdate = 5319 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5320 auto *I = cast<Instruction>(U); 5321 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5322 }); 5323 if (!ScalarIndUpdate) 5324 continue; 5325 5326 // The induction variable and its update instruction will remain scalar. 
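// For example, in 'for (i = 0; i < n; ++i) a[i] = x;' where the store is
// scalarized, both the phi for i and its increment typically stay scalar
// (an illustration, not a guarantee).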
5327 Worklist.insert(Ind); 5328 Worklist.insert(IndUpdate); 5329 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5330 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5331 << "\n"); 5332 } 5333 5334 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5335 } 5336 5337 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5338 if (!blockNeedsPredication(I->getParent())) 5339 return false; 5340 switch(I->getOpcode()) { 5341 default: 5342 break; 5343 case Instruction::Load: 5344 case Instruction::Store: { 5345 if (!Legal->isMaskRequired(I)) 5346 return false; 5347 auto *Ptr = getLoadStorePointerOperand(I); 5348 auto *Ty = getLoadStoreType(I); 5349 const Align Alignment = getLoadStoreAlignment(I); 5350 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5351 TTI.isLegalMaskedGather(Ty, Alignment)) 5352 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5353 TTI.isLegalMaskedScatter(Ty, Alignment)); 5354 } 5355 case Instruction::UDiv: 5356 case Instruction::SDiv: 5357 case Instruction::SRem: 5358 case Instruction::URem: 5359 return mayDivideByZero(*I); 5360 } 5361 return false; 5362 } 5363 5364 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5365 Instruction *I, ElementCount VF) { 5366 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5367 assert(getWideningDecision(I, VF) == CM_Unknown && 5368 "Decision should not be set yet."); 5369 auto *Group = getInterleavedAccessGroup(I); 5370 assert(Group && "Must have a group."); 5371 5372 // If the instruction's allocated size doesn't equal it's type size, it 5373 // requires padding and will be scalarized. 5374 auto &DL = I->getModule()->getDataLayout(); 5375 auto *ScalarTy = getLoadStoreType(I); 5376 if (hasIrregularType(ScalarTy, DL)) 5377 return false; 5378 5379 // Check if masking is required. 5380 // A Group may need masking for one of two reasons: it resides in a block that 5381 // needs predication, or it was decided to use masking to deal with gaps. 5382 bool PredicatedAccessRequiresMasking = 5383 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5384 bool AccessWithGapsRequiresMasking = 5385 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5386 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5387 return true; 5388 5389 // If masked interleaving is required, we expect that the user/target had 5390 // enabled it, because otherwise it either wouldn't have been created or 5391 // it should have been invalidated by the CostModel. 5392 assert(useMaskedInterleavedAccesses(TTI) && 5393 "Masked interleave-groups for predicated accesses are not enabled."); 5394 5395 auto *Ty = getLoadStoreType(I); 5396 const Align Alignment = getLoadStoreAlignment(I); 5397 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5398 : TTI.isLegalMaskedStore(Ty, Alignment); 5399 } 5400 5401 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5402 Instruction *I, ElementCount VF) { 5403 // Get and ensure we have a valid memory instruction. 5404 LoadInst *LI = dyn_cast<LoadInst>(I); 5405 StoreInst *SI = dyn_cast<StoreInst>(I); 5406 assert((LI || SI) && "Invalid memory instruction"); 5407 5408 auto *Ptr = getLoadStorePointerOperand(I); 5409 5410 // In order to be widened, the pointer should be consecutive, first of all. 
5411 if (!Legal->isConsecutivePtr(Ptr))
5412 return false;
5413
5414 // If the instruction is a store located in a predicated block, it will be
5415 // scalarized.
5416 if (isScalarWithPredication(I))
5417 return false;
5418
5419 // If the instruction's allocated size doesn't equal its type size, it
5420 // requires padding and will be scalarized.
5421 auto &DL = I->getModule()->getDataLayout();
5422 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5423 if (hasIrregularType(ScalarTy, DL))
5424 return false;
5425
5426 return true;
5427 }
5428
5429 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5430 // We should not collect Uniforms more than once per VF. Right now,
5431 // this function is called from collectUniformsAndScalars(), which
5432 // already does this check. Collecting Uniforms for VF=1 does not make any
5433 // sense.
5434
5435 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5436 "This function should not be visited twice for the same VF");
5437
5438 // Initialize the entry for this VF. Even if we do not find any uniform
5439 // value, we will not analyze it again: Uniforms.count(VF) will return 1.
5440 Uniforms[VF].clear();
5441
5442 // We now know that the loop is vectorizable!
5443 // Collect instructions inside the loop that will remain uniform after
5444 // vectorization.
5445
5446 // Global values, params and instructions outside of the current loop are out
5447 // of scope.
5448 auto isOutOfScope = [&](Value *V) -> bool {
5449 Instruction *I = dyn_cast<Instruction>(V);
5450 return (!I || !TheLoop->contains(I));
5451 };
5452
5453 SetVector<Instruction *> Worklist;
5454 BasicBlock *Latch = TheLoop->getLoopLatch();
5455
5456 // Instructions that are scalar with predication must not be considered
5457 // uniform after vectorization, because that would create an erroneous
5458 // replicating region where only a single instance out of VF should be formed.
5459 // TODO: optimize such seldom cases if found important, see PR40816.
5460 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5461 if (isOutOfScope(I)) {
5462 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5463 << *I << "\n");
5464 return;
5465 }
5466 if (isScalarWithPredication(I)) {
5467 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5468 << *I << "\n");
5469 return;
5470 }
5471 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5472 Worklist.insert(I);
5473 };
5474
5475 // Start with the conditional branch. If the branch condition is an
5476 // instruction contained in the loop that is only used by the branch, it is
5477 // uniform.
5478 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5479 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5480 addToWorklistIfAllowed(Cmp);
5481
5482 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5483 InstWidening WideningDecision = getWideningDecision(I, VF);
5484 assert(WideningDecision != CM_Unknown &&
5485 "Widening decision should be ready at this moment");
5486
5487 // A uniform memory op is itself uniform. We exclude uniform stores
5488 // here as they demand the last lane, not the first one.
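// (E.g. a load from a loop-invariant address demands only lane 0 and is
// scalarized rather than widened.)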
5489 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5490 assert(WideningDecision == CM_Scalarize); 5491 return true; 5492 } 5493 5494 return (WideningDecision == CM_Widen || 5495 WideningDecision == CM_Widen_Reverse || 5496 WideningDecision == CM_Interleave); 5497 }; 5498 5499 5500 // Returns true if Ptr is the pointer operand of a memory access instruction 5501 // I, and I is known to not require scalarization. 5502 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5503 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5504 }; 5505 5506 // Holds a list of values which are known to have at least one uniform use. 5507 // Note that there may be other uses which aren't uniform. A "uniform use" 5508 // here is something which only demands lane 0 of the unrolled iterations; 5509 // it does not imply that all lanes produce the same value (e.g. this is not 5510 // the usual meaning of uniform) 5511 SetVector<Value *> HasUniformUse; 5512 5513 // Scan the loop for instructions which are either a) known to have only 5514 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5515 for (auto *BB : TheLoop->blocks()) 5516 for (auto &I : *BB) { 5517 // If there's no pointer operand, there's nothing to do. 5518 auto *Ptr = getLoadStorePointerOperand(&I); 5519 if (!Ptr) 5520 continue; 5521 5522 // A uniform memory op is itself uniform. We exclude uniform stores 5523 // here as they demand the last lane, not the first one. 5524 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5525 addToWorklistIfAllowed(&I); 5526 5527 if (isUniformDecision(&I, VF)) { 5528 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5529 HasUniformUse.insert(Ptr); 5530 } 5531 } 5532 5533 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5534 // demanding) users. Since loops are assumed to be in LCSSA form, this 5535 // disallows uses outside the loop as well. 5536 for (auto *V : HasUniformUse) { 5537 if (isOutOfScope(V)) 5538 continue; 5539 auto *I = cast<Instruction>(V); 5540 auto UsersAreMemAccesses = 5541 llvm::all_of(I->users(), [&](User *U) -> bool { 5542 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5543 }); 5544 if (UsersAreMemAccesses) 5545 addToWorklistIfAllowed(I); 5546 } 5547 5548 // Expand Worklist in topological order: whenever a new instruction 5549 // is added , its users should be already inside Worklist. It ensures 5550 // a uniform instruction will only be used by uniform instructions. 5551 unsigned idx = 0; 5552 while (idx != Worklist.size()) { 5553 Instruction *I = Worklist[idx++]; 5554 5555 for (auto OV : I->operand_values()) { 5556 // isOutOfScope operands cannot be uniform instructions. 5557 if (isOutOfScope(OV)) 5558 continue; 5559 // First order recurrence Phi's should typically be considered 5560 // non-uniform. 5561 auto *OP = dyn_cast<PHINode>(OV); 5562 if (OP && Legal->isFirstOrderRecurrence(OP)) 5563 continue; 5564 // If all the users of the operand are uniform, then add the 5565 // operand into the uniform worklist. 5566 auto *OI = cast<Instruction>(OV); 5567 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5568 auto *J = cast<Instruction>(U); 5569 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5570 })) 5571 addToWorklistIfAllowed(OI); 5572 } 5573 } 5574 5575 // For an instruction to be added into Worklist above, all its users inside 5576 // the loop should also be in Worklist. 
However, this condition cannot be 5577 // true for phi nodes that form a cyclic dependence. We must process phi 5578 // nodes separately. An induction variable will remain uniform if all users 5579 // of the induction variable and induction variable update remain uniform. 5580 // The code below handles both pointer and non-pointer induction variables. 5581 for (auto &Induction : Legal->getInductionVars()) { 5582 auto *Ind = Induction.first; 5583 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5584 5585 // Determine if all users of the induction variable are uniform after 5586 // vectorization. 5587 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5588 auto *I = cast<Instruction>(U); 5589 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5590 isVectorizedMemAccessUse(I, Ind); 5591 }); 5592 if (!UniformInd) 5593 continue; 5594 5595 // Determine if all users of the induction variable update instruction are 5596 // uniform after vectorization. 5597 auto UniformIndUpdate = 5598 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5599 auto *I = cast<Instruction>(U); 5600 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5601 isVectorizedMemAccessUse(I, IndUpdate); 5602 }); 5603 if (!UniformIndUpdate) 5604 continue; 5605 5606 // The induction variable and its update instruction will remain uniform. 5607 addToWorklistIfAllowed(Ind); 5608 addToWorklistIfAllowed(IndUpdate); 5609 } 5610 5611 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5612 } 5613 5614 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5615 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5616 5617 if (Legal->getRuntimePointerChecking()->Need) { 5618 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5619 "runtime pointer checks needed. Enable vectorization of this " 5620 "loop with '#pragma clang loop vectorize(enable)' when " 5621 "compiling with -Os/-Oz", 5622 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5623 return true; 5624 } 5625 5626 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5627 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5628 "runtime SCEV checks needed. Enable vectorization of this " 5629 "loop with '#pragma clang loop vectorize(enable)' when " 5630 "compiling with -Os/-Oz", 5631 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5632 return true; 5633 } 5634 5635 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5636 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5637 reportVectorizationFailure("Runtime stride check for small trip count", 5638 "runtime stride == 1 checks needed. 
Enable vectorization of " 5639 "this loop without such check by compiling with -Os/-Oz", 5640 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5641 return true; 5642 } 5643 5644 return false; 5645 } 5646 5647 ElementCount 5648 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5649 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5650 reportVectorizationInfo( 5651 "Disabling scalable vectorization, because target does not " 5652 "support scalable vectors.", 5653 "ScalableVectorsUnsupported", ORE, TheLoop); 5654 return ElementCount::getScalable(0); 5655 } 5656 5657 if (Hints->isScalableVectorizationDisabled()) { 5658 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5659 "ScalableVectorizationDisabled", ORE, TheLoop); 5660 return ElementCount::getScalable(0); 5661 } 5662 5663 auto MaxScalableVF = ElementCount::getScalable( 5664 std::numeric_limits<ElementCount::ScalarTy>::max()); 5665 5666 // Disable scalable vectorization if the loop contains unsupported reductions. 5667 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5668 // FIXME: While for scalable vectors this is currently sufficient, this should 5669 // be replaced by a more detailed mechanism that filters out specific VFs, 5670 // instead of invalidating vectorization for a whole set of VFs based on the 5671 // MaxVF. 5672 if (!canVectorizeReductions(MaxScalableVF)) { 5673 reportVectorizationInfo( 5674 "Scalable vectorization not supported for the reduction " 5675 "operations found in this loop.", 5676 "ScalableVFUnfeasible", ORE, TheLoop); 5677 return ElementCount::getScalable(0); 5678 } 5679 5680 if (Legal->isSafeForAnyVectorWidth()) 5681 return MaxScalableVF; 5682 5683 // Limit MaxScalableVF by the maximum safe dependence distance. 5684 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5685 MaxScalableVF = ElementCount::getScalable( 5686 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5687 if (!MaxScalableVF) 5688 reportVectorizationInfo( 5689 "Max legal vector width too small, scalable vectorization " 5690 "unfeasible.", 5691 "ScalableVFUnfeasible", ORE, TheLoop); 5692 5693 return MaxScalableVF; 5694 } 5695 5696 FixedScalableVFPair 5697 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5698 ElementCount UserVF) { 5699 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5700 unsigned SmallestType, WidestType; 5701 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5702 5703 // Get the maximum safe dependence distance in bits computed by LAA. 5704 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5705 // the memory accesses that is most restrictive (involved in the smallest 5706 // dependence distance). 5707 unsigned MaxSafeElements = 5708 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5709 5710 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5711 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5712 5713 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5714 << ".\n"); 5715 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5716 << ".\n"); 5717 5718 // First analyze the UserVF, fall back if the UserVF should be ignored. 5719 if (UserVF) { 5720 auto MaxSafeUserVF = 5721 UserVF.isScalable() ? 
            MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
      return UserVF;

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                      << " is unsafe. Ignoring scalable UserVF.\n");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                        TheLoop->getStartLoc(),
                                        TheLoop->getHeader())
             << "User-specified vectorization factor "
             << ore::NV("UserVectorizationFactor", UserVF)
             << " is unsafe. Ignoring the hint to let the compiler pick a "
                "suitable VF.";
    });
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
                                           WidestType, MaxSafeFixedVF))
    Result.FixedVF = MaxVF;

  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
                                           WidestType, MaxSafeScalableVF))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}

FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this since it's still likely to be
    // dynamically uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
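  // (Illustrative arithmetic with assumed values, not taken from the source:
  // for a known trip count of 128 with MaxFixedVF = 8 and UserIC = 0, the
  // check below computes 128 % 8 == 0, so no scalar tail remains and
  // MaxFactors is returned without folding the tail by masking.)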
5862 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5863 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5864 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5865 "MaxFixedVF must be a power of 2"); 5866 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5867 : MaxFixedVF.getFixedValue(); 5868 ScalarEvolution *SE = PSE.getSE(); 5869 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5870 const SCEV *ExitCount = SE->getAddExpr( 5871 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5872 const SCEV *Rem = SE->getURemExpr( 5873 SE->applyLoopGuards(ExitCount, TheLoop), 5874 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5875 if (Rem->isZero()) { 5876 // Accept MaxFixedVF if we do not have a tail. 5877 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5878 return MaxFactors; 5879 } 5880 } 5881 5882 // If we don't know the precise trip count, or if the trip count that we 5883 // found modulo the vectorization factor is not zero, try to fold the tail 5884 // by masking. 5885 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5886 if (Legal->prepareToFoldTailByMasking()) { 5887 FoldTailByMasking = true; 5888 return MaxFactors; 5889 } 5890 5891 // If there was a tail-folding hint/switch, but we can't fold the tail by 5892 // masking, fallback to a vectorization with a scalar epilogue. 5893 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5894 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5895 "scalar epilogue instead.\n"); 5896 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5897 return MaxFactors; 5898 } 5899 5900 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5901 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5902 return FixedScalableVFPair::getNone(); 5903 } 5904 5905 if (TC == 0) { 5906 reportVectorizationFailure( 5907 "Unable to calculate the loop count due to complex control flow", 5908 "unable to calculate the loop count due to complex control flow", 5909 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5910 return FixedScalableVFPair::getNone(); 5911 } 5912 5913 reportVectorizationFailure( 5914 "Cannot optimize for size and vectorize at the same time.", 5915 "cannot optimize for size and vectorize at the same time. " 5916 "Enable vectorization of this loop with '#pragma clang loop " 5917 "vectorize(enable)' when compiling with -Os/-Oz", 5918 "NoTailLoopWithOptForSize", ORE, TheLoop); 5919 return FixedScalableVFPair::getNone(); 5920 } 5921 5922 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5923 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5924 const ElementCount &MaxSafeVF) { 5925 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5926 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5927 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5928 : TargetTransformInfo::RGK_FixedWidthVector); 5929 5930 // Convenience function to return the minimum of two ElementCounts. 5931 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5932 assert((LHS.isScalable() == RHS.isScalable()) && 5933 "Scalable flags must match"); 5934 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5935 }; 5936 5937 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5938 // Note that both WidestRegister and WidestType may not be a powers of 2. 
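  // (Illustrative arithmetic with assumed values: a 256-bit widest register
  // and a 32-bit widest type give PowerOf2Floor(256 / 32) = 8 lanes; if the
  // dependence distance only allows 4 safe elements, the MinVF clamp below
  // reduces the result to 4.)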
5939 auto MaxVectorElementCount = ElementCount::get( 5940 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5941 ComputeScalableMaxVF); 5942 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5943 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5944 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5945 5946 if (!MaxVectorElementCount) { 5947 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5948 return ElementCount::getFixed(1); 5949 } 5950 5951 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5952 if (ConstTripCount && 5953 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5954 isPowerOf2_32(ConstTripCount)) { 5955 // We need to clamp the VF to be the ConstTripCount. There is no point in 5956 // choosing a higher viable VF as done in the loop below. If 5957 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5958 // the TC is less than or equal to the known number of lanes. 5959 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5960 << ConstTripCount << "\n"); 5961 return TripCountEC; 5962 } 5963 5964 ElementCount MaxVF = MaxVectorElementCount; 5965 if (TTI.shouldMaximizeVectorBandwidth() || 5966 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5967 auto MaxVectorElementCountMaxBW = ElementCount::get( 5968 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5969 ComputeScalableMaxVF); 5970 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5971 5972 // Collect all viable vectorization factors larger than the default MaxVF 5973 // (i.e. MaxVectorElementCount). 5974 SmallVector<ElementCount, 8> VFs; 5975 for (ElementCount VS = MaxVectorElementCount * 2; 5976 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5977 VFs.push_back(VS); 5978 5979 // For each VF calculate its register usage. 5980 auto RUs = calculateRegisterUsage(VFs); 5981 5982 // Select the largest VF which doesn't require more registers than existing 5983 // ones. 5984 for (int i = RUs.size() - 1; i >= 0; --i) { 5985 bool Selected = true; 5986 for (auto &pair : RUs[i].MaxLocalUsers) { 5987 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5988 if (pair.second > TargetNumRegisters) 5989 Selected = false; 5990 } 5991 if (Selected) { 5992 MaxVF = VFs[i]; 5993 break; 5994 } 5995 } 5996 if (ElementCount MinVF = 5997 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5998 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5999 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 6000 << ") with target's minimum: " << MinVF << '\n'); 6001 MaxVF = MinVF; 6002 } 6003 } 6004 } 6005 return MaxVF; 6006 } 6007 6008 bool LoopVectorizationCostModel::isMoreProfitable( 6009 const VectorizationFactor &A, const VectorizationFactor &B) const { 6010 InstructionCost::CostType CostA = *A.Cost.getValue(); 6011 InstructionCost::CostType CostB = *B.Cost.getValue(); 6012 6013 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 6014 6015 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 6016 MaxTripCount) { 6017 // If we are folding the tail and the trip count is a known (possibly small) 6018 // constant, the trip count will be rounded up to an integer number of 6019 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6020 // which we compare directly. 
    // When not folding the tail, the total cost will
    // be PerIterationCost*floor(TC/VF) + the scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the trip
    // count as is done here.
    int64_t RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
    int64_t RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
    return RTCostA < RTCostB;
  }

  // When set to preferred, for now assume vscale may be larger than 1, so
  // that scalable vectorization is slightly favorable over fixed-width
  // vectorization.
  if (Hints->isScalableVectorizationPreferred())
    if (A.Width.isScalable() && !B.Width.isScalable())
      return (CostA * B.Width.getKnownMinValue()) <=
             (CostB * A.Width.getKnownMinValue());

  // To avoid the need for FP division:
  //      (CostA / A.Width) < (CostB / B.Width)
  // <=>  (CostA * B.Width) < (CostB * A.Width)
  return (CostA * B.Width.getKnownMinValue()) <
         (CostB * A.Width.getKnownMinValue());
}

VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
    const ElementCountSet &VFCandidates) {
  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
         "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && VFCandidates.size() > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
  }

  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(i);

    assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
    VectorizationFactor Candidate(i, C.first);
    LLVM_DEBUG(
        dbgs() << "LV: Vector loop of width " << i << " costs: "
               << (*Candidate.Cost.getValue() /
                   Candidate.Width.getKnownMinValue())
               << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
               << ".\n");

    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable, add it to the ProfitableVFs list.
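    // (Illustrative arithmetic with assumed costs: a candidate with cost 20
    // at VF = 4 beats the scalar plan with cost 8 at VF = 1 because
    // 20 * 1 < 8 * 4, i.e. 5 per lane versus 8 per lane; isMoreProfitable
    // uses that cross-multiplication to avoid floating-point division.)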
6090 if (isMoreProfitable(Candidate, ScalarCost)) 6091 ProfitableVFs.push_back(Candidate); 6092 6093 if (isMoreProfitable(Candidate, ChosenFactor)) 6094 ChosenFactor = Candidate; 6095 } 6096 6097 if (!EnableCondStoresVectorization && NumPredStores) { 6098 reportVectorizationFailure("There are conditional stores.", 6099 "store that is conditionally executed prevents vectorization", 6100 "ConditionalStore", ORE, TheLoop); 6101 ChosenFactor = ScalarCost; 6102 } 6103 6104 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6105 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue()) 6106 dbgs() 6107 << "LV: Vectorization seems to be not beneficial, " 6108 << "but was forced by a user.\n"); 6109 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6110 return ChosenFactor; 6111 } 6112 6113 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6114 const Loop &L, ElementCount VF) const { 6115 // Cross iteration phis such as reductions need special handling and are 6116 // currently unsupported. 6117 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6118 return Legal->isFirstOrderRecurrence(&Phi) || 6119 Legal->isReductionVariable(&Phi); 6120 })) 6121 return false; 6122 6123 // Phis with uses outside of the loop require special handling and are 6124 // currently unsupported. 6125 for (auto &Entry : Legal->getInductionVars()) { 6126 // Look for uses of the value of the induction at the last iteration. 6127 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6128 for (User *U : PostInc->users()) 6129 if (!L.contains(cast<Instruction>(U))) 6130 return false; 6131 // Look for uses of penultimate value of the induction. 6132 for (User *U : Entry.first->users()) 6133 if (!L.contains(cast<Instruction>(U))) 6134 return false; 6135 } 6136 6137 // Induction variables that are widened require special handling that is 6138 // currently not supported. 6139 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6140 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6141 this->isProfitableToScalarize(Entry.first, VF)); 6142 })) 6143 return false; 6144 6145 return true; 6146 } 6147 6148 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6149 const ElementCount VF) const { 6150 // FIXME: We need a much better cost-model to take different parameters such 6151 // as register pressure, code size increase and cost of extra branches into 6152 // account. For now we apply a very crude heuristic and only consider loops 6153 // with vectorization factors larger than a certain value. 6154 // We also consider epilogue vectorization unprofitable for targets that don't 6155 // consider interleaving beneficial (eg. MVE). 
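  // (Illustrative example, assuming the default EpilogueVectorizationMinVF
  // threshold of 16: a main loop VF of 16 on a target whose maximum
  // interleave factor is greater than 1 is considered profitable here, while
  // a main VF of 8 is not.)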
6156 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6157 return false; 6158 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6159 return true; 6160 return false; 6161 } 6162 6163 VectorizationFactor 6164 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6165 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6166 VectorizationFactor Result = VectorizationFactor::Disabled(); 6167 if (!EnableEpilogueVectorization) { 6168 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6169 return Result; 6170 } 6171 6172 if (!isScalarEpilogueAllowed()) { 6173 LLVM_DEBUG( 6174 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6175 "allowed.\n";); 6176 return Result; 6177 } 6178 6179 // FIXME: This can be fixed for scalable vectors later, because at this stage 6180 // the LoopVectorizer will only consider vectorizing a loop with scalable 6181 // vectors when the loop has a hint to enable vectorization for a given VF. 6182 if (MainLoopVF.isScalable()) { 6183 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6184 "yet supported.\n"); 6185 return Result; 6186 } 6187 6188 // Not really a cost consideration, but check for unsupported cases here to 6189 // simplify the logic. 6190 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6191 LLVM_DEBUG( 6192 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6193 "not a supported candidate.\n";); 6194 return Result; 6195 } 6196 6197 if (EpilogueVectorizationForceVF > 1) { 6198 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6199 if (LVP.hasPlanWithVFs( 6200 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6201 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6202 else { 6203 LLVM_DEBUG( 6204 dbgs() 6205 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6206 return Result; 6207 } 6208 } 6209 6210 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6211 TheLoop->getHeader()->getParent()->hasMinSize()) { 6212 LLVM_DEBUG( 6213 dbgs() 6214 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6215 return Result; 6216 } 6217 6218 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6219 return Result; 6220 6221 for (auto &NextVF : ProfitableVFs) 6222 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6223 (Result.Width.getFixedValue() == 1 || 6224 isMoreProfitable(NextVF, Result)) && 6225 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6226 Result = NextVF; 6227 6228 if (Result != VectorizationFactor::Disabled()) 6229 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6230 << Result.Width.getFixedValue() << "\n";); 6231 return Result; 6232 } 6233 6234 std::pair<unsigned, unsigned> 6235 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6236 unsigned MinWidth = -1U; 6237 unsigned MaxWidth = 8; 6238 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6239 6240 // For each block. 6241 for (BasicBlock *BB : TheLoop->blocks()) { 6242 // For each instruction in the loop. 6243 for (Instruction &I : BB->instructionsWithoutDebug()) { 6244 Type *T = I.getType(); 6245 6246 // Skip ignored values. 6247 if (ValuesToIgnore.count(&I)) 6248 continue; 6249 6250 // Only examine Loads, Stores and PHINodes. 6251 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6252 continue; 6253 6254 // Examine PHI nodes that are reduction variables. Update the type to 6255 // account for the recurrence type. 
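      // (Illustrative note, not part of the original source: a reduction phi
      // may be declared as i32 while its recurrence type has been narrowed,
      // e.g. to i8 when the accumulator only ever receives zero-extended i8
      // values; the narrowed type is what should feed the smallest/widest
      // type calculation below.)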
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
            TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                      RdxDesc.getRecurrenceType(),
                                      TargetTransformInfo::ReductionFlags()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    }
  }

  return {MinWidth, MaxWidth};
}

unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // Do not interleave if there is a maximum safe dependence distance; it
  // already constrains how many iterations may execute at once.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
6337 // Subtract the number of loop invariants from the number of available 6338 // registers. These registers are used by all of the interleaved instances. 6339 // Next, divide the remaining registers by the number of registers that is 6340 // required by the loop, in order to estimate how many parallel instances 6341 // fit without causing spills. All of this is rounded down if necessary to be 6342 // a power of two. We want power of two interleave count to simplify any 6343 // addressing operations or alignment considerations. 6344 // We also want power of two interleave counts to ensure that the induction 6345 // variable of the vector loop wraps to zero, when tail is folded by masking; 6346 // this currently happens when OptForSize, in which case IC is set to 1 above. 6347 unsigned IC = UINT_MAX; 6348 6349 for (auto& pair : R.MaxLocalUsers) { 6350 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6351 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6352 << " registers of " 6353 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6354 if (VF.isScalar()) { 6355 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6356 TargetNumRegisters = ForceTargetNumScalarRegs; 6357 } else { 6358 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6359 TargetNumRegisters = ForceTargetNumVectorRegs; 6360 } 6361 unsigned MaxLocalUsers = pair.second; 6362 unsigned LoopInvariantRegs = 0; 6363 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6364 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6365 6366 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6367 // Don't count the induction variable as interleaved. 6368 if (EnableIndVarRegisterHeur) { 6369 TmpIC = 6370 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6371 std::max(1U, (MaxLocalUsers - 1))); 6372 } 6373 6374 IC = std::min(IC, TmpIC); 6375 } 6376 6377 // Clamp the interleave ranges to reasonable counts. 6378 unsigned MaxInterleaveCount = 6379 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6380 6381 // Check if the user has overridden the max. 6382 if (VF.isScalar()) { 6383 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6384 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6385 } else { 6386 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6387 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6388 } 6389 6390 // If trip count is known or estimated compile time constant, limit the 6391 // interleave count to be less than the trip count divided by VF, provided it 6392 // is at least 1. 6393 // 6394 // For scalable vectors we can't know if interleaving is beneficial. It may 6395 // not be beneficial for small loops if none of the lanes in the second vector 6396 // iterations is enabled. However, for larger loops, there is likely to be a 6397 // similar benefit as for fixed-width vectors. For now, we choose to leave 6398 // the InterleaveCount as if vscale is '1', although if some information about 6399 // the vector is known (e.g. min vector size), we can make a better decision. 6400 if (BestKnownTC) { 6401 MaxInterleaveCount = 6402 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6403 // Make sure MaxInterleaveCount is greater than 0. 
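    // (Illustrative arithmetic with assumed values: an estimated trip count
    // of 20 and VF = 4 clamp the count to at most 20 / 4 = 5; the std::max
    // below guards against a clamp to 0 when the trip count is smaller than
    // VF, e.g. a trip count of 3 with VF = 4.)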
6404 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6405 } 6406 6407 assert(MaxInterleaveCount > 0 && 6408 "Maximum interleave count must be greater than 0"); 6409 6410 // Clamp the calculated IC to be between the 1 and the max interleave count 6411 // that the target and trip count allows. 6412 if (IC > MaxInterleaveCount) 6413 IC = MaxInterleaveCount; 6414 else 6415 // Make sure IC is greater than 0. 6416 IC = std::max(1u, IC); 6417 6418 assert(IC > 0 && "Interleave count must be greater than 0."); 6419 6420 // If we did not calculate the cost for VF (because the user selected the VF) 6421 // then we calculate the cost of VF here. 6422 if (LoopCost == 0) { 6423 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6424 LoopCost = *expectedCost(VF).first.getValue(); 6425 } 6426 6427 assert(LoopCost && "Non-zero loop cost expected"); 6428 6429 // Interleave if we vectorized this loop and there is a reduction that could 6430 // benefit from interleaving. 6431 if (VF.isVector() && HasReductions) { 6432 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6433 return IC; 6434 } 6435 6436 // Note that if we've already vectorized the loop we will have done the 6437 // runtime check and so interleaving won't require further checks. 6438 bool InterleavingRequiresRuntimePointerCheck = 6439 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6440 6441 // We want to interleave small loops in order to reduce the loop overhead and 6442 // potentially expose ILP opportunities. 6443 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6444 << "LV: IC is " << IC << '\n' 6445 << "LV: VF is " << VF << '\n'); 6446 const bool AggressivelyInterleaveReductions = 6447 TTI.enableAggressiveInterleaving(HasReductions); 6448 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6449 // We assume that the cost overhead is 1 and we use the cost model 6450 // to estimate the cost of the loop and interleave until the cost of the 6451 // loop overhead is about 5% of the cost of the loop. 6452 unsigned SmallIC = 6453 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6454 6455 // Interleave until store/load ports (estimated by max interleave count) are 6456 // saturated. 6457 unsigned NumStores = Legal->getNumStores(); 6458 unsigned NumLoads = Legal->getNumLoads(); 6459 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6460 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6461 6462 // If we have a scalar reduction (vector reductions are already dealt with 6463 // by this point), we can increase the critical path length if the loop 6464 // we're interleaving is inside another loop. Limit, by default to 2, so the 6465 // critical path only gets increased by one reduction operation. 6466 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6467 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6468 SmallIC = std::min(SmallIC, F); 6469 StoresIC = std::min(StoresIC, F); 6470 LoadsIC = std::min(LoadsIC, F); 6471 } 6472 6473 if (EnableLoadStoreRuntimeInterleave && 6474 std::max(StoresIC, LoadsIC) > SmallIC) { 6475 LLVM_DEBUG( 6476 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6477 return std::max(StoresIC, LoadsIC); 6478 } 6479 6480 // If there are scalar reductions and TTI has enabled aggressive 6481 // interleaving for reductions, we will interleave to expose ILP. 
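    // (Illustrative arithmetic with assumed values: with IC = 8 and
    // SmallIC = 2, the branch below returns std::max(8 / 2, 2) = 4, i.e. it
    // interleaves more aggressively than SmallIC but less than the full IC.)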
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC, to cover the rare situation where resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are
  // encountered before their users. We assume that each instruction that has
  // in-loop users starts an interval. We record every time that an in-loop
  // value is used, so we have a list of the first and last occurrences of
  // each instruction. Next, we transpose this data structure into a multi map
  // that holds the list of intervals that *end* at a specific location. This
  // multi map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because, when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
6557 if (!TheLoop->contains(Instr)) { 6558 LoopInvariants.insert(Instr); 6559 continue; 6560 } 6561 6562 // Overwrite previous end points. 6563 EndPoint[Instr] = IdxToInstr.size(); 6564 Ends.insert(Instr); 6565 } 6566 } 6567 } 6568 6569 // Saves the list of intervals that end with the index in 'key'. 6570 using InstrList = SmallVector<Instruction *, 2>; 6571 DenseMap<unsigned, InstrList> TransposeEnds; 6572 6573 // Transpose the EndPoints to a list of values that end at each index. 6574 for (auto &Interval : EndPoint) 6575 TransposeEnds[Interval.second].push_back(Interval.first); 6576 6577 SmallPtrSet<Instruction *, 8> OpenIntervals; 6578 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6579 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6580 6581 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6582 6583 // A lambda that gets the register usage for the given type and VF. 6584 const auto &TTICapture = TTI; 6585 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6586 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6587 return 0; 6588 return *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6589 }; 6590 6591 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6592 Instruction *I = IdxToInstr[i]; 6593 6594 // Remove all of the instructions that end at this location. 6595 InstrList &List = TransposeEnds[i]; 6596 for (Instruction *ToRemove : List) 6597 OpenIntervals.erase(ToRemove); 6598 6599 // Ignore instructions that are never used within the loop. 6600 if (!Ends.count(I)) 6601 continue; 6602 6603 // Skip ignored values. 6604 if (ValuesToIgnore.count(I)) 6605 continue; 6606 6607 // For each VF find the maximum usage of registers. 6608 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6609 // Count the number of live intervals. 6610 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6611 6612 if (VFs[j].isScalar()) { 6613 for (auto Inst : OpenIntervals) { 6614 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6615 if (RegUsage.find(ClassID) == RegUsage.end()) 6616 RegUsage[ClassID] = 1; 6617 else 6618 RegUsage[ClassID] += 1; 6619 } 6620 } else { 6621 collectUniformsAndScalars(VFs[j]); 6622 for (auto Inst : OpenIntervals) { 6623 // Skip ignored values for VF > 1. 6624 if (VecValuesToIgnore.count(Inst)) 6625 continue; 6626 if (isScalarAfterVectorization(Inst, VFs[j])) { 6627 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6628 if (RegUsage.find(ClassID) == RegUsage.end()) 6629 RegUsage[ClassID] = 1; 6630 else 6631 RegUsage[ClassID] += 1; 6632 } else { 6633 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6634 if (RegUsage.find(ClassID) == RegUsage.end()) 6635 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6636 else 6637 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6638 } 6639 } 6640 } 6641 6642 for (auto& pair : RegUsage) { 6643 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6644 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6645 else 6646 MaxUsages[j][pair.first] = pair.second; 6647 } 6648 } 6649 6650 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6651 << OpenIntervals.size() << '\n'); 6652 6653 // Add the current instruction to the list of open intervals. 
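    // (Illustrative note, not part of the original source: if two loads %a
    // and %c both still have unseen in-loop users when a later instruction is
    // visited, both of their intervals are open at that point, so the usage
    // estimate for their register class is at least 2.)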
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: The cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost
  // model. Masked Load/Gather emulation was previously never allowed.
  // A limited number of Masked Store/Scatter emulations were allowed.
  assert(isPredicatedInst(I) &&
         "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
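        // (Illustrative example, not part of the original source: for a block
        // guarded by `if (c[i] != 0)` that computes `a[i] = (b[i] + 1) / c[i]`,
        // the division is scalar-with-predication; if scalarizing the
        // single-use add that feeds it is no more expensive than vectorizing
        // it and extracting the lanes, computePredInstDiscount returns a
        // non-negative discount and both instructions land in ScalarCostsVF.)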
6738 if (!useEmulatedMaskMemRefHack(&I) && 6739 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6740 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6741 // Remember that BB will remain after vectorization. 6742 PredicatedBBsAfterVectorization.insert(BB); 6743 } 6744 } 6745 } 6746 6747 int LoopVectorizationCostModel::computePredInstDiscount( 6748 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6749 assert(!isUniformAfterVectorization(PredInst, VF) && 6750 "Instruction marked uniform-after-vectorization will be predicated"); 6751 6752 // Initialize the discount to zero, meaning that the scalar version and the 6753 // vector version cost the same. 6754 InstructionCost Discount = 0; 6755 6756 // Holds instructions to analyze. The instructions we visit are mapped in 6757 // ScalarCosts. Those instructions are the ones that would be scalarized if 6758 // we find that the scalar version costs less. 6759 SmallVector<Instruction *, 8> Worklist; 6760 6761 // Returns true if the given instruction can be scalarized. 6762 auto canBeScalarized = [&](Instruction *I) -> bool { 6763 // We only attempt to scalarize instructions forming a single-use chain 6764 // from the original predicated block that would otherwise be vectorized. 6765 // Although not strictly necessary, we give up on instructions we know will 6766 // already be scalar to avoid traversing chains that are unlikely to be 6767 // beneficial. 6768 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6769 isScalarAfterVectorization(I, VF)) 6770 return false; 6771 6772 // If the instruction is scalar with predication, it will be analyzed 6773 // separately. We ignore it within the context of PredInst. 6774 if (isScalarWithPredication(I)) 6775 return false; 6776 6777 // If any of the instruction's operands are uniform after vectorization, 6778 // the instruction cannot be scalarized. This prevents, for example, a 6779 // masked load from being scalarized. 6780 // 6781 // We assume we will only emit a value for lane zero of an instruction 6782 // marked uniform after vectorization, rather than VF identical values. 6783 // Thus, if we scalarize an instruction that uses a uniform, we would 6784 // create uses of values corresponding to the lanes we aren't emitting code 6785 // for. This behavior can be changed by allowing getScalarValue to clone 6786 // the lane zero values for uniforms rather than asserting. 6787 for (Use &U : I->operands()) 6788 if (auto *J = dyn_cast<Instruction>(U.get())) 6789 if (isUniformAfterVectorization(J, VF)) 6790 return false; 6791 6792 // Otherwise, we can scalarize the instruction. 6793 return true; 6794 }; 6795 6796 // Compute the expected cost discount from scalarizing the entire expression 6797 // feeding the predicated instruction. We currently only consider expressions 6798 // that are single-use instruction chains. 6799 Worklist.push_back(PredInst); 6800 while (!Worklist.empty()) { 6801 Instruction *I = Worklist.pop_back_val(); 6802 6803 // If we've already analyzed the instruction, there's nothing to do. 6804 if (ScalarCosts.find(I) != ScalarCosts.end()) 6805 continue; 6806 6807 // Compute the cost of the vector instruction. Note that this cost already 6808 // includes the scalarization overhead of the predicated instruction. 6809 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6810 6811 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6812 // the instruction as if it wasn't if-converted and instead remained in the 6813 // predicated block. We will scale this cost by block probability after 6814 // computing the scalarization overhead. 6815 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6816 InstructionCost ScalarCost = 6817 VF.getKnownMinValue() * 6818 getInstructionCost(I, ElementCount::getFixed(1)).first; 6819 6820 // Compute the scalarization overhead of needed insertelement instructions 6821 // and phi nodes. 6822 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6823 ScalarCost += TTI.getScalarizationOverhead( 6824 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6825 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6826 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6827 ScalarCost += 6828 VF.getKnownMinValue() * 6829 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6830 } 6831 6832 // Compute the scalarization overhead of needed extractelement 6833 // instructions. For each of the instruction's operands, if the operand can 6834 // be scalarized, add it to the worklist; otherwise, account for the 6835 // overhead. 6836 for (Use &U : I->operands()) 6837 if (auto *J = dyn_cast<Instruction>(U.get())) { 6838 assert(VectorType::isValidElementType(J->getType()) && 6839 "Instruction has non-scalar type"); 6840 if (canBeScalarized(J)) 6841 Worklist.push_back(J); 6842 else if (needsExtract(J, VF)) { 6843 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6844 ScalarCost += TTI.getScalarizationOverhead( 6845 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6846 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6847 } 6848 } 6849 6850 // Scale the total scalar cost by block probability. 6851 ScalarCost /= getReciprocalPredBlockProb(); 6852 6853 // Compute the discount. A non-negative discount means the vector version 6854 // of the instruction costs more, and scalarizing would be beneficial. 6855 Discount += VectorCost - ScalarCost; 6856 ScalarCosts[I] = ScalarCost; 6857 } 6858 6859 return *Discount.getValue(); 6860 } 6861 6862 LoopVectorizationCostModel::VectorizationCostTy 6863 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6864 VectorizationCostTy Cost; 6865 6866 // For each block. 6867 for (BasicBlock *BB : TheLoop->blocks()) { 6868 VectorizationCostTy BlockCost; 6869 6870 // For each instruction in the old loop. 6871 for (Instruction &I : BB->instructionsWithoutDebug()) { 6872 // Skip ignored values. 6873 if (ValuesToIgnore.count(&I) || 6874 (VF.isVector() && VecValuesToIgnore.count(&I))) 6875 continue; 6876 6877 VectorizationCostTy C = getInstructionCost(&I, VF); 6878 6879 // Check if we should override the cost. 6880 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6881 C.first = InstructionCost(ForceTargetInstructionCost); 6882 6883 BlockCost.first += C.first; 6884 BlockCost.second |= C.second; 6885 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6886 << " for VF " << VF << " For instruction: " << I 6887 << '\n'); 6888 } 6889 6890 // If we are vectorizing a predicated block, it will have been 6891 // if-converted. This means that the block's instructions (aside from 6892 // stores and instructions that may divide by zero) will now be 6893 // unconditionally executed. For the scalar case, we may not always execute 6894 // the predicated block, if it is an if-else block. Thus, scale the block's 6895 // cost by the probability of executing it. 
blockNeedsPredication from 6896 // Legal is used so as to not include all blocks in tail folded loops. 6897 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6898 BlockCost.first /= getReciprocalPredBlockProb(); 6899 6900 Cost.first += BlockCost.first; 6901 Cost.second |= BlockCost.second; 6902 } 6903 6904 return Cost; 6905 } 6906 6907 /// Gets Address Access SCEV after verifying that the access pattern 6908 /// is loop invariant except the induction variable dependence. 6909 /// 6910 /// This SCEV can be sent to the Target in order to estimate the address 6911 /// calculation cost. 6912 static const SCEV *getAddressAccessSCEV( 6913 Value *Ptr, 6914 LoopVectorizationLegality *Legal, 6915 PredicatedScalarEvolution &PSE, 6916 const Loop *TheLoop) { 6917 6918 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6919 if (!Gep) 6920 return nullptr; 6921 6922 // We are looking for a gep with all loop invariant indices except for one 6923 // which should be an induction variable. 6924 auto SE = PSE.getSE(); 6925 unsigned NumOperands = Gep->getNumOperands(); 6926 for (unsigned i = 1; i < NumOperands; ++i) { 6927 Value *Opd = Gep->getOperand(i); 6928 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6929 !Legal->isInductionVariable(Opd)) 6930 return nullptr; 6931 } 6932 6933 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6934 return PSE.getSCEV(Ptr); 6935 } 6936 6937 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6938 return Legal->hasStride(I->getOperand(0)) || 6939 Legal->hasStride(I->getOperand(1)); 6940 } 6941 6942 InstructionCost 6943 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6944 ElementCount VF) { 6945 assert(VF.isVector() && 6946 "Scalarization cost of instruction implies vectorization."); 6947 if (VF.isScalable()) 6948 return InstructionCost::getInvalid(); 6949 6950 Type *ValTy = getLoadStoreType(I); 6951 auto SE = PSE.getSE(); 6952 6953 unsigned AS = getLoadStoreAddressSpace(I); 6954 Value *Ptr = getLoadStorePointerOperand(I); 6955 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6956 6957 // Figure out whether the access is strided and get the stride value 6958 // if it's known in compile time 6959 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6960 6961 // Get the cost of the scalar memory instruction and address computation. 6962 InstructionCost Cost = 6963 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6964 6965 // Don't pass *I here, since it is scalar but will actually be part of a 6966 // vectorized loop where the user of it is a vectorized instruction. 6967 const Align Alignment = getLoadStoreAlignment(I); 6968 Cost += VF.getKnownMinValue() * 6969 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6970 AS, TTI::TCK_RecipThroughput); 6971 6972 // Get the overhead of the extractelement and insertelement instructions 6973 // we might create due to scalarization. 6974 Cost += getScalarizationOverhead(I, VF); 6975 6976 // If we have a predicated load/store, it will need extra i1 extracts and 6977 // conditional branches, but may not be executed for each vector lane. Scale 6978 // the cost by the probability of executing the predicated block. 
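  // For example, assuming the predicated block executes on roughly half of
  // the iterations (the usual assumption behind getReciprocalPredBlockProb(),
  // which would then return 2), a scalarized cost of 8 would be scaled down
  // to 4 before the per-lane i1 extract and branch costs are added below.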
6979 if (isPredicatedInst(I)) { 6980 Cost /= getReciprocalPredBlockProb(); 6981 6982 // Add the cost of an i1 extract and a branch 6983 auto *Vec_i1Ty = 6984 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6985 Cost += TTI.getScalarizationOverhead( 6986 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6987 /*Insert=*/false, /*Extract=*/true); 6988 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6989 6990 if (useEmulatedMaskMemRefHack(I)) 6991 // Artificially setting to a high enough value to practically disable 6992 // vectorization with such operations. 6993 Cost = 3000000; 6994 } 6995 6996 return Cost; 6997 } 6998 6999 InstructionCost 7000 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7001 ElementCount VF) { 7002 Type *ValTy = getLoadStoreType(I); 7003 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7004 Value *Ptr = getLoadStorePointerOperand(I); 7005 unsigned AS = getLoadStoreAddressSpace(I); 7006 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 7007 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7008 7009 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7010 "Stride should be 1 or -1 for consecutive memory access"); 7011 const Align Alignment = getLoadStoreAlignment(I); 7012 InstructionCost Cost = 0; 7013 if (Legal->isMaskRequired(I)) 7014 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7015 CostKind); 7016 else 7017 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7018 CostKind, I); 7019 7020 bool Reverse = ConsecutiveStride < 0; 7021 if (Reverse) 7022 Cost += 7023 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7024 return Cost; 7025 } 7026 7027 InstructionCost 7028 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7029 ElementCount VF) { 7030 assert(Legal->isUniformMemOp(*I)); 7031 7032 Type *ValTy = getLoadStoreType(I); 7033 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7034 const Align Alignment = getLoadStoreAlignment(I); 7035 unsigned AS = getLoadStoreAddressSpace(I); 7036 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7037 if (isa<LoadInst>(I)) { 7038 return TTI.getAddressComputationCost(ValTy) + 7039 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7040 CostKind) + 7041 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7042 } 7043 StoreInst *SI = cast<StoreInst>(I); 7044 7045 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7046 return TTI.getAddressComputationCost(ValTy) + 7047 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7048 CostKind) + 7049 (isLoopInvariantStoreValue 7050 ? 
               0
               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                        VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}

InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  // Early exit for no inloop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return InstructionCost::getInvalid();
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for one of the following patterns, at the minimal
  // acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree, finding the root reduction
  // instruction in InLoopReductionImmediateChains. From there we find the
  // pattern of mul/ext and test the cost of the entire pattern vs the cost of
  // the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
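  // As a sketch (hypothetical IR, purely illustrative), an in-loop reduction
  // such as
  //   %ea = sext <16 x i8> %a to <16 x i32>
  //   %eb = sext <16 x i8> %b to <16 x i32>
  //   %m  = mul <16 x i32> %ea, %eb
  //   %r  = add i32 %sum.phi, <reduce of %m>
  // corresponds to the reduce(mul(ext(A), ext(B))) case: starting from an
  // extend, the code below follows single users to the mul and then to the
  // add that is registered in InLoopReductionImmediateChains.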
7133 Instruction *RetI = I; 7134 if ((RetI->getOpcode() == Instruction::SExt || 7135 RetI->getOpcode() == Instruction::ZExt)) { 7136 if (!RetI->hasOneUser()) 7137 return InstructionCost::getInvalid(); 7138 RetI = RetI->user_back(); 7139 } 7140 if (RetI->getOpcode() == Instruction::Mul && 7141 RetI->user_back()->getOpcode() == Instruction::Add) { 7142 if (!RetI->hasOneUser()) 7143 return InstructionCost::getInvalid(); 7144 RetI = RetI->user_back(); 7145 } 7146 7147 // Test if the found instruction is a reduction, and if not return an invalid 7148 // cost specifying the parent to use the original cost modelling. 7149 if (!InLoopReductionImmediateChains.count(RetI)) 7150 return InstructionCost::getInvalid(); 7151 7152 // Find the reduction this chain is a part of and calculate the basic cost of 7153 // the reduction on its own. 7154 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7155 Instruction *ReductionPhi = LastChain; 7156 while (!isa<PHINode>(ReductionPhi)) 7157 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7158 7159 RecurrenceDescriptor RdxDesc = 7160 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7161 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7162 RdxDesc.getOpcode(), VectorTy, false, CostKind); 7163 7164 // Get the operand that was not the reduction chain and match it to one of the 7165 // patterns, returning the better cost if it is found. 7166 Instruction *RedOp = RetI->getOperand(1) == LastChain 7167 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7168 : dyn_cast<Instruction>(RetI->getOperand(1)); 7169 7170 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7171 7172 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 7173 !TheLoop->isLoopInvariant(RedOp)) { 7174 bool IsUnsigned = isa<ZExtInst>(RedOp); 7175 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7176 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7177 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7178 CostKind); 7179 7180 InstructionCost ExtCost = 7181 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7182 TTI::CastContextHint::None, CostKind, RedOp); 7183 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7184 return I == RetI ? *RedCost.getValue() : 0; 7185 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7186 Instruction *Mul = RedOp; 7187 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7188 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7189 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7190 Op0->getOpcode() == Op1->getOpcode() && 7191 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7192 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7193 bool IsUnsigned = isa<ZExtInst>(Op0); 7194 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7195 // reduce(mul(ext, ext)) 7196 InstructionCost ExtCost = 7197 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7198 TTI::CastContextHint::None, CostKind, Op0); 7199 InstructionCost MulCost = 7200 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7201 7202 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7203 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7204 CostKind); 7205 7206 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7207 return I == RetI ? 
*RedCost.getValue() : 0; 7208 } else { 7209 InstructionCost MulCost = 7210 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7211 7212 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7213 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7214 CostKind); 7215 7216 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7217 return I == RetI ? *RedCost.getValue() : 0; 7218 } 7219 } 7220 7221 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7222 } 7223 7224 InstructionCost 7225 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7226 ElementCount VF) { 7227 // Calculate scalar cost only. Vectorization cost should be ready at this 7228 // moment. 7229 if (VF.isScalar()) { 7230 Type *ValTy = getLoadStoreType(I); 7231 const Align Alignment = getLoadStoreAlignment(I); 7232 unsigned AS = getLoadStoreAddressSpace(I); 7233 7234 return TTI.getAddressComputationCost(ValTy) + 7235 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7236 TTI::TCK_RecipThroughput, I); 7237 } 7238 return getWideningCost(I, VF); 7239 } 7240 7241 LoopVectorizationCostModel::VectorizationCostTy 7242 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7243 ElementCount VF) { 7244 // If we know that this instruction will remain uniform, check the cost of 7245 // the scalar version. 7246 if (isUniformAfterVectorization(I, VF)) 7247 VF = ElementCount::getFixed(1); 7248 7249 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7250 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7251 7252 // Forced scalars do not have any scalarization overhead. 7253 auto ForcedScalar = ForcedScalars.find(VF); 7254 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7255 auto InstSet = ForcedScalar->second; 7256 if (InstSet.count(I)) 7257 return VectorizationCostTy( 7258 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7259 VF.getKnownMinValue()), 7260 false); 7261 } 7262 7263 Type *VectorTy; 7264 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7265 7266 bool TypeNotScalarized = 7267 VF.isVector() && VectorTy->isVectorTy() && 7268 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7269 return VectorizationCostTy(C, TypeNotScalarized); 7270 } 7271 7272 InstructionCost 7273 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7274 ElementCount VF) const { 7275 7276 if (VF.isScalable()) 7277 return InstructionCost::getInvalid(); 7278 7279 if (VF.isScalar()) 7280 return 0; 7281 7282 InstructionCost Cost = 0; 7283 Type *RetTy = ToVectorTy(I->getType(), VF); 7284 if (!RetTy->isVoidTy() && 7285 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7286 Cost += TTI.getScalarizationOverhead( 7287 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7288 true, false); 7289 7290 // Some targets keep addresses scalar. 7291 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7292 return Cost; 7293 7294 // Some targets support efficient element stores. 7295 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7296 return Cost; 7297 7298 // Collect operands to consider. 7299 CallInst *CI = dyn_cast<CallInst>(I); 7300 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7301 7302 // Skip operands that do not require extraction/scalarization and do not incur 7303 // any overhead. 
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), Tys);
}

void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniformMemOp(I)) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        InstructionCost Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose the better solution for the current VF, record the decision,
      // and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        assert(!VF.isScalable() &&
               "We cannot yet scalarise for scalable vectors");
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The cost is computed for the whole group,
      // but it will actually be assigned to a single member instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out if
      // the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and is given a cost estimate without
      // scalarization overhead.
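      // (The entries added here are picked up in getInstructionCost(), where
      // a forced-scalar instruction is costed as VF copies of its scalar
      // cost, with no extract/insert overhead added.)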
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(
                  Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
                  false, true) +
              (TTI.getCFInstrCost(Instruction::Br, CostKind) *
               VF.getKnownMinValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
7538 return 0; 7539 // Note: We currently assume zero cost for an unconditional branch inside 7540 // a predicated block since it will become a fall-through, although we 7541 // may decide in the future to call TTI for all branches. 7542 } 7543 case Instruction::PHI: { 7544 auto *Phi = cast<PHINode>(I); 7545 7546 // First-order recurrences are replaced by vector shuffles inside the loop. 7547 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7548 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7549 return TTI.getShuffleCost( 7550 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7551 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7552 7553 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7554 // converted into select instructions. We require N - 1 selects per phi 7555 // node, where N is the number of incoming values. 7556 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7557 return (Phi->getNumIncomingValues() - 1) * 7558 TTI.getCmpSelInstrCost( 7559 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7560 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7561 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7562 7563 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7564 } 7565 case Instruction::UDiv: 7566 case Instruction::SDiv: 7567 case Instruction::URem: 7568 case Instruction::SRem: 7569 // If we have a predicated instruction, it may not be executed for each 7570 // vector lane. Get the scalarization cost and scale this amount by the 7571 // probability of executing the predicated block. If the instruction is not 7572 // predicated, we fall through to the next case. 7573 if (VF.isVector() && isScalarWithPredication(I)) { 7574 InstructionCost Cost = 0; 7575 7576 // These instructions have a non-void type, so account for the phi nodes 7577 // that we will create. This cost is likely to be zero. The phi node 7578 // cost, if any, should be scaled by the block probability because it 7579 // models a copy at the end of each predicated block. 7580 Cost += VF.getKnownMinValue() * 7581 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7582 7583 // The cost of the non-predicated instruction. 7584 Cost += VF.getKnownMinValue() * 7585 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7586 7587 // The cost of insertelement and extractelement instructions needed for 7588 // scalarization. 7589 Cost += getScalarizationOverhead(I, VF); 7590 7591 // Scale the cost by the probability of executing the predicated blocks. 7592 // This assumes the predicated block for each vector lane is equally 7593 // likely. 7594 return Cost / getReciprocalPredBlockProb(); 7595 } 7596 LLVM_FALLTHROUGH; 7597 case Instruction::Add: 7598 case Instruction::FAdd: 7599 case Instruction::Sub: 7600 case Instruction::FSub: 7601 case Instruction::Mul: 7602 case Instruction::FMul: 7603 case Instruction::FDiv: 7604 case Instruction::FRem: 7605 case Instruction::Shl: 7606 case Instruction::LShr: 7607 case Instruction::AShr: 7608 case Instruction::And: 7609 case Instruction::Or: 7610 case Instruction::Xor: { 7611 // Since we will replace the stride by 1 the multiplication should go away. 
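    // For instance (hypothetical example), if a symbolic stride %S has been
    // versioned to the constant 1 by stride speculation, an index computation
    // like "i * %S" reduces to "i", so the multiply is expected to fold away
    // and is costed as free here.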
7612 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7613 return 0; 7614 7615 // Detect reduction patterns 7616 InstructionCost RedCost; 7617 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7618 .isValid()) 7619 return RedCost; 7620 7621 // Certain instructions can be cheaper to vectorize if they have a constant 7622 // second vector operand. One example of this are shifts on x86. 7623 Value *Op2 = I->getOperand(1); 7624 TargetTransformInfo::OperandValueProperties Op2VP; 7625 TargetTransformInfo::OperandValueKind Op2VK = 7626 TTI.getOperandInfo(Op2, Op2VP); 7627 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7628 Op2VK = TargetTransformInfo::OK_UniformValue; 7629 7630 SmallVector<const Value *, 4> Operands(I->operand_values()); 7631 return TTI.getArithmeticInstrCost( 7632 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7633 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7634 } 7635 case Instruction::FNeg: { 7636 return TTI.getArithmeticInstrCost( 7637 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7638 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7639 TargetTransformInfo::OP_None, I->getOperand(0), I); 7640 } 7641 case Instruction::Select: { 7642 SelectInst *SI = cast<SelectInst>(I); 7643 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7644 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7645 7646 const Value *Op0, *Op1; 7647 using namespace llvm::PatternMatch; 7648 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7649 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7650 // select x, y, false --> x & y 7651 // select x, true, y --> x | y 7652 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7653 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7654 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7655 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7656 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7657 Op1->getType()->getScalarSizeInBits() == 1); 7658 7659 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7660 return TTI.getArithmeticInstrCost( 7661 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7662 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7663 } 7664 7665 Type *CondTy = SI->getCondition()->getType(); 7666 if (!ScalarCond) 7667 CondTy = VectorType::get(CondTy, VF); 7668 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7669 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7670 } 7671 case Instruction::ICmp: 7672 case Instruction::FCmp: { 7673 Type *ValTy = I->getOperand(0)->getType(); 7674 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7675 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7676 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7677 VectorTy = ToVectorTy(ValTy, VF); 7678 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7679 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7680 } 7681 case Instruction::Store: 7682 case Instruction::Load: { 7683 ElementCount Width = VF; 7684 if (Width.isVector()) { 7685 InstWidening Decision = getWideningDecision(I, Width); 7686 assert(Decision != CM_Unknown && 7687 "CM decision should be taken at this point"); 7688 if (Decision == CM_Scalarize) 7689 Width = ElementCount::getFixed(1); 7690 } 7691 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7692 return getMemoryInstructionCost(I, VF); 7693 } 7694 case Instruction::BitCast: 7695 if (I->getType()->isPointerTy()) 7696 return 0; 7697 LLVM_FALLTHROUGH; 7698 case Instruction::ZExt: 7699 case Instruction::SExt: 7700 case Instruction::FPToUI: 7701 case Instruction::FPToSI: 7702 case Instruction::FPExt: 7703 case Instruction::PtrToInt: 7704 case Instruction::IntToPtr: 7705 case Instruction::SIToFP: 7706 case Instruction::UIToFP: 7707 case Instruction::Trunc: 7708 case Instruction::FPTrunc: { 7709 // Computes the CastContextHint from a Load/Store instruction. 7710 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7711 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7712 "Expected a load or a store!"); 7713 7714 if (VF.isScalar() || !TheLoop->contains(I)) 7715 return TTI::CastContextHint::Normal; 7716 7717 switch (getWideningDecision(I, VF)) { 7718 case LoopVectorizationCostModel::CM_GatherScatter: 7719 return TTI::CastContextHint::GatherScatter; 7720 case LoopVectorizationCostModel::CM_Interleave: 7721 return TTI::CastContextHint::Interleave; 7722 case LoopVectorizationCostModel::CM_Scalarize: 7723 case LoopVectorizationCostModel::CM_Widen: 7724 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7725 : TTI::CastContextHint::Normal; 7726 case LoopVectorizationCostModel::CM_Widen_Reverse: 7727 return TTI::CastContextHint::Reversed; 7728 case LoopVectorizationCostModel::CM_Unknown: 7729 llvm_unreachable("Instr did not go through cost modelling?"); 7730 } 7731 7732 llvm_unreachable("Unhandled case!"); 7733 }; 7734 7735 unsigned Opcode = I->getOpcode(); 7736 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7737 // For Trunc, the context is the only user, which must be a StoreInst. 7738 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7739 if (I->hasOneUse()) 7740 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7741 CCH = ComputeCCH(Store); 7742 } 7743 // For Z/Sext, the context is the operand, which must be a LoadInst. 
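    // E.g. (illustrative only) for "%e = zext i8 %l to i32", where %l is a
    // load the cost model decided to widen in reverse, ComputeCCH would
    // return CastContextHint::Reversed, so the target can price the extend
    // in the context of a reversed vector load.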
7744 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7745 Opcode == Instruction::FPExt) { 7746 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7747 CCH = ComputeCCH(Load); 7748 } 7749 7750 // We optimize the truncation of induction variables having constant 7751 // integer steps. The cost of these truncations is the same as the scalar 7752 // operation. 7753 if (isOptimizableIVTruncate(I, VF)) { 7754 auto *Trunc = cast<TruncInst>(I); 7755 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7756 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7757 } 7758 7759 // Detect reduction patterns 7760 InstructionCost RedCost; 7761 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7762 .isValid()) 7763 return RedCost; 7764 7765 Type *SrcScalarTy = I->getOperand(0)->getType(); 7766 Type *SrcVecTy = 7767 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7768 if (canTruncateToMinimalBitwidth(I, VF)) { 7769 // This cast is going to be shrunk. This may remove the cast or it might 7770 // turn it into slightly different cast. For example, if MinBW == 16, 7771 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7772 // 7773 // Calculate the modified src and dest types. 7774 Type *MinVecTy = VectorTy; 7775 if (Opcode == Instruction::Trunc) { 7776 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7777 VectorTy = 7778 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7779 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7780 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7781 VectorTy = 7782 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7783 } 7784 } 7785 7786 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7787 } 7788 case Instruction::Call: { 7789 bool NeedToScalarize; 7790 CallInst *CI = cast<CallInst>(I); 7791 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7792 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7793 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7794 return std::min(CallCost, IntrinsicCost); 7795 } 7796 return CallCost; 7797 } 7798 case Instruction::ExtractValue: 7799 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7800 default: 7801 // This opcode is unknown. Assume that it is the same as 'mul'. 7802 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7803 } // end of switch. 
7804 } 7805 7806 char LoopVectorize::ID = 0; 7807 7808 static const char lv_name[] = "Loop Vectorization"; 7809 7810 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7811 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7812 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7813 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7814 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7815 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7816 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7817 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7818 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7819 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7820 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7821 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7822 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7823 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7824 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7825 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7826 7827 namespace llvm { 7828 7829 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7830 7831 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7832 bool VectorizeOnlyWhenForced) { 7833 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7834 } 7835 7836 } // end namespace llvm 7837 7838 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7839 // Check if the pointer operand of a load or store instruction is 7840 // consecutive. 7841 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7842 return Legal->isConsecutivePtr(Ptr); 7843 return false; 7844 } 7845 7846 void LoopVectorizationCostModel::collectValuesToIgnore() { 7847 // Ignore ephemeral values. 7848 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7849 7850 // Ignore type-promoting instructions we identified during reduction 7851 // detection. 7852 for (auto &Reduction : Legal->getReductionVars()) { 7853 RecurrenceDescriptor &RedDes = Reduction.second; 7854 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7855 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7856 } 7857 // Ignore type-casting instructions we identified during induction 7858 // detection. 7859 for (auto &Induction : Legal->getInductionVars()) { 7860 InductionDescriptor &IndDes = Induction.second; 7861 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7862 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7863 } 7864 } 7865 7866 void LoopVectorizationCostModel::collectInLoopReductions() { 7867 for (auto &Reduction : Legal->getReductionVars()) { 7868 PHINode *Phi = Reduction.first; 7869 RecurrenceDescriptor &RdxDesc = Reduction.second; 7870 7871 // We don't collect reductions that are type promoted (yet). 7872 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7873 continue; 7874 7875 // If the target would prefer this reduction to happen "in-loop", then we 7876 // want to record it as such. 7877 unsigned Opcode = RdxDesc.getOpcode(); 7878 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7879 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7880 TargetTransformInfo::ReductionFlags())) 7881 continue; 7882 7883 // Check that we can correctly put the reductions into the loop, by 7884 // finding the chain of operations that leads from the phi to the loop 7885 // exit value. 
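    // For example (sketch, hypothetical IR), for an integer add reduction
    //   %sum  = phi [ 0, %ph ], [ %add2, %latch ]
    //   %add1 = add %sum, %x
    //   %add2 = add %add1, %y
    // getReductionOpChain() would be expected to return {%add1, %add2}; an
    // empty chain means the reduction cannot be evaluated in-loop.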
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop) {
      InLoopReductionChains[Phi] = ReductionOperations;
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
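  // (If even the loop header needs predication, the tail is being folded by
  // masking, so every interleaved access would have to be emitted as a masked
  // interleaved load/store; without target support for that, the groups are
  // dropped below.)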
7968 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7969 !useMaskedInterleavedAccesses(*TTI)) { 7970 LLVM_DEBUG( 7971 dbgs() 7972 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7973 "which requires masked-interleaved support.\n"); 7974 if (CM.InterleaveInfo.invalidateGroups()) 7975 // Invalidating interleave groups also requires invalidating all decisions 7976 // based on them, which includes widening decisions and uniform and scalar 7977 // values. 7978 CM.invalidateCostModelingDecisions(); 7979 } 7980 7981 ElementCount MaxUserVF = 7982 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7983 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7984 if (!UserVF.isZero() && UserVFIsLegal) { 7985 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7986 << " VF " << UserVF << ".\n"); 7987 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7988 "VF needs to be a power of two"); 7989 // Collect the instructions (and their associated costs) that will be more 7990 // profitable to scalarize. 7991 CM.selectUserVectorizationFactor(UserVF); 7992 CM.collectInLoopReductions(); 7993 buildVPlansWithVPRecipes(UserVF, UserVF); 7994 LLVM_DEBUG(printPlans(dbgs())); 7995 return {{UserVF, 0}}; 7996 } 7997 7998 // Populate the set of Vectorization Factor Candidates. 7999 ElementCountSet VFCandidates; 8000 for (auto VF = ElementCount::getFixed(1); 8001 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 8002 VFCandidates.insert(VF); 8003 for (auto VF = ElementCount::getScalable(1); 8004 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 8005 VFCandidates.insert(VF); 8006 8007 for (const auto VF : VFCandidates) { 8008 // Collect Uniform and Scalar instructions after vectorization with VF. 8009 CM.collectUniformsAndScalars(VF); 8010 8011 // Collect the instructions (and their associated costs) that will be more 8012 // profitable to scalarize. 8013 if (VF.isVector()) 8014 CM.collectInstsToScalarize(VF); 8015 } 8016 8017 CM.collectInLoopReductions(); 8018 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 8019 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 8020 8021 LLVM_DEBUG(printPlans(dbgs())); 8022 if (!MaxFactors.hasVector()) 8023 return VectorizationFactor::Disabled(); 8024 8025 // Select the optimal vectorization factor. 8026 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); 8027 8028 // Check if it is profitable to vectorize with runtime checks. 
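  // (A rough illustration with hypothetical numbers: if the loop needed 40
  // runtime pointer checks and the applicable threshold were 20, the code
  // below would emit the "CantReorderMemOps" remark and give up; the actual
  // threshold values are configurable and not assumed here.)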
  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
                   OrigLoop->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return VectorizationFactor::Disabled();
    }
  }
  return SelectedVF;
}

void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
}

void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  assert(BestVF.hasValue() && "Vectorization Factor is missing");
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");

  VPTransformState State{
      *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phis, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so an original exit
  // condition will be dead after vectorization if it is only used by the
  // terminator.
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp are often dead truncs, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting
    // chain will get its scalar/vector/widened def from the
    // scalar/vector/widened def of the respective phi node. Any other casts in
    // the induction def-use chain have no other uses outside the phi update
    // chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
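  // E.g. when interleaving by UF=4 at VF=1, this is typically called with
  // StartIdx = 0, 1, 2, 3 (one per unrolled part) and simply produces
  // Val, Val+Step, Val+2*Step, Val+3*Step (or the FP equivalents below);
  // no vector step is needed.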
8172 Type *Ty = Val->getType(); 8173 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8174 8175 if (Ty->isFloatingPointTy()) { 8176 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8177 8178 // Floating-point operations inherit FMF via the builder's flags. 8179 Value *MulOp = Builder.CreateFMul(C, Step); 8180 return Builder.CreateBinOp(BinOp, Val, MulOp); 8181 } 8182 Constant *C = ConstantInt::get(Ty, StartIdx); 8183 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8184 } 8185 8186 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8187 SmallVector<Metadata *, 4> MDs; 8188 // Reserve first location for self reference to the LoopID metadata node. 8189 MDs.push_back(nullptr); 8190 bool IsUnrollMetadata = false; 8191 MDNode *LoopID = L->getLoopID(); 8192 if (LoopID) { 8193 // First find existing loop unrolling disable metadata. 8194 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8195 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8196 if (MD) { 8197 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8198 IsUnrollMetadata = 8199 S && S->getString().startswith("llvm.loop.unroll.disable"); 8200 } 8201 MDs.push_back(LoopID->getOperand(i)); 8202 } 8203 } 8204 8205 if (!IsUnrollMetadata) { 8206 // Add runtime unroll disable metadata. 8207 LLVMContext &Context = L->getHeader()->getContext(); 8208 SmallVector<Metadata *, 1> DisableOperands; 8209 DisableOperands.push_back( 8210 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8211 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8212 MDs.push_back(DisableNode); 8213 MDNode *NewLoopID = MDNode::get(Context, MDs); 8214 // Set operand 0 to refer to the loop id itself. 8215 NewLoopID->replaceOperandWith(0, NewLoopID); 8216 L->setLoopID(NewLoopID); 8217 } 8218 } 8219 8220 //===--------------------------------------------------------------------===// 8221 // EpilogueVectorizerMainLoop 8222 //===--------------------------------------------------------------------===// 8223 8224 /// This function is partially responsible for generating the control flow 8225 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8226 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8227 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8228 Loop *Lp = createVectorLoopSkeleton(""); 8229 8230 // Generate the code to check the minimum iteration count of the vector 8231 // epilogue (see below). 8232 EPI.EpilogueIterationCountCheck = 8233 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8234 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8235 8236 // Generate the code to check any assumptions that we've made for SCEV 8237 // expressions. 8238 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8239 8240 // Generate the code that checks at runtime if arrays overlap. We put the 8241 // checks into a separate block to make the more common case of few elements 8242 // faster. 8243 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8244 8245 // Generate the iteration count check for the main loop, *after* the check 8246 // for the epilogue loop, so that the path-length is shorter for the case 8247 // that goes directly through the vector epilogue. The longer-path length for 8248 // the main loop is compensated for, by the gain from vectorizing the larger 8249 // trip count. Note: the branch will get updated later on when we vectorize 8250 // the epilogue. 
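  // The intended check order is therefore roughly: "iter.check" (is even the
  // epilogue VF worthwhile?), then the SCEV and memory runtime checks, and
  // only then "vector.main.loop.iter.check" for the main VF * UF; see the
  // block names used below and in the second-pass skeleton.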
8251 EPI.MainLoopIterationCountCheck = 8252 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8253 8254 // Generate the induction variable. 8255 OldInduction = Legal->getPrimaryInduction(); 8256 Type *IdxTy = Legal->getWidestInductionType(); 8257 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8258 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8259 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8260 EPI.VectorTripCount = CountRoundDown; 8261 Induction = 8262 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8263 getDebugLocFromInstOrOperands(OldInduction)); 8264 8265 // Skip induction resume value creation here because they will be created in 8266 // the second pass. If we created them here, they wouldn't be used anyway, 8267 // because the vplan in the second pass still contains the inductions from the 8268 // original loop. 8269 8270 return completeLoopSkeleton(Lp, OrigLoopID); 8271 } 8272 8273 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8274 LLVM_DEBUG({ 8275 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8276 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8277 << ", Main Loop UF:" << EPI.MainLoopUF 8278 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8279 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8280 }); 8281 } 8282 8283 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8284 DEBUG_WITH_TYPE(VerboseDebug, { 8285 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8286 }); 8287 } 8288 8289 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8290 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8291 assert(L && "Expected valid Loop."); 8292 assert(Bypass && "Expected valid bypass basic block."); 8293 unsigned VFactor = 8294 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8295 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8296 Value *Count = getOrCreateTripCount(L); 8297 // Reuse existing vector loop preheader for TC checks. 8298 // Note that new preheader block is generated for vector loop. 8299 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8300 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8301 8302 // Generate code to check if the loop's trip count is less than VF * UF of the 8303 // main vector loop. 8304 auto P = 8305 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8306 8307 Value *CheckMinIters = Builder.CreateICmp( 8308 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8309 "min.iters.check"); 8310 8311 if (!ForEpilogue) 8312 TCCheckBlock->setName("vector.main.loop.iter.check"); 8313 8314 // Create new preheader for vector loop. 8315 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8316 DT, LI, nullptr, "vector.ph"); 8317 8318 if (ForEpilogue) { 8319 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8320 DT->getNode(Bypass)->getIDom()) && 8321 "TC check is expected to dominate Bypass"); 8322 8323 // Update dominator for Bypass & LoopExit. 8324 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8325 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8326 8327 LoopBypassBlocks.push_back(TCCheckBlock); 8328 8329 // Save the trip count so we don't have to regenerate it in the 8330 // vec.epilog.iter.check. This is safe to do because the trip count 8331 // generated here dominates the vector epilog iter check. 
8332 EPI.TripCount = Count; 8333 } 8334 8335 ReplaceInstWithInst( 8336 TCCheckBlock->getTerminator(), 8337 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8338 8339 return TCCheckBlock; 8340 } 8341 8342 //===--------------------------------------------------------------------===// 8343 // EpilogueVectorizerEpilogueLoop 8344 //===--------------------------------------------------------------------===// 8345 8346 /// This function is partially responsible for generating the control flow 8347 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8348 BasicBlock * 8349 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8350 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8351 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8352 8353 // Now, compare the remaining count and if there aren't enough iterations to 8354 // execute the vectorized epilogue skip to the scalar part. 8355 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8356 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8357 LoopVectorPreHeader = 8358 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8359 LI, nullptr, "vec.epilog.ph"); 8360 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8361 VecEpilogueIterationCountCheck); 8362 8363 // Adjust the control flow taking the state info from the main loop 8364 // vectorization into account. 8365 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8366 "expected this to be saved from the previous pass."); 8367 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8368 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8369 8370 DT->changeImmediateDominator(LoopVectorPreHeader, 8371 EPI.MainLoopIterationCountCheck); 8372 8373 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8374 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8375 8376 if (EPI.SCEVSafetyCheck) 8377 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8378 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8379 if (EPI.MemSafetyCheck) 8380 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8381 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8382 8383 DT->changeImmediateDominator( 8384 VecEpilogueIterationCountCheck, 8385 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8386 8387 DT->changeImmediateDominator(LoopScalarPreHeader, 8388 EPI.EpilogueIterationCountCheck); 8389 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8390 8391 // Keep track of bypass blocks, as they feed start values to the induction 8392 // phis in the scalar loop preheader. 8393 if (EPI.SCEVSafetyCheck) 8394 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8395 if (EPI.MemSafetyCheck) 8396 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8397 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8398 8399 // Generate a resume induction for the vector epilogue and put it in the 8400 // vector epilogue preheader 8401 Type *IdxTy = Legal->getWidestInductionType(); 8402 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8403 LoopVectorPreHeader->getFirstNonPHI()); 8404 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8405 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8406 EPI.MainLoopIterationCountCheck); 8407 8408 // Generate the induction variable. 
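// The induction starts at the vec.epilog.resume.val phi created above; its
// shape is roughly (value names illustrative):
//   vec.epilog.ph:
//     %vec.epilog.resume.val = phi i64 [ %main.vec.trip.count, %vec.epilog.iter.check ],
//                                      [ 0, %vector.main.loop.iter.check ]
// i.e. the epilogue resumes where the main vector loop stopped, or at 0 when
// the main vector loop was skipped entirely.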
8409 OldInduction = Legal->getPrimaryInduction(); 8410 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8411 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8412 Value *StartIdx = EPResumeVal; 8413 Induction = 8414 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8415 getDebugLocFromInstOrOperands(OldInduction)); 8416 8417 // Generate induction resume values. These variables save the new starting 8418 // indexes for the scalar loop. They are used to test if there are any tail 8419 // iterations left once the vector loop has completed. 8420 // Note that when the vectorized epilogue is skipped due to the iteration 8421 // count check, the resume value for the induction variable comes from 8422 // the trip count of the main vector loop, hence passing the AdditionalBypass 8423 // argument. 8424 createInductionResumeValues(Lp, CountRoundDown, 8425 {VecEpilogueIterationCountCheck, 8426 EPI.VectorTripCount} /* AdditionalBypass */); 8427 8428 AddRuntimeUnrollDisableMetaData(Lp); 8429 return completeLoopSkeleton(Lp, OrigLoopID); 8430 } 8431 8432 BasicBlock * 8433 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8434 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8435 8436 assert(EPI.TripCount && 8437 "Expected trip count to have been saved in the first pass."); 8438 assert( 8439 (!isa<Instruction>(EPI.TripCount) || 8440 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8441 "saved trip count does not dominate insertion point."); 8442 Value *TC = EPI.TripCount; 8443 IRBuilder<> Builder(Insert->getTerminator()); 8444 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8445 8446 // Generate code to check if the number of remaining iterations is less than 8447 // VF * UF of the vector epilogue loop. 8448 auto P = 8449 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8450 8451 Value *CheckMinIters = Builder.CreateICmp( 8452 P, Count, 8453 ConstantInt::get(Count->getType(), 8454 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8455 "min.epilog.iters.check"); 8456 8457 ReplaceInstWithInst( 8458 Insert->getTerminator(), 8459 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8460 8461 LoopBypassBlocks.push_back(Insert); 8462 return Insert; 8463 } 8464 8465 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8466 LLVM_DEBUG({ 8467 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8468 << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8469 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8470 }); 8471 } 8472 8473 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8474 DEBUG_WITH_TYPE(VerboseDebug, { 8475 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8476 }); 8477 } 8478 8479 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8480 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8481 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8482 bool PredicateAtRangeStart = Predicate(Range.Start); 8483 8484 for (ElementCount TmpVF = Range.Start * 2; 8485 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8486 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8487 Range.End = TmpVF; 8488 break; 8489 } 8490 8491 return PredicateAtRangeStart; 8492 } 8493 8494 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8495 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8496 /// of VF's starting at a given VF and extending it as much as possible. Each 8497 /// vectorization decision can potentially shorten this sub-range during 8498 /// buildVPlan(). 8499 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8500 ElementCount MaxVF) { 8501 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8502 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8503 VFRange SubRange = {VF, MaxVFPlusOne}; 8504 VPlans.push_back(buildVPlan(SubRange)); 8505 VF = SubRange.End; 8506 } 8507 } 8508 8509 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8510 VPlanPtr &Plan) { 8511 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8512 8513 // Look for cached value. 8514 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8515 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8516 if (ECEntryIt != EdgeMaskCache.end()) 8517 return ECEntryIt->second; 8518 8519 VPValue *SrcMask = createBlockInMask(Src, Plan); 8520 8521 // The terminator has to be a branch inst! 8522 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8523 assert(BI && "Unexpected terminator found"); 8524 8525 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8526 return EdgeMaskCache[Edge] = SrcMask; 8527 8528 // If source is an exiting block, we know the exit edge is dynamically dead 8529 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8530 // adding uses of an otherwise potentially dead instruction. 
8531 if (OrigLoop->isLoopExiting(Src)) 8532 return EdgeMaskCache[Edge] = SrcMask; 8533 8534 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8535 assert(EdgeMask && "No Edge Mask found for condition"); 8536 8537 if (BI->getSuccessor(0) != Dst) 8538 EdgeMask = Builder.createNot(EdgeMask); 8539 8540 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8541 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8542 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8543 // The select version does not introduce new UB if SrcMask is false and 8544 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8545 VPValue *False = Plan->getOrAddVPValue( 8546 ConstantInt::getFalse(BI->getCondition()->getType())); 8547 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8548 } 8549 8550 return EdgeMaskCache[Edge] = EdgeMask; 8551 } 8552 8553 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8554 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8555 8556 // Look for cached value. 8557 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8558 if (BCEntryIt != BlockMaskCache.end()) 8559 return BCEntryIt->second; 8560 8561 // All-one mask is modelled as no-mask following the convention for masked 8562 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8563 VPValue *BlockMask = nullptr; 8564 8565 if (OrigLoop->getHeader() == BB) { 8566 if (!CM.blockNeedsPredication(BB)) 8567 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8568 8569 // Create the block in mask as the first non-phi instruction in the block. 8570 VPBuilder::InsertPointGuard Guard(Builder); 8571 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8572 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8573 8574 // Introduce the early-exit compare IV <= BTC to form header block mask. 8575 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8576 // Start by constructing the desired canonical IV. 8577 VPValue *IV = nullptr; 8578 if (Legal->getPrimaryInduction()) 8579 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8580 else { 8581 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8582 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8583 IV = IVRecipe->getVPSingleValue(); 8584 } 8585 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8586 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8587 8588 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8589 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8590 // as a second argument, we only pass the IV here and extract the 8591 // tripcount from the transform state where codegen of the VP instructions 8592 // happen. 8593 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8594 } else { 8595 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8596 } 8597 return BlockMaskCache[BB] = BlockMask; 8598 } 8599 8600 // This is the block mask. We OR all incoming edges. 8601 for (auto *Predecessor : predecessors(BB)) { 8602 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8603 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8604 return BlockMaskCache[BB] = EdgeMask; 8605 8606 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8607 BlockMask = EdgeMask; 8608 continue; 8609 } 8610 8611 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8612 } 8613 8614 return BlockMaskCache[BB] = BlockMask; 8615 } 8616 8617 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8618 ArrayRef<VPValue *> Operands, 8619 VFRange &Range, 8620 VPlanPtr &Plan) { 8621 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8622 "Must be called with either a load or store"); 8623 8624 auto willWiden = [&](ElementCount VF) -> bool { 8625 if (VF.isScalar()) 8626 return false; 8627 LoopVectorizationCostModel::InstWidening Decision = 8628 CM.getWideningDecision(I, VF); 8629 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8630 "CM decision should be taken at this point."); 8631 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8632 return true; 8633 if (CM.isScalarAfterVectorization(I, VF) || 8634 CM.isProfitableToScalarize(I, VF)) 8635 return false; 8636 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8637 }; 8638 8639 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8640 return nullptr; 8641 8642 VPValue *Mask = nullptr; 8643 if (Legal->isMaskRequired(I)) 8644 Mask = createBlockInMask(I->getParent(), Plan); 8645 8646 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8647 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8648 8649 StoreInst *Store = cast<StoreInst>(I); 8650 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8651 Mask); 8652 } 8653 8654 VPWidenIntOrFpInductionRecipe * 8655 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8656 ArrayRef<VPValue *> Operands) const { 8657 // Check if this is an integer or fp induction. If so, build the recipe that 8658 // produces its scalar and vector values. 8659 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8660 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8661 II.getKind() == InductionDescriptor::IK_FpInduction) { 8662 assert(II.getStartValue() == 8663 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8664 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8665 return new VPWidenIntOrFpInductionRecipe( 8666 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8667 } 8668 8669 return nullptr; 8670 } 8671 8672 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8673 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8674 VPlan &Plan) const { 8675 // Optimize the special case where the source is a constant integer 8676 // induction variable. Notice that we can only optimize the 'trunc' case 8677 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8678 // (c) other casts depend on pointer size. 8679 8680 // Determine whether \p K is a truncation based on an induction variable that 8681 // can be optimized. 
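// For example (illustrative): with a primary induction %iv of type i64, a use
//   %t = trunc i64 %iv to i32
// that the cost model reports as optimizable is widened directly as the
// narrower induction <i32 0, 1, 2, ...>, instead of widening %iv to
// <VF x i64> and truncating every element.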
8682 auto isOptimizableIVTruncate = 8683 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8684 return [=](ElementCount VF) -> bool { 8685 return CM.isOptimizableIVTruncate(K, VF); 8686 }; 8687 }; 8688 8689 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8690 isOptimizableIVTruncate(I), Range)) { 8691 8692 InductionDescriptor II = 8693 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8694 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8695 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8696 Start, nullptr, I); 8697 } 8698 return nullptr; 8699 } 8700 8701 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8702 ArrayRef<VPValue *> Operands, 8703 VPlanPtr &Plan) { 8704 // If all incoming values are equal, the incoming VPValue can be used directly 8705 // instead of creating a new VPBlendRecipe. 8706 VPValue *FirstIncoming = Operands[0]; 8707 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8708 return FirstIncoming == Inc; 8709 })) { 8710 return Operands[0]; 8711 } 8712 8713 // We know that all PHIs in non-header blocks are converted into selects, so 8714 // we don't have to worry about the insertion order and we can just use the 8715 // builder. At this point we generate the predication tree. There may be 8716 // duplications since this is a simple recursive scan, but future 8717 // optimizations will clean it up. 8718 SmallVector<VPValue *, 2> OperandsWithMask; 8719 unsigned NumIncoming = Phi->getNumIncomingValues(); 8720 8721 for (unsigned In = 0; In < NumIncoming; In++) { 8722 VPValue *EdgeMask = 8723 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8724 assert((EdgeMask || NumIncoming == 1) && 8725 "Multiple predecessors with one having a full mask"); 8726 OperandsWithMask.push_back(Operands[In]); 8727 if (EdgeMask) 8728 OperandsWithMask.push_back(EdgeMask); 8729 } 8730 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8731 } 8732 8733 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8734 ArrayRef<VPValue *> Operands, 8735 VFRange &Range) const { 8736 8737 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8738 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8739 Range); 8740 8741 if (IsPredicated) 8742 return nullptr; 8743 8744 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8745 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8746 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8747 ID == Intrinsic::pseudoprobe || 8748 ID == Intrinsic::experimental_noalias_scope_decl)) 8749 return nullptr; 8750 8751 auto willWiden = [&](ElementCount VF) -> bool { 8752 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8753 // The following case may be scalarized depending on the VF. 8754 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8755 // version of the instruction. 8756 // Is it beneficial to perform intrinsic call compared to lib call? 8757 bool NeedToScalarize = false; 8758 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8759 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8760 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8761 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 8762 "Either the intrinsic cost or vector call cost must be valid"); 8763 return UseVectorIntrinsic || !NeedToScalarize; 8764 }; 8765 8766 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8767 return nullptr; 8768 8769 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8770 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8771 } 8772 8773 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8774 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8775 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8776 // Instruction should be widened, unless it is scalar after vectorization, 8777 // scalarization is profitable or it is predicated. 8778 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8779 return CM.isScalarAfterVectorization(I, VF) || 8780 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8781 }; 8782 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8783 Range); 8784 } 8785 8786 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8787 ArrayRef<VPValue *> Operands) const { 8788 auto IsVectorizableOpcode = [](unsigned Opcode) { 8789 switch (Opcode) { 8790 case Instruction::Add: 8791 case Instruction::And: 8792 case Instruction::AShr: 8793 case Instruction::BitCast: 8794 case Instruction::FAdd: 8795 case Instruction::FCmp: 8796 case Instruction::FDiv: 8797 case Instruction::FMul: 8798 case Instruction::FNeg: 8799 case Instruction::FPExt: 8800 case Instruction::FPToSI: 8801 case Instruction::FPToUI: 8802 case Instruction::FPTrunc: 8803 case Instruction::FRem: 8804 case Instruction::FSub: 8805 case Instruction::ICmp: 8806 case Instruction::IntToPtr: 8807 case Instruction::LShr: 8808 case Instruction::Mul: 8809 case Instruction::Or: 8810 case Instruction::PtrToInt: 8811 case Instruction::SDiv: 8812 case Instruction::Select: 8813 case Instruction::SExt: 8814 case Instruction::Shl: 8815 case Instruction::SIToFP: 8816 case Instruction::SRem: 8817 case Instruction::Sub: 8818 case Instruction::Trunc: 8819 case Instruction::UDiv: 8820 case Instruction::UIToFP: 8821 case Instruction::URem: 8822 case Instruction::Xor: 8823 case Instruction::ZExt: 8824 return true; 8825 } 8826 return false; 8827 }; 8828 8829 if (!IsVectorizableOpcode(I->getOpcode())) 8830 return nullptr; 8831 8832 // Success: widen this instruction. 
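// (For example, a scalar 'add i32 %a, %b' handled here is emitted as one
// wide 'add <VF x i32>' per unrolled part when the recipe executes.)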
8833 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8834 } 8835 8836 void VPRecipeBuilder::fixHeaderPhis() { 8837 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8838 for (VPWidenPHIRecipe *R : PhisToFix) { 8839 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8840 VPRecipeBase *IncR = 8841 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8842 R->addOperand(IncR->getVPSingleValue()); 8843 } 8844 } 8845 8846 VPBasicBlock *VPRecipeBuilder::handleReplication( 8847 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8848 VPlanPtr &Plan) { 8849 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8850 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8851 Range); 8852 8853 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8854 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 8855 8856 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8857 IsUniform, IsPredicated); 8858 setRecipe(I, Recipe); 8859 Plan->addVPValue(I, Recipe); 8860 8861 // Find if I uses a predicated instruction. If so, it will use its scalar 8862 // value. Avoid hoisting the insert-element which packs the scalar value into 8863 // a vector value, as that happens iff all users use the vector value. 8864 for (VPValue *Op : Recipe->operands()) { 8865 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8866 if (!PredR) 8867 continue; 8868 auto *RepR = 8869 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8870 assert(RepR->isPredicated() && 8871 "expected Replicate recipe to be predicated"); 8872 RepR->setAlsoPack(false); 8873 } 8874 8875 // Finalize the recipe for Instr, first if it is not predicated. 8876 if (!IsPredicated) { 8877 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8878 VPBB->appendRecipe(Recipe); 8879 return VPBB; 8880 } 8881 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8882 assert(VPBB->getSuccessors().empty() && 8883 "VPBB has successors when handling predicated replication."); 8884 // Record predicated instructions for above packing optimizations. 8885 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8886 VPBlockUtils::insertBlockAfter(Region, VPBB); 8887 auto *RegSucc = new VPBasicBlock(); 8888 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8889 return RegSucc; 8890 } 8891 8892 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8893 VPRecipeBase *PredRecipe, 8894 VPlanPtr &Plan) { 8895 // Instructions marked for predication are replicated and placed under an 8896 // if-then construct to prevent side-effects. 8897 8898 // Generate recipes to compute the block mask for this region. 8899 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8900 8901 // Build the triangular if-then region. 8902 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8903 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8904 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8905 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8906 auto *PHIRecipe = Instr->getType()->isVoidTy() 8907 ? 
nullptr 8908 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8909 if (PHIRecipe) { 8910 Plan->removeVPValueFor(Instr); 8911 Plan->addVPValue(Instr, PHIRecipe); 8912 } 8913 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8914 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8915 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8916 8917 // Note: first set Entry as region entry and then connect successors starting 8918 // from it in order, to propagate the "parent" of each VPBasicBlock. 8919 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8920 VPBlockUtils::connectBlocks(Pred, Exit); 8921 8922 return Region; 8923 } 8924 8925 VPRecipeOrVPValueTy 8926 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8927 ArrayRef<VPValue *> Operands, 8928 VFRange &Range, VPlanPtr &Plan) { 8929 // First, check for specific widening recipes that deal with calls, memory 8930 // operations, inductions and Phi nodes. 8931 if (auto *CI = dyn_cast<CallInst>(Instr)) 8932 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8933 8934 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8935 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8936 8937 VPRecipeBase *Recipe; 8938 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8939 if (Phi->getParent() != OrigLoop->getHeader()) 8940 return tryToBlend(Phi, Operands, Plan); 8941 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8942 return toVPRecipeResult(Recipe); 8943 8944 if (Legal->isReductionVariable(Phi)) { 8945 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8946 assert(RdxDesc.getRecurrenceStartValue() == 8947 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8948 VPValue *StartV = Operands[0]; 8949 8950 auto *PhiRecipe = new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); 8951 PhisToFix.push_back(PhiRecipe); 8952 // Record the incoming value from the backedge, so we can add the incoming 8953 // value from the backedge after all recipes have been created. 8954 recordRecipeOf(cast<Instruction>( 8955 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8956 return toVPRecipeResult(PhiRecipe); 8957 } 8958 8959 return toVPRecipeResult(new VPWidenPHIRecipe(Phi)); 8960 } 8961 8962 if (isa<TruncInst>(Instr) && 8963 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8964 Range, *Plan))) 8965 return toVPRecipeResult(Recipe); 8966 8967 if (!shouldWiden(Instr, Range)) 8968 return nullptr; 8969 8970 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8971 return toVPRecipeResult(new VPWidenGEPRecipe( 8972 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8973 8974 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8975 bool InvariantCond = 8976 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8977 return toVPRecipeResult(new VPWidenSelectRecipe( 8978 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8979 } 8980 8981 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8982 } 8983 8984 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8985 ElementCount MaxVF) { 8986 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8987 8988 // Collect instructions from the original loop that will become trivially dead 8989 // in the vectorized loop. We don't need to vectorize these instructions. 
For 8990 // example, original induction update instructions can become dead because we 8991 // separately emit induction "steps" when generating code for the new loop. 8992 // Similarly, we create a new latch condition when setting up the structure 8993 // of the new loop, so the old one can become dead. 8994 SmallPtrSet<Instruction *, 4> DeadInstructions; 8995 collectTriviallyDeadInstructions(DeadInstructions); 8996 8997 // Add assume instructions we need to drop to DeadInstructions, to prevent 8998 // them from being added to the VPlan. 8999 // TODO: We only need to drop assumes in blocks that get flattend. If the 9000 // control flow is preserved, we should keep them. 9001 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 9002 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 9003 9004 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 9005 // Dead instructions do not need sinking. Remove them from SinkAfter. 9006 for (Instruction *I : DeadInstructions) 9007 SinkAfter.erase(I); 9008 9009 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 9010 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 9011 VFRange SubRange = {VF, MaxVFPlusOne}; 9012 VPlans.push_back( 9013 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 9014 VF = SubRange.End; 9015 } 9016 } 9017 9018 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 9019 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 9020 const MapVector<Instruction *, Instruction *> &SinkAfter) { 9021 9022 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9023 9024 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9025 9026 // --------------------------------------------------------------------------- 9027 // Pre-construction: record ingredients whose recipes we'll need to further 9028 // process after constructing the initial VPlan. 9029 // --------------------------------------------------------------------------- 9030 9031 // Mark instructions we'll need to sink later and their targets as 9032 // ingredients whose recipe we'll need to record. 9033 for (auto &Entry : SinkAfter) { 9034 RecipeBuilder.recordRecipeOf(Entry.first); 9035 RecipeBuilder.recordRecipeOf(Entry.second); 9036 } 9037 for (auto &Reduction : CM.getInLoopReductionChains()) { 9038 PHINode *Phi = Reduction.first; 9039 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 9040 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9041 9042 RecipeBuilder.recordRecipeOf(Phi); 9043 for (auto &R : ReductionOperations) { 9044 RecipeBuilder.recordRecipeOf(R); 9045 // For min/max reducitons, where we have a pair of icmp/select, we also 9046 // need to record the ICmp recipe, so it can be removed later. 9047 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9048 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9049 } 9050 } 9051 9052 // For each interleave group which is relevant for this (possibly trimmed) 9053 // Range, add it to the set of groups to be later applied to the VPlan and add 9054 // placeholders for its members' Recipes which we'll be replacing with a 9055 // single VPInterleaveRecipe. 
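// For example (illustrative), a factor-2 group such as the two loads in
//   for (i = 0; i < n; i++) sum += A[2*i] + A[2*i+1];
// is vectorized as a single wide load of <2 * VF x ty> followed by shuffles
// that de-interleave the even and odd elements, all emitted for one
// VPInterleaveRecipe.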
9056 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9057 auto applyIG = [IG, this](ElementCount VF) -> bool { 9058 return (VF.isVector() && // Query is illegal for VF == 1 9059 CM.getWideningDecision(IG->getInsertPos(), VF) == 9060 LoopVectorizationCostModel::CM_Interleave); 9061 }; 9062 if (!getDecisionAndClampRange(applyIG, Range)) 9063 continue; 9064 InterleaveGroups.insert(IG); 9065 for (unsigned i = 0; i < IG->getFactor(); i++) 9066 if (Instruction *Member = IG->getMember(i)) 9067 RecipeBuilder.recordRecipeOf(Member); 9068 }; 9069 9070 // --------------------------------------------------------------------------- 9071 // Build initial VPlan: Scan the body of the loop in a topological order to 9072 // visit each basic block after having visited its predecessor basic blocks. 9073 // --------------------------------------------------------------------------- 9074 9075 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9076 auto Plan = std::make_unique<VPlan>(); 9077 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 9078 Plan->setEntry(VPBB); 9079 9080 // Scan the body of the loop in a topological order to visit each basic block 9081 // after having visited its predecessor basic blocks. 9082 LoopBlocksDFS DFS(OrigLoop); 9083 DFS.perform(LI); 9084 9085 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9086 // Relevant instructions from basic block BB will be grouped into VPRecipe 9087 // ingredients and fill a new VPBasicBlock. 9088 unsigned VPBBsForBB = 0; 9089 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9090 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9091 VPBB = FirstVPBBForBB; 9092 Builder.setInsertPoint(VPBB); 9093 9094 // Introduce each ingredient into VPlan. 9095 // TODO: Model and preserve debug instrinsics in VPlan. 9096 for (Instruction &I : BB->instructionsWithoutDebug()) { 9097 Instruction *Instr = &I; 9098 9099 // First filter out irrelevant instructions, to ensure no recipes are 9100 // built for them. 9101 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9102 continue; 9103 9104 SmallVector<VPValue *, 4> Operands; 9105 auto *Phi = dyn_cast<PHINode>(Instr); 9106 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9107 Operands.push_back(Plan->getOrAddVPValue( 9108 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9109 } else { 9110 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9111 Operands = {OpRange.begin(), OpRange.end()}; 9112 } 9113 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9114 Instr, Operands, Range, Plan)) { 9115 // If Instr can be simplified to an existing VPValue, use it. 9116 if (RecipeOrValue.is<VPValue *>()) { 9117 auto *VPV = RecipeOrValue.get<VPValue *>(); 9118 Plan->addVPValue(Instr, VPV); 9119 // If the re-used value is a recipe, register the recipe for the 9120 // instruction, in case the recipe for Instr needs to be recorded. 9121 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9122 RecipeBuilder.setRecipe(Instr, R); 9123 continue; 9124 } 9125 // Otherwise, add the new recipe. 9126 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9127 for (auto *Def : Recipe->definedValues()) { 9128 auto *UV = Def->getUnderlyingValue(); 9129 Plan->addVPValue(UV, Def); 9130 } 9131 9132 RecipeBuilder.setRecipe(Instr, Recipe); 9133 VPBB->appendRecipe(Recipe); 9134 continue; 9135 } 9136 9137 // Otherwise, if all widening options failed, Instruction is to be 9138 // replicated. This may create a successor for VPBB. 
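// (For instance, a scalarized udiv that needs predication is wrapped in a
// "pred.udiv" replicate region with .entry/.if/.continue blocks, so recipes
// that follow it in the original basic block land in a fresh successor
// VPBasicBlock; see VPRecipeBuilder::createReplicateRegion.)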
9139 VPBasicBlock *NextVPBB = 9140 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9141 if (NextVPBB != VPBB) { 9142 VPBB = NextVPBB; 9143 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9144 : ""); 9145 } 9146 } 9147 } 9148 9149 RecipeBuilder.fixHeaderPhis(); 9150 9151 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9152 // may also be empty, such as the last one VPBB, reflecting original 9153 // basic-blocks with no recipes. 9154 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9155 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9156 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9157 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9158 delete PreEntry; 9159 9160 // --------------------------------------------------------------------------- 9161 // Transform initial VPlan: Apply previously taken decisions, in order, to 9162 // bring the VPlan to its final state. 9163 // --------------------------------------------------------------------------- 9164 9165 // Apply Sink-After legal constraints. 9166 for (auto &Entry : SinkAfter) { 9167 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9168 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9169 9170 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9171 auto *Region = 9172 dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9173 if (Region && Region->isReplicator()) 9174 return Region; 9175 return nullptr; 9176 }; 9177 9178 // If the target is in a replication region, make sure to move Sink to the 9179 // block after it, not into the replication region itself. 9180 if (auto *TargetRegion = GetReplicateRegion(Target)) { 9181 assert(TargetRegion->getNumSuccessors() == 1 && "Expected SESE region!"); 9182 assert(!GetReplicateRegion(Sink) && 9183 "cannot sink a region into another region yet"); 9184 VPBasicBlock *NextBlock = 9185 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9186 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9187 continue; 9188 } 9189 9190 auto *SinkRegion = GetReplicateRegion(Sink); 9191 // Unless the sink source is in a replicate region, sink the recipe 9192 // directly. 9193 if (!SinkRegion) { 9194 Sink->moveAfter(Target); 9195 continue; 9196 } 9197 9198 // If the sink source is in a replicate region, we need to move the whole 9199 // replicate region, which should only contain a single recipe in the main 9200 // block. 9201 assert(Sink->getParent()->size() == 1 && 9202 "parent must be a replicator with a single recipe"); 9203 auto *SplitBlock = 9204 Target->getParent()->splitAt(std::next(Target->getIterator())); 9205 9206 auto *Pred = SinkRegion->getSinglePredecessor(); 9207 auto *Succ = SinkRegion->getSingleSuccessor(); 9208 VPBlockUtils::disconnectBlocks(Pred, SinkRegion); 9209 VPBlockUtils::disconnectBlocks(SinkRegion, Succ); 9210 VPBlockUtils::connectBlocks(Pred, Succ); 9211 9212 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9213 9214 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9215 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9216 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9217 if (VPBB == SplitPred) 9218 VPBB = SplitBlock; 9219 } 9220 9221 // Interleave memory: for each Interleave Group we marked earlier as relevant 9222 // for this VPlan, replace the Recipes widening its memory instructions with a 9223 // single VPInterleaveRecipe at its insertion point. 
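// Illustrative sketch for a factor-2 group of loads (recipe names
// simplified):
//   before:  WIDEN load %a ... WIDEN load %b
//   after:   INTERLEAVE-GROUP { %a, %b } at the group's insert position,
//            users of %a and %b rewired to the values defined by the new
//            VPInterleaveRecipe, and the original member recipes erased.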
9224 for (auto IG : InterleaveGroups) { 9225 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9226 RecipeBuilder.getRecipe(IG->getInsertPos())); 9227 SmallVector<VPValue *, 4> StoredValues; 9228 for (unsigned i = 0; i < IG->getFactor(); ++i) 9229 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 9230 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 9231 9232 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9233 Recipe->getMask()); 9234 VPIG->insertBefore(Recipe); 9235 unsigned J = 0; 9236 for (unsigned i = 0; i < IG->getFactor(); ++i) 9237 if (Instruction *Member = IG->getMember(i)) { 9238 if (!Member->getType()->isVoidTy()) { 9239 VPValue *OriginalV = Plan->getVPValue(Member); 9240 Plan->removeVPValueFor(Member); 9241 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9242 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9243 J++; 9244 } 9245 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9246 } 9247 } 9248 9249 // Adjust the recipes for any inloop reductions. 9250 if (Range.Start.isVector()) 9251 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 9252 9253 // Finally, if tail is folded by masking, introduce selects between the phi 9254 // and the live-out instruction of each reduction, at the end of the latch. 9255 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 9256 Builder.setInsertPoint(VPBB); 9257 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9258 for (auto &Reduction : Legal->getReductionVars()) { 9259 if (CM.isInLoopReduction(Reduction.first)) 9260 continue; 9261 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 9262 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 9263 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 9264 } 9265 } 9266 9267 VPlanTransforms::sinkScalarOperands(*Plan); 9268 9269 std::string PlanName; 9270 raw_string_ostream RSO(PlanName); 9271 ElementCount VF = Range.Start; 9272 Plan->addVF(VF); 9273 RSO << "Initial VPlan for VF={" << VF; 9274 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9275 Plan->addVF(VF); 9276 RSO << "," << VF; 9277 } 9278 RSO << "},UF>=1"; 9279 RSO.flush(); 9280 Plan->setName(PlanName); 9281 9282 return Plan; 9283 } 9284 9285 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9286 // Outer loop handling: They may require CFG and instruction level 9287 // transformations before even evaluating whether vectorization is profitable. 9288 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9289 // the vectorization pipeline. 9290 assert(!OrigLoop->isInnermost()); 9291 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9292 9293 // Create new empty VPlan 9294 auto Plan = std::make_unique<VPlan>(); 9295 9296 // Build hierarchical CFG 9297 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9298 HCFGBuilder.buildHierarchicalCFG(); 9299 9300 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9301 VF *= 2) 9302 Plan->addVF(VF); 9303 9304 if (EnableVPlanPredication) { 9305 VPlanPredicator VPP(*Plan); 9306 VPP.predicate(); 9307 9308 // Avoid running transformation to recipes until masked code generation in 9309 // VPlan-native path is in place. 
9310 return Plan; 9311 } 9312 9313 SmallPtrSet<Instruction *, 1> DeadInstructions; 9314 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9315 Legal->getInductionVars(), 9316 DeadInstructions, *PSE.getSE()); 9317 return Plan; 9318 } 9319 9320 // Adjust the recipes for any inloop reductions. The chain of instructions 9321 // leading from the loop exit instr to the phi needs to be converted to 9322 // reductions, with one operand being vector and the other being the scalar 9323 // reduction chain. 9324 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 9325 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 9326 for (auto &Reduction : CM.getInLoopReductionChains()) { 9327 PHINode *Phi = Reduction.first; 9328 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9329 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9330 9331 // ReductionOperations are ordered top-down from the phi's use to the 9332 // LoopExitValue. We keep track of the previous item (the Chain) to tell 9333 // which of the two operands will remain scalar and which will be reduced. 9334 // For minmax the chain will be the select instructions. 9335 Instruction *Chain = Phi; 9336 for (Instruction *R : ReductionOperations) { 9337 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9338 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9339 9340 VPValue *ChainOp = Plan->getVPValue(Chain); 9341 unsigned FirstOpId; 9342 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9343 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9344 "Expected to replace a VPWidenSelectSC"); 9345 FirstOpId = 1; 9346 } else { 9347 assert(isa<VPWidenRecipe>(WidenRecipe) && 9348 "Expected to replace a VPWidenSC"); 9349 FirstOpId = 0; 9350 } 9351 unsigned VecOpId = 9352 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9353 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9354 9355 auto *CondOp = CM.foldTailByMasking() 9356 ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9357 : nullptr; 9358 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9359 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9360 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9361 Plan->removeVPValueFor(R); 9362 Plan->addVPValue(R, RedRecipe); 9363 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9364 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9365 WidenRecipe->eraseFromParent(); 9366 9367 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9368 VPRecipeBase *CompareRecipe = 9369 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9370 assert(isa<VPWidenRecipe>(CompareRecipe) && 9371 "Expected to replace a VPWidenSC"); 9372 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9373 "Expected no remaining users"); 9374 CompareRecipe->eraseFromParent(); 9375 } 9376 Chain = R; 9377 } 9378 } 9379 } 9380 9381 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9382 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9383 VPSlotTracker &SlotTracker) const { 9384 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9385 IG->getInsertPos()->printAsOperand(O, false); 9386 O << ", "; 9387 getAddr()->printAsOperand(O, SlotTracker); 9388 VPValue *Mask = getMask(); 9389 if (Mask) { 9390 O << ", "; 9391 Mask->printAsOperand(O, SlotTracker); 9392 } 9393 for (unsigned i = 0; i < IG->getFactor(); ++i) 9394 if (Instruction *I = IG->getMember(i)) 9395 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9396 } 9397 #endif 9398 9399 void VPWidenCallRecipe::execute(VPTransformState &State) { 9400 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9401 *this, State); 9402 } 9403 9404 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9405 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9406 this, *this, InvariantCond, State); 9407 } 9408 9409 void VPWidenRecipe::execute(VPTransformState &State) { 9410 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9411 } 9412 9413 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9414 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9415 *this, State.UF, State.VF, IsPtrLoopInvariant, 9416 IsIndexLoopInvariant, State); 9417 } 9418 9419 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9420 assert(!State.Instance && "Int or FP induction being replicated."); 9421 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9422 getTruncInst(), getVPValue(0), 9423 getCastValue(), State); 9424 } 9425 9426 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9427 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9428 this, State); 9429 } 9430 9431 void VPBlendRecipe::execute(VPTransformState &State) { 9432 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9433 // We know that all PHIs in non-header blocks are converted into 9434 // selects, so we don't have to worry about the insertion order and we 9435 // can just use the builder. 9436 // At this point we generate the predication tree. There may be 9437 // duplications since this is a simple recursive scan, but future 9438 // optimizations will clean it up. 
9439 9440 unsigned NumIncoming = getNumIncomingValues(); 9441 9442 // Generate a sequence of selects of the form: 9443 // SELECT(Mask3, In3, 9444 // SELECT(Mask2, In2, 9445 // SELECT(Mask1, In1, 9446 // In0))) 9447 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9448 // are essentially undef are taken from In0. 9449 InnerLoopVectorizer::VectorParts Entry(State.UF); 9450 for (unsigned In = 0; In < NumIncoming; ++In) { 9451 for (unsigned Part = 0; Part < State.UF; ++Part) { 9452 // We might have single edge PHIs (blocks) - use an identity 9453 // 'select' for the first PHI operand. 9454 Value *In0 = State.get(getIncomingValue(In), Part); 9455 if (In == 0) 9456 Entry[Part] = In0; // Initialize with the first incoming value. 9457 else { 9458 // Select between the current value and the previous incoming edge 9459 // based on the incoming mask. 9460 Value *Cond = State.get(getMask(In), Part); 9461 Entry[Part] = 9462 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9463 } 9464 } 9465 } 9466 for (unsigned Part = 0; Part < State.UF; ++Part) 9467 State.set(this, Entry[Part], Part); 9468 } 9469 9470 void VPInterleaveRecipe::execute(VPTransformState &State) { 9471 assert(!State.Instance && "Interleave group being replicated."); 9472 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9473 getStoredValues(), getMask()); 9474 } 9475 9476 void VPReductionRecipe::execute(VPTransformState &State) { 9477 assert(!State.Instance && "Reduction being replicated."); 9478 Value *PrevInChain = State.get(getChainOp(), 0); 9479 for (unsigned Part = 0; Part < State.UF; ++Part) { 9480 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9481 bool IsOrdered = useOrderedReductions(*RdxDesc); 9482 Value *NewVecOp = State.get(getVecOp(), Part); 9483 if (VPValue *Cond = getCondOp()) { 9484 Value *NewCond = State.get(Cond, Part); 9485 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9486 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9487 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9488 Constant *IdenVec = 9489 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9490 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9491 NewVecOp = Select; 9492 } 9493 Value *NewRed; 9494 Value *NextInChain; 9495 if (IsOrdered) { 9496 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9497 PrevInChain); 9498 PrevInChain = NewRed; 9499 } else { 9500 PrevInChain = State.get(getChainOp(), Part); 9501 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9502 } 9503 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9504 NextInChain = 9505 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9506 NewRed, PrevInChain); 9507 } else if (IsOrdered) 9508 NextInChain = NewRed; 9509 else { 9510 NextInChain = State.Builder.CreateBinOp( 9511 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9512 PrevInChain); 9513 } 9514 State.set(this, NextInChain, Part); 9515 } 9516 } 9517 9518 void VPReplicateRecipe::execute(VPTransformState &State) { 9519 if (State.Instance) { // Generate a single instance. 9520 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9521 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9522 *State.Instance, IsPredicated, State); 9523 // Insert scalar instance packing it into a vector. 9524 if (AlsoPack && State.VF.isVector()) { 9525 // If we're constructing lane 0, initialize to start from poison. 
9526 if (State.Instance->Lane.isFirstLane()) { 9527 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9528 Value *Poison = PoisonValue::get( 9529 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9530 State.set(this, Poison, State.Instance->Part); 9531 } 9532 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9533 } 9534 return; 9535 } 9536 9537 // Generate scalar instances for all VF lanes of all UF parts, unless the 9538 // instruction is uniform inwhich case generate only the first lane for each 9539 // of the UF parts. 9540 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); 9541 assert((!State.VF.isScalable() || IsUniform) && 9542 "Can't scalarize a scalable vector"); 9543 for (unsigned Part = 0; Part < State.UF; ++Part) 9544 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9545 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9546 VPIteration(Part, Lane), IsPredicated, 9547 State); 9548 } 9549 9550 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9551 assert(State.Instance && "Branch on Mask works only on single instance."); 9552 9553 unsigned Part = State.Instance->Part; 9554 unsigned Lane = State.Instance->Lane.getKnownLane(); 9555 9556 Value *ConditionBit = nullptr; 9557 VPValue *BlockInMask = getMask(); 9558 if (BlockInMask) { 9559 ConditionBit = State.get(BlockInMask, Part); 9560 if (ConditionBit->getType()->isVectorTy()) 9561 ConditionBit = State.Builder.CreateExtractElement( 9562 ConditionBit, State.Builder.getInt32(Lane)); 9563 } else // Block in mask is all-one. 9564 ConditionBit = State.Builder.getTrue(); 9565 9566 // Replace the temporary unreachable terminator with a new conditional branch, 9567 // whose two destinations will be set later when they are created. 9568 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9569 assert(isa<UnreachableInst>(CurrentTerminator) && 9570 "Expected to replace unreachable terminator with conditional branch."); 9571 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9572 CondBr->setSuccessor(0, nullptr); 9573 ReplaceInstWithInst(CurrentTerminator, CondBr); 9574 } 9575 9576 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9577 assert(State.Instance && "Predicated instruction PHI works per instance."); 9578 Instruction *ScalarPredInst = 9579 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9580 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9581 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9582 assert(PredicatingBB && "Predicated block has no single predecessor."); 9583 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9584 "operand must be VPReplicateRecipe"); 9585 9586 // By current pack/unpack logic we need to generate only a single phi node: if 9587 // a vector value for the predicated instruction exists at this point it means 9588 // the instruction has vector users only, and a phi for the vector value is 9589 // needed. In this case the recipe of the predicated instruction is marked to 9590 // also do that packing, thereby "hoisting" the insert-element sequence. 9591 // Otherwise, a phi node for the scalar value is needed. 
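// Illustrative shape of the vector (packing) case below, with hypothetical
// value names:
//   pred.load.continue:
//     %vphi = phi <VF x ty> [ %vec.before, %predicating.bb ],
//                           [ %vec.with.elt.inserted, %predicated.bb ]
// where %vec.with.elt.inserted is the insertelement emitted by the
// predicated replicate recipe.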
9592 unsigned Part = State.Instance->Part; 9593 if (State.hasVectorValue(getOperand(0), Part)) { 9594 Value *VectorValue = State.get(getOperand(0), Part); 9595 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9596 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9597 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9598 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9599 if (State.hasVectorValue(this, Part)) 9600 State.reset(this, VPhi, Part); 9601 else 9602 State.set(this, VPhi, Part); 9603 // NOTE: Currently we need to update the value of the operand, so the next 9604 // predicated iteration inserts its generated value in the correct vector. 9605 State.reset(getOperand(0), VPhi, Part); 9606 } else { 9607 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9608 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9609 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9610 PredicatingBB); 9611 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9612 if (State.hasScalarValue(this, *State.Instance)) 9613 State.reset(this, Phi, *State.Instance); 9614 else 9615 State.set(this, Phi, *State.Instance); 9616 // NOTE: Currently we need to update the value of the operand, so the next 9617 // predicated iteration inserts its generated value in the correct vector. 9618 State.reset(getOperand(0), Phi, *State.Instance); 9619 } 9620 } 9621 9622 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9623 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9624 State.ILV->vectorizeMemoryInstruction( 9625 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9626 StoredValue, getMask()); 9627 } 9628 9629 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9630 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9631 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9632 // for predication. 9633 static ScalarEpilogueLowering getScalarEpilogueLowering( 9634 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9635 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9636 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9637 LoopVectorizationLegality &LVL) { 9638 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9639 // don't look at hints or options, and don't request a scalar epilogue. 9640 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9641 // LoopAccessInfo (due to code dependency and not being able to reliably get 9642 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9643 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9644 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9645 // back to the old way and vectorize with versioning when forced. See D81345.) 
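// For example (illustrative): when the enclosing function has the optsize
// attribute (or PGSO considers the code cold and vectorization was not
// explicitly forced), the result is CM_ScalarEpilogueNotAllowedOptSize, and
// the cost model must later either fold the tail by masking or decline to
// vectorize the loop.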
9646 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9647 PGSOQueryType::IRPass) && 9648 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9649 return CM_ScalarEpilogueNotAllowedOptSize; 9650 9651 // 2) If set, obey the directives 9652 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9653 switch (PreferPredicateOverEpilogue) { 9654 case PreferPredicateTy::ScalarEpilogue: 9655 return CM_ScalarEpilogueAllowed; 9656 case PreferPredicateTy::PredicateElseScalarEpilogue: 9657 return CM_ScalarEpilogueNotNeededUsePredicate; 9658 case PreferPredicateTy::PredicateOrDontVectorize: 9659 return CM_ScalarEpilogueNotAllowedUsePredicate; 9660 }; 9661 } 9662 9663 // 3) If set, obey the hints 9664 switch (Hints.getPredicate()) { 9665 case LoopVectorizeHints::FK_Enabled: 9666 return CM_ScalarEpilogueNotNeededUsePredicate; 9667 case LoopVectorizeHints::FK_Disabled: 9668 return CM_ScalarEpilogueAllowed; 9669 }; 9670 9671 // 4) if the TTI hook indicates this is profitable, request predication. 9672 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9673 LVL.getLAI())) 9674 return CM_ScalarEpilogueNotNeededUsePredicate; 9675 9676 return CM_ScalarEpilogueAllowed; 9677 } 9678 9679 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9680 // If Values have been set for this Def return the one relevant for \p Part. 9681 if (hasVectorValue(Def, Part)) 9682 return Data.PerPartOutput[Def][Part]; 9683 9684 if (!hasScalarValue(Def, {Part, 0})) { 9685 Value *IRV = Def->getLiveInIRValue(); 9686 Value *B = ILV->getBroadcastInstrs(IRV); 9687 set(Def, B, Part); 9688 return B; 9689 } 9690 9691 Value *ScalarValue = get(Def, {Part, 0}); 9692 // If we aren't vectorizing, we can just copy the scalar map values over 9693 // to the vector map. 9694 if (VF.isScalar()) { 9695 set(Def, ScalarValue, Part); 9696 return ScalarValue; 9697 } 9698 9699 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9700 bool IsUniform = RepR && RepR->isUniform(); 9701 9702 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9703 // Check if there is a scalar value for the selected lane. 9704 if (!hasScalarValue(Def, {Part, LastLane})) { 9705 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9706 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9707 "unexpected recipe found to be invariant"); 9708 IsUniform = true; 9709 LastLane = 0; 9710 } 9711 9712 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9713 9714 // Set the insert point after the last scalarized instruction. This 9715 // ensures the insertelement sequence will directly follow the scalar 9716 // definitions. 9717 auto OldIP = Builder.saveIP(); 9718 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9719 Builder.SetInsertPoint(&*NewIP); 9720 9721 // However, if we are vectorizing, we need to construct the vector values. 9722 // If the value is known to be uniform after vectorization, we can just 9723 // broadcast the scalar value corresponding to lane zero for each unroll 9724 // iteration. Otherwise, we construct the vector values using 9725 // insertelement instructions. Since the resulting vectors are stored in 9726 // State, we will only generate the insertelements once. 9727 Value *VectorValue = nullptr; 9728 if (IsUniform) { 9729 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9730 set(Def, VectorValue, Part); 9731 } else { 9732 // Initialize packing with insertelements to start from undef. 
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Undef, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with such conversions, there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
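// Only stores of 'float' (fp32) values seed the search; the walk then follows
// operand chains upwards within the loop and reports each fpext it reaches.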
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed report, less
  // verbosely, vectorized loops and unvectorized loops that may benefit from
  // vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
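  // Besides the yes/no answer, legality analysis records additional
  // requirements (e.g. exact FP math instructions whose reordering must be
  // justified below) and may add runtime SCEV predicates to PSE.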
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem like it can possibly be correct -- what if
  // the loop is an integer loop and the vector instructions selected are purely
  // integer vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!LVL.canVectorizeFPMath(EnableStrictReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
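      // InnerLoopUnroller drives the same codegen as InnerLoopVectorizer but
      // with a vector width of one, so executing the plan below emits an
      // interleaved (unrolled) scalar loop rather than vector instructions.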
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}