//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
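//
// Illustration (not part of the pass, pseudocode only): conceptually, for a
// vectorization factor of 4 the transformation described above rewrites a
// loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// into wide operations that process four elements per iteration, with a
// scalar remainder loop for the iterations left over when n is not a
// multiple of 4 (the slice notation below is illustrative, not C):
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3]; // one SIMD add covers 4 elements
//   for (; i < n; ++i)                // scalar epilogue
//     a[i] = b[i] + c[i];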
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallSet.h" 74 #include "llvm/ADT/SmallVector.h" 75 #include "llvm/ADT/Statistic.h" 76 #include "llvm/ADT/StringRef.h" 77 #include "llvm/ADT/Twine.h" 78 #include "llvm/ADT/iterator_range.h" 79 #include "llvm/Analysis/AssumptionCache.h" 80 #include "llvm/Analysis/BasicAliasAnalysis.h" 81 #include "llvm/Analysis/BlockFrequencyInfo.h" 82 #include "llvm/Analysis/CFG.h" 83 #include "llvm/Analysis/CodeMetrics.h" 84 #include "llvm/Analysis/DemandedBits.h" 85 #include "llvm/Analysis/GlobalsModRef.h" 86 #include "llvm/Analysis/LoopAccessAnalysis.h" 87 #include "llvm/Analysis/LoopAnalysisManager.h" 88 #include "llvm/Analysis/LoopInfo.h" 89 #include "llvm/Analysis/LoopIterator.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/Metadata.h" 116 #include "llvm/IR/Module.h" 117 #include "llvm/IR/Operator.h" 118 #include "llvm/IR/PatternMatch.h" 119 #include "llvm/IR/Type.h" 120 #include "llvm/IR/Use.h" 121 #include "llvm/IR/User.h" 122 #include "llvm/IR/Value.h" 123 #include "llvm/IR/ValueHandle.h" 124 #include "llvm/IR/Verifier.h" 125 #include "llvm/InitializePasses.h" 126 #include "llvm/Pass.h" 127 #include "llvm/Support/Casting.h" 128 #include "llvm/Support/CommandLine.h" 129 #include "llvm/Support/Compiler.h" 130 #include "llvm/Support/Debug.h" 131 #include "llvm/Support/ErrorHandling.h" 132 #include "llvm/Support/InstructionCost.h" 133 #include "llvm/Support/MathExtras.h" 134 #include "llvm/Support/raw_ostream.h" 135 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 136 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 137 #include "llvm/Transforms/Utils/LoopSimplify.h" 138 #include "llvm/Transforms/Utils/LoopUtils.h" 139 #include "llvm/Transforms/Utils/LoopVersioning.h" 140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 141 #include "llvm/Transforms/Utils/SizeOpts.h" 142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 143 
#include <algorithm> 144 #include <cassert> 145 #include <cstdint> 146 #include <functional> 147 #include <iterator> 148 #include <limits> 149 #include <map> 150 #include <memory> 151 #include <string> 152 #include <tuple> 153 #include <utility> 154 155 using namespace llvm; 156 157 #define LV_NAME "loop-vectorize" 158 #define DEBUG_TYPE LV_NAME 159 160 #ifndef NDEBUG 161 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 162 #endif 163 164 /// @{ 165 /// Metadata attribute names 166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 167 const char LLVMLoopVectorizeFollowupVectorized[] = 168 "llvm.loop.vectorize.followup_vectorized"; 169 const char LLVMLoopVectorizeFollowupEpilogue[] = 170 "llvm.loop.vectorize.followup_epilogue"; 171 /// @} 172 173 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 176 177 static cl::opt<bool> EnableEpilogueVectorization( 178 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 179 cl::desc("Enable vectorization of epilogue loops.")); 180 181 static cl::opt<unsigned> EpilogueVectorizationForceVF( 182 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 183 cl::desc("When epilogue vectorization is enabled, and a value greater than " 184 "1 is specified, forces the given VF for all applicable epilogue " 185 "loops.")); 186 187 static cl::opt<unsigned> EpilogueVectorizationMinVF( 188 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 189 cl::desc("Only loops with vectorization factor equal to or larger than " 190 "the specified value are considered for epilogue vectorization.")); 191 192 /// Loops with a known constant trip count below this number are vectorized only 193 /// if no scalar iteration overheads are incurred. 194 static cl::opt<unsigned> TinyTripCountVectorThreshold( 195 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 196 cl::desc("Loops with a constant trip count that is smaller than this " 197 "value are vectorized only if no scalar iteration overheads " 198 "are incurred.")); 199 200 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 201 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 202 cl::desc("The maximum allowed number of runtime memory checks with a " 203 "vectorize(enable) pragma.")); 204 205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 206 // that predication is preferred, and this lists all options. I.e., the 207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 208 // and predicate the instructions accordingly. 
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
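//
// Illustration (pseudocode, not taken from this file): with tail-folding the
// remainder iterations are executed inside the vector loop under a mask of
// active lanes instead of in a separate scalar epilogue. For VF=4 and n=10:
//
//   for (i = 0; i < 12; i += 4) {            // trip count rounded up to 12
//     mask = {i+0 < n, i+1 < n, i+2 < n, i+3 < n};
//     masked.store(a + i,
//                  masked.load(b + i, mask) + masked.load(c + i, mask), mask);
//   }
//
// i.e. the last vector iteration runs with mask = {1, 1, 0, 0} and no scalar
// remainder loop is emitted.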
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));
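//
// Illustration (not part of the pass): the cl::opt flags above are normally
// exercised from lit tests or the command line; a plausible invocation that
// uses only options defined in this file might look like
//
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
//       -enable-interleaved-mem-accesses -vplan-print-in-dot-format \
//       -S input.ll -o output.ll
//
// (input.ll and output.ll are placeholder file names.)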
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
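//
// Worked example (explanatory only): under the 50% assumption above, a block
// that executes under a predicate is expected to run once every
// getReciprocalPredBlockProb() == 2 iterations of the loop header, so cost
// estimates for instructions in such a block are scaled down by a factor of 2
// when computing the expected per-iteration cost of the loop body.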
/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop.
Each value from the 489 /// original loop, when vectorized, is represented by UF vector values in the 490 /// new unrolled loop, where UF is the unroll factor. 491 using VectorParts = SmallVector<Value *, 2>; 492 493 /// Vectorize a single vector PHINode in a block in the VPlan-native path 494 /// only. 495 void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR, 496 VPTransformState &State); 497 498 /// A helper function to scalarize a single Instruction in the innermost loop. 499 /// Generates a sequence of scalar instances for each lane between \p MinLane 500 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 501 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 502 /// Instr's operands. 503 void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe, 504 const VPIteration &Instance, bool IfPredicateInstr, 505 VPTransformState &State); 506 507 /// Construct the vector value of a scalarized value \p V one lane at a time. 508 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 509 VPTransformState &State); 510 511 /// Try to vectorize interleaved access group \p Group with the base address 512 /// given in \p Addr, optionally masking the vector operations if \p 513 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 514 /// values in the vectorized loop. 515 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 516 ArrayRef<VPValue *> VPDefs, 517 VPTransformState &State, VPValue *Addr, 518 ArrayRef<VPValue *> StoredValues, 519 VPValue *BlockInMask = nullptr); 520 521 /// Set the debug location in the builder \p Ptr using the debug location in 522 /// \p V. If \p Ptr is None then it uses the class member's Builder. 523 void setDebugLocFromInst(const Value *V, 524 Optional<IRBuilderBase *> CustomBuilder = None); 525 526 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 527 void fixNonInductionPHIs(VPTransformState &State); 528 529 /// Returns true if the reordering of FP operations is not allowed, but we are 530 /// able to vectorize with strict in-order reductions for the given RdxDesc. 531 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); 532 533 /// Create a broadcast instruction. This method generates a broadcast 534 /// instruction (shuffle) for loop invariant values and for the induction 535 /// value. If this is the induction variable then we extend it to N, N+1, ... 536 /// this is needed because each iteration in the loop corresponds to a SIMD 537 /// element. 538 virtual Value *getBroadcastInstrs(Value *V); 539 540 /// Add metadata from one instruction to another. 541 /// 542 /// This includes both the original MDs from \p From and additional ones (\see 543 /// addNewMetadata). Use this for *newly created* instructions in the vector 544 /// loop. 545 void addMetadata(Instruction *To, Instruction *From); 546 547 /// Similar to the previous function but it adds the metadata to a 548 /// vector of instructions. 549 void addMetadata(ArrayRef<Value *> To, Instruction *From); 550 551 // Returns the resume value (bc.merge.rdx) for a reduction as 552 // generated by fixReduction. 553 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc); 554 555 protected: 556 friend class LoopVectorizationPlanner; 557 558 /// A small list of PHINodes. 559 using PhiVector = SmallVector<PHINode *, 4>; 560 561 /// A type for scalarized values in the new loop. 
Each value from the 562 /// original loop, when scalarized, is represented by UF x VF scalar values 563 /// in the new unrolled loop, where UF is the unroll factor and VF is the 564 /// vectorization factor. 565 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 566 567 /// Set up the values of the IVs correctly when exiting the vector loop. 568 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 569 Value *VectorTripCount, Value *EndValue, 570 BasicBlock *MiddleBlock, BasicBlock *VectorHeader, 571 VPlan &Plan); 572 573 /// Handle all cross-iteration phis in the header. 574 void fixCrossIterationPHIs(VPTransformState &State); 575 576 /// Create the exit value of first order recurrences in the middle block and 577 /// update their users. 578 void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, 579 VPTransformState &State); 580 581 /// Create code for the loop exit value of the reduction. 582 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 583 584 /// Clear NSW/NUW flags from reduction instructions if necessary. 585 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 586 VPTransformState &State); 587 588 /// Iteratively sink the scalarized operands of a predicated instruction into 589 /// the block that was created for it. 590 void sinkScalarOperands(Instruction *PredInst); 591 592 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 593 /// represented as. 594 void truncateToMinimalBitwidths(VPTransformState &State); 595 596 /// Returns (and creates if needed) the original loop trip count. 597 Value *getOrCreateTripCount(BasicBlock *InsertBlock); 598 599 /// Returns (and creates if needed) the trip count of the widened loop. 600 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); 601 602 /// Returns a bitcasted value to the requested vector type. 603 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 604 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 605 const DataLayout &DL); 606 607 /// Emit a bypass check to see if the vector trip count is zero, including if 608 /// it overflows. 609 void emitIterationCountCheck(BasicBlock *Bypass); 610 611 /// Emit a bypass check to see if all of the SCEV assumptions we've 612 /// had to make are correct. Returns the block containing the checks or 613 /// nullptr if no checks have been added. 614 BasicBlock *emitSCEVChecks(BasicBlock *Bypass); 615 616 /// Emit bypass checks to check any memory assumptions we may have made. 617 /// Returns the block containing the checks or nullptr if no checks have been 618 /// added. 619 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); 620 621 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 622 /// vector loop preheader, middle block and scalar preheader. 623 void createVectorLoopSkeleton(StringRef Prefix); 624 625 /// Create new phi nodes for the induction variables to resume iteration count 626 /// in the scalar epilogue, from where the vectorized loop left off. 627 /// In cases where the loop skeleton is more complicated (eg. epilogue 628 /// vectorization) and the resume values can come from an additional bypass 629 /// block, the \p AdditionalBypass pair provides information about the bypass 630 /// block and the end value on the edge from bypass to this loop. 
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  /// * Contribute to the address computation of a recipe generating a widen
  ///   memory load/store (VPWidenMemoryInstructionRecipe or
  ///   VPInterleaveRecipe).
  /// * Such a widen memory load/store has at least one underlying Instruction
  ///   that is in a basic block that needs predication and after vectorization
  ///   the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
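  //
  // Worked example (explanatory only): with an original trip count of 103,
  // VF = 4 and UF = 2, the vector loop covers 96 of the iterations
  // (12 vector iterations, each covering VF * UF = 8 original iterations),
  // the middle block then branches to the scalar preheader, and the scalar
  // remainder loop executes the remaining 7 iterations before reaching the
  // exit block.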
  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
817 /// The idea is to run the vplan on a given loop twice, firstly to setup the 818 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 819 /// from the first step and vectorize the epilogue. This is achieved by 820 /// deriving two concrete strategy classes from this base class and invoking 821 /// them in succession from the loop vectorizer planner. 822 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 823 public: 824 InnerLoopAndEpilogueVectorizer( 825 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 826 DominatorTree *DT, const TargetLibraryInfo *TLI, 827 const TargetTransformInfo *TTI, AssumptionCache *AC, 828 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 829 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 830 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 831 GeneratedRTChecks &Checks) 832 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 833 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 834 Checks), 835 EPI(EPI) {} 836 837 // Override this function to handle the more complex control flow around the 838 // three loops. 839 std::pair<BasicBlock *, Value *> 840 createVectorizedLoopSkeleton() final override { 841 return createEpilogueVectorizedLoopSkeleton(); 842 } 843 844 /// The interface for creating a vectorized skeleton using one of two 845 /// different strategies, each corresponding to one execution of the vplan 846 /// as described above. 847 virtual std::pair<BasicBlock *, Value *> 848 createEpilogueVectorizedLoopSkeleton() = 0; 849 850 /// Holds and updates state information required to vectorize the main loop 851 /// and its epilogue in two separate passes. This setup helps us avoid 852 /// regenerating and recomputing runtime safety checks. It also helps us to 853 /// shorten the iteration-count-check path length for the cases where the 854 /// iteration count of the loop is so small that the main vector loop is 855 /// completely skipped. 856 EpilogueLoopVectorizationInfo &EPI; 857 }; 858 859 /// A specialized derived class of inner loop vectorizer that performs 860 /// vectorization of *main* loops in the process of vectorizing loops and their 861 /// epilogues. 862 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 863 public: 864 EpilogueVectorizerMainLoop( 865 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 866 DominatorTree *DT, const TargetLibraryInfo *TLI, 867 const TargetTransformInfo *TTI, AssumptionCache *AC, 868 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 869 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 870 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 871 GeneratedRTChecks &Check) 872 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 873 EPI, LVL, CM, BFI, PSI, Check) {} 874 /// Implements the interface for creating a vectorized skeleton using the 875 /// *main loop* strategy (ie the first pass of vplan execution). 876 std::pair<BasicBlock *, Value *> 877 createEpilogueVectorizedLoopSkeleton() final override; 878 879 protected: 880 /// Emits an iteration count bypass check once for the main loop (when \p 881 /// ForEpilogue is false) and once for the epilogue loop (when \p 882 /// ForEpilogue is true). 
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
      BasicBlock *Bypass,
      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
  IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
968 #ifndef NDEBUG 969 static void debugVectorizationMessage(const StringRef Prefix, 970 const StringRef DebugMsg, 971 Instruction *I) { 972 dbgs() << "LV: " << Prefix << DebugMsg; 973 if (I != nullptr) 974 dbgs() << " " << *I; 975 else 976 dbgs() << '.'; 977 dbgs() << '\n'; 978 } 979 #endif 980 981 /// Create an analysis remark that explains why vectorization failed 982 /// 983 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 984 /// RemarkName is the identifier for the remark. If \p I is passed it is an 985 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 986 /// the location of the remark. \return the remark object that can be 987 /// streamed to. 988 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 989 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 990 Value *CodeRegion = TheLoop->getHeader(); 991 DebugLoc DL = TheLoop->getStartLoc(); 992 993 if (I) { 994 CodeRegion = I->getParent(); 995 // If there is no debug location attached to the instruction, revert back to 996 // using the loop's. 997 if (I->getDebugLoc()) 998 DL = I->getDebugLoc(); 999 } 1000 1001 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 1002 } 1003 1004 namespace llvm { 1005 1006 /// Return a value for Step multiplied by VF. 1007 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, 1008 int64_t Step) { 1009 assert(Ty->isIntegerTy() && "Expected an integer step"); 1010 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue()); 1011 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1012 } 1013 1014 /// Return the runtime value for VF. 1015 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { 1016 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1017 return VF.isScalable() ? B.CreateVScale(EC) : EC; 1018 } 1019 1020 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy, 1021 ElementCount VF) { 1022 assert(FTy->isFloatingPointTy() && "Expected floating point type!"); 1023 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits()); 1024 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF); 1025 return B.CreateUIToFP(RuntimeVF, FTy); 1026 } 1027 1028 void reportVectorizationFailure(const StringRef DebugMsg, 1029 const StringRef OREMsg, const StringRef ORETag, 1030 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1031 Instruction *I) { 1032 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1033 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1034 ORE->emit( 1035 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1036 << "loop not vectorized: " << OREMsg); 1037 } 1038 1039 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1040 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1041 Instruction *I) { 1042 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1043 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1044 ORE->emit( 1045 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1046 << Msg); 1047 } 1048 1049 } // end namespace llvm 1050 1051 #ifndef NDEBUG 1052 /// \return string containing a file name and a line # for the given loop. 1053 static std::string getDebugLocString(const Loop *L) { 1054 std::string Result; 1055 if (L) { 1056 raw_string_ostream OS(Result); 1057 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1058 LoopDbgLoc.print(OS); 1059 else 1060 // Just print the module name. 
1061 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1062 OS.flush(); 1063 } 1064 return Result; 1065 } 1066 #endif 1067 1068 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1069 const Instruction *Orig) { 1070 // If the loop was versioned with memchecks, add the corresponding no-alias 1071 // metadata. 1072 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1073 LVer->annotateInstWithNoAlias(To, Orig); 1074 } 1075 1076 void InnerLoopVectorizer::collectPoisonGeneratingRecipes( 1077 VPTransformState &State) { 1078 1079 // Collect recipes in the backward slice of `Root` that may generate a poison 1080 // value that is used after vectorization. 1081 SmallPtrSet<VPRecipeBase *, 16> Visited; 1082 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { 1083 SmallVector<VPRecipeBase *, 16> Worklist; 1084 Worklist.push_back(Root); 1085 1086 // Traverse the backward slice of Root through its use-def chain. 1087 while (!Worklist.empty()) { 1088 VPRecipeBase *CurRec = Worklist.back(); 1089 Worklist.pop_back(); 1090 1091 if (!Visited.insert(CurRec).second) 1092 continue; 1093 1094 // Prune search if we find another recipe generating a widen memory 1095 // instruction. Widen memory instructions involved in address computation 1096 // will lead to gather/scatter instructions, which don't need to be 1097 // handled. 1098 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || 1099 isa<VPInterleaveRecipe>(CurRec) || 1100 isa<VPScalarIVStepsRecipe>(CurRec) || 1101 isa<VPCanonicalIVPHIRecipe>(CurRec)) 1102 continue; 1103 1104 // This recipe contributes to the address computation of a widen 1105 // load/store. Collect recipe if its underlying instruction has 1106 // poison-generating flags. 1107 Instruction *Instr = CurRec->getUnderlyingInstr(); 1108 if (Instr && Instr->hasPoisonGeneratingFlags()) 1109 State.MayGeneratePoisonRecipes.insert(CurRec); 1110 1111 // Add new definitions to the worklist. 1112 for (VPValue *operand : CurRec->operands()) 1113 if (VPDef *OpDef = operand->getDef()) 1114 Worklist.push_back(cast<VPRecipeBase>(OpDef)); 1115 } 1116 }); 1117 1118 // Traverse all the recipes in the VPlan and collect the poison-generating 1119 // recipes in the backward slice starting at the address of a VPWidenRecipe or 1120 // VPInterleaveRecipe. 1121 auto Iter = depth_first( 1122 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry())); 1123 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 1124 for (VPRecipeBase &Recipe : *VPBB) { 1125 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { 1126 Instruction &UnderlyingInstr = WidenRec->getIngredient(); 1127 VPDef *AddrDef = WidenRec->getAddr()->getDef(); 1128 if (AddrDef && WidenRec->isConsecutive() && 1129 Legal->blockNeedsPredication(UnderlyingInstr.getParent())) 1130 collectPoisonGeneratingInstrsInBackwardSlice( 1131 cast<VPRecipeBase>(AddrDef)); 1132 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { 1133 VPDef *AddrDef = InterleaveRec->getAddr()->getDef(); 1134 if (AddrDef) { 1135 // Check if any member of the interleave group needs predication. 
1136 const InterleaveGroup<Instruction> *InterGroup = 1137 InterleaveRec->getInterleaveGroup(); 1138 bool NeedPredication = false; 1139 for (int I = 0, NumMembers = InterGroup->getNumMembers(); 1140 I < NumMembers; ++I) { 1141 Instruction *Member = InterGroup->getMember(I); 1142 if (Member) 1143 NeedPredication |= 1144 Legal->blockNeedsPredication(Member->getParent()); 1145 } 1146 1147 if (NeedPredication) 1148 collectPoisonGeneratingInstrsInBackwardSlice( 1149 cast<VPRecipeBase>(AddrDef)); 1150 } 1151 } 1152 } 1153 } 1154 } 1155 1156 void InnerLoopVectorizer::addMetadata(Instruction *To, 1157 Instruction *From) { 1158 propagateMetadata(To, From); 1159 addNewMetadata(To, From); 1160 } 1161 1162 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1163 Instruction *From) { 1164 for (Value *V : To) { 1165 if (Instruction *I = dyn_cast<Instruction>(V)) 1166 addMetadata(I, From); 1167 } 1168 } 1169 1170 PHINode *InnerLoopVectorizer::getReductionResumeValue( 1171 const RecurrenceDescriptor &RdxDesc) { 1172 auto It = ReductionResumeValues.find(&RdxDesc); 1173 assert(It != ReductionResumeValues.end() && 1174 "Expected to find a resume value for the reduction."); 1175 return It->second; 1176 } 1177 1178 namespace llvm { 1179 1180 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1181 // lowered. 1182 enum ScalarEpilogueLowering { 1183 1184 // The default: allowing scalar epilogues. 1185 CM_ScalarEpilogueAllowed, 1186 1187 // Vectorization with OptForSize: don't allow epilogues. 1188 CM_ScalarEpilogueNotAllowedOptSize, 1189 1190 // A special case of vectorisation with OptForSize: loops with a very small 1191 // trip count are considered for vectorization under OptForSize, thereby 1192 // making sure the cost of their loop body is dominant, free of runtime 1193 // guards and scalar iteration overheads. 1194 CM_ScalarEpilogueNotAllowedLowTripLoop, 1195 1196 // Loop hint predicate indicating an epilogue is undesired. 1197 CM_ScalarEpilogueNotNeededUsePredicate, 1198 1199 // Directive indicating we must either tail fold or not vectorize 1200 CM_ScalarEpilogueNotAllowedUsePredicate 1201 }; 1202 1203 /// ElementCountComparator creates a total ordering for ElementCount 1204 /// for the purposes of using it in a set structure. 1205 struct ElementCountComparator { 1206 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { 1207 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < 1208 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); 1209 } 1210 }; 1211 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>; 1212 1213 /// LoopVectorizationCostModel - estimates the expected speedups due to 1214 /// vectorization. 1215 /// In many cases vectorization is not profitable. This can happen because of 1216 /// a number of reasons. In this class we mainly attempt to predict the 1217 /// expected speedup/slowdowns due to the supported instruction set. We use the 1218 /// TargetTransformInfo to query the different backends for the cost of 1219 /// different operations. 
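//
// Worked example (explanatory only, numbers are hypothetical): if the scalar
// loop body costs 8 per iteration and the widened body for VF=4 costs 12 per
// vector iteration, the per-lane cost is 12/4 = 3 < 8, so VF=4 is deemed
// profitable; if the widened body instead cost 40 (e.g. due to scalarized
// divisions), the per-lane cost would be 10 > 8 and the scalar loop would be
// preferred.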
1220 class LoopVectorizationCostModel { 1221 public: 1222 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1223 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1224 LoopVectorizationLegality *Legal, 1225 const TargetTransformInfo &TTI, 1226 const TargetLibraryInfo *TLI, DemandedBits *DB, 1227 AssumptionCache *AC, 1228 OptimizationRemarkEmitter *ORE, const Function *F, 1229 const LoopVectorizeHints *Hints, 1230 InterleavedAccessInfo &IAI) 1231 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1232 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1233 Hints(Hints), InterleaveInfo(IAI) {} 1234 1235 /// \return An upper bound for the vectorization factors (both fixed and 1236 /// scalable). If the factors are 0, vectorization and interleaving should be 1237 /// avoided up front. 1238 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1239 1240 /// \return True if runtime checks are required for vectorization, and false 1241 /// otherwise. 1242 bool runtimeChecksRequired(); 1243 1244 /// \return The most profitable vectorization factor and the cost of that VF. 1245 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO 1246 /// then this vectorization factor will be selected if vectorization is 1247 /// possible. 1248 VectorizationFactor 1249 selectVectorizationFactor(const ElementCountSet &CandidateVFs); 1250 1251 VectorizationFactor 1252 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1253 const LoopVectorizationPlanner &LVP); 1254 1255 /// Setup cost-based decisions for user vectorization factor. 1256 /// \return true if the UserVF is a feasible VF to be chosen. 1257 bool selectUserVectorizationFactor(ElementCount UserVF) { 1258 collectUniformsAndScalars(UserVF); 1259 collectInstsToScalarize(UserVF); 1260 return expectedCost(UserVF).first.isValid(); 1261 } 1262 1263 /// \return The size (in bits) of the smallest and widest types in the code 1264 /// that needs to be vectorized. We ignore values that remain scalar such as 1265 /// 64 bit loop indices. 1266 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1267 1268 /// \return The desired interleave count. 1269 /// If interleave count has been specified by metadata it will be returned. 1270 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1271 /// are the selected vectorization factor and the cost of the selected VF. 1272 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1273 1274 /// Memory access instruction may be vectorized in more than one way. 1275 /// Form of instruction after vectorization depends on cost. 1276 /// This function takes cost-based decisions for Load/Store instructions 1277 /// and collects them in a map. This decisions map is used for building 1278 /// the lists of loop-uniform and loop-scalar instructions. 1279 /// The calculated cost is saved with widening decision in order to 1280 /// avoid redundant calculations. 1281 void setCostBasedWideningDecision(ElementCount VF); 1282 1283 /// A struct that represents some properties of the register usage 1284 /// of a loop. 1285 struct RegisterUsage { 1286 /// Holds the number of loop invariant values that are used in the loop. 1287 /// The key is ClassID of target-provided register class. 1288 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1289 /// Holds the maximum number of concurrent live intervals in the loop. 1290 /// The key is ClassID of target-provided register class. 
1291 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1292 }; 1293 1294 /// \return Returns information about the register usages of the loop for the 1295 /// given vectorization factors. 1296 SmallVector<RegisterUsage, 8> 1297 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1298 1299 /// Collect values we want to ignore in the cost model. 1300 void collectValuesToIgnore(); 1301 1302 /// Collect all element types in the loop for which widening is needed. 1303 void collectElementTypesForWidening(); 1304 1305 /// Split reductions into those that happen in the loop, and those that happen 1306 /// outside. In loop reductions are collected into InLoopReductionChains. 1307 void collectInLoopReductions(); 1308 1309 /// Returns true if we should use strict in-order reductions for the given 1310 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1311 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1312 /// of FP operations. 1313 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) { 1314 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1315 } 1316 1317 /// \returns The smallest bitwidth each instruction can be represented with. 1318 /// The vector equivalents of these instructions should be truncated to this 1319 /// type. 1320 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1321 return MinBWs; 1322 } 1323 1324 /// \returns True if it is more profitable to scalarize instruction \p I for 1325 /// vectorization factor \p VF. 1326 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1327 assert(VF.isVector() && 1328 "Profitable to scalarize relevant only for VF > 1."); 1329 1330 // Cost model is not run in the VPlan-native path - return conservative 1331 // result until this changes. 1332 if (EnableVPlanNativePath) 1333 return false; 1334 1335 auto Scalars = InstsToScalarize.find(VF); 1336 assert(Scalars != InstsToScalarize.end() && 1337 "VF not yet analyzed for scalarization profitability"); 1338 return Scalars->second.find(I) != Scalars->second.end(); 1339 } 1340 1341 /// Returns true if \p I is known to be uniform after vectorization. 1342 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1343 if (VF.isScalar()) 1344 return true; 1345 1346 // Cost model is not run in the VPlan-native path - return conservative 1347 // result until this changes. 1348 if (EnableVPlanNativePath) 1349 return false; 1350 1351 auto UniformsPerVF = Uniforms.find(VF); 1352 assert(UniformsPerVF != Uniforms.end() && 1353 "VF not yet analyzed for uniformity"); 1354 return UniformsPerVF->second.count(I); 1355 } 1356 1357 /// Returns true if \p I is known to be scalar after vectorization. 1358 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1359 if (VF.isScalar()) 1360 return true; 1361 1362 // Cost model is not run in the VPlan-native path - return conservative 1363 // result until this changes. 1364 if (EnableVPlanNativePath) 1365 return false; 1366 1367 auto ScalarsPerVF = Scalars.find(VF); 1368 assert(ScalarsPerVF != Scalars.end() && 1369 "Scalar values are not calculated for VF"); 1370 return ScalarsPerVF->second.count(I); 1371 } 1372 1373 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1374 /// for vectorization factor \p VF. 
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
1454 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1455 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1456 1457 // If the truncate is free for the given types, return false. Replacing a 1458 // free truncate with an induction variable would add an induction variable 1459 // update instruction to each iteration of the loop. We exclude from this 1460 // check the primary induction variable since it will need an update 1461 // instruction regardless. 1462 Value *Op = Trunc->getOperand(0); 1463 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1464 return false; 1465 1466 // If the truncated value is not an induction variable, return false. 1467 return Legal->isInductionPhi(Op); 1468 } 1469 1470 /// Collects the instructions to scalarize for each predicated instruction in 1471 /// the loop. 1472 void collectInstsToScalarize(ElementCount VF); 1473 1474 /// Collect Uniform and Scalar values for the given \p VF. 1475 /// The sets depend on CM decision for Load/Store instructions 1476 /// that may be vectorized as interleave, gather-scatter or scalarized. 1477 void collectUniformsAndScalars(ElementCount VF) { 1478 // Do the analysis once. 1479 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1480 return; 1481 setCostBasedWideningDecision(VF); 1482 collectLoopUniforms(VF); 1483 collectLoopScalars(VF); 1484 } 1485 1486 /// Returns true if the target machine supports masked store operation 1487 /// for the given \p DataType and kind of access to \p Ptr. 1488 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1489 return Legal->isConsecutivePtr(DataType, Ptr) && 1490 TTI.isLegalMaskedStore(DataType, Alignment); 1491 } 1492 1493 /// Returns true if the target machine supports masked load operation 1494 /// for the given \p DataType and kind of access to \p Ptr. 1495 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1496 return Legal->isConsecutivePtr(DataType, Ptr) && 1497 TTI.isLegalMaskedLoad(DataType, Alignment); 1498 } 1499 1500 /// Returns true if the target machine can represent \p V as a masked gather 1501 /// or scatter operation. 1502 bool isLegalGatherOrScatter(Value *V, 1503 ElementCount VF = ElementCount::getFixed(1)) { 1504 bool LI = isa<LoadInst>(V); 1505 bool SI = isa<StoreInst>(V); 1506 if (!LI && !SI) 1507 return false; 1508 auto *Ty = getLoadStoreType(V); 1509 Align Align = getLoadStoreAlignment(V); 1510 if (VF.isVector()) 1511 Ty = VectorType::get(Ty, VF); 1512 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1513 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1514 } 1515 1516 /// Returns true if the target machine supports all of the reduction 1517 /// variables found for the given VF. 1518 bool canVectorizeReductions(ElementCount VF) const { 1519 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1520 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1521 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1522 })); 1523 } 1524 1525 /// Returns true if \p I is an instruction that will be scalarized with 1526 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1527 /// instructions include conditional stores and instructions that may divide 1528 /// by zero. 1529 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1530 1531 // Returns true if \p I is an instruction that will be predicated either 1532 // through scalar predication or masked load/store or masked gather/scatter. 
1533 // \p VF is the vectorization factor that will be used to vectorize \p I. 1534 // Superset of instructions that return true for isScalarWithPredication. 1535 bool isPredicatedInst(Instruction *I, ElementCount VF, 1536 bool IsKnownUniform = false) { 1537 // When we know the load is uniform and the original scalar loop was not 1538 // predicated we don't need to mark it as a predicated instruction. Any 1539 // vectorised blocks created when tail-folding are something artificial we 1540 // have introduced and we know there is always at least one active lane. 1541 // That's why we call Legal->blockNeedsPredication here because it doesn't 1542 // query tail-folding. 1543 if (IsKnownUniform && isa<LoadInst>(I) && 1544 !Legal->blockNeedsPredication(I->getParent())) 1545 return false; 1546 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1547 return false; 1548 // Loads and stores that need some form of masked operation are predicated 1549 // instructions. 1550 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1551 return Legal->isMaskRequired(I); 1552 return isScalarWithPredication(I, VF); 1553 } 1554 1555 /// Returns true if \p I is a memory instruction with consecutive memory 1556 /// access that can be widened. 1557 bool 1558 memoryInstructionCanBeWidened(Instruction *I, 1559 ElementCount VF = ElementCount::getFixed(1)); 1560 1561 /// Returns true if \p I is a memory instruction in an interleaved-group 1562 /// of memory accesses that can be vectorized with wide vector loads/stores 1563 /// and shuffles. 1564 bool 1565 interleavedAccessCanBeWidened(Instruction *I, 1566 ElementCount VF = ElementCount::getFixed(1)); 1567 1568 /// Check if \p Instr belongs to any interleaved access group. 1569 bool isAccessInterleaved(Instruction *Instr) { 1570 return InterleaveInfo.isInterleaved(Instr); 1571 } 1572 1573 /// Get the interleaved access group that \p Instr belongs to. 1574 const InterleaveGroup<Instruction> * 1575 getInterleavedAccessGroup(Instruction *Instr) { 1576 return InterleaveInfo.getInterleaveGroup(Instr); 1577 } 1578 1579 /// Returns true if we're required to use a scalar epilogue for at least 1580 /// the final iteration of the original loop. 1581 bool requiresScalarEpilogue(ElementCount VF) const { 1582 if (!isScalarEpilogueAllowed()) 1583 return false; 1584 // If we might exit from anywhere but the latch, must run the exiting 1585 // iteration in scalar form. 1586 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1587 return true; 1588 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue(); 1589 } 1590 1591 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1592 /// loop hint annotation. 1593 bool isScalarEpilogueAllowed() const { 1594 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1595 } 1596 1597 /// Returns true if all loop blocks should be masked to fold tail loop. 1598 bool foldTailByMasking() const { return FoldTailByMasking; } 1599 1600 /// Returns true if the instructions in this block requires predication 1601 /// for any reason, e.g. because tail folding now requires a predicate 1602 /// or because the block in the original loop was predicated. 1603 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { 1604 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1605 } 1606 1607 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1608 /// nodes to the chain of instructions representing the reductions. Uses a 1609 /// MapVector to ensure deterministic iteration order. 
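  /// For illustration only (hypothetical source, not tied to a test case):
  /// for an in-loop integer sum reduction such as
  ///   for (i = 0; i < n; ++i) s += a[i];
  /// the PHI node for 's' maps to the chain of instructions that accumulate
  /// a[i] into s (here, the single 'add').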
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, and otherwise returns the value
  /// returned by the corresponding TTI method.
  Optional<unsigned> getVScaleForTuning() const;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the targets vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or isn't divisible by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
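  /// Illustrative example (hypothetical IR, not from a specific test): if a
  /// predicated 'udiv' is going to be scalarized anyway, it may be cheaper to
  /// also scalarize a single-use 'add' that feeds it than to keep that 'add'
  /// vectorized and extract a lane from it; the returned discount captures
  /// that saving.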
1811 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1812 ElementCount VF); 1813 1814 /// Collect the instructions that are uniform after vectorization. An 1815 /// instruction is uniform if we represent it with a single scalar value in 1816 /// the vectorized loop corresponding to each vector iteration. Examples of 1817 /// uniform instructions include pointer operands of consecutive or 1818 /// interleaved memory accesses. Note that although uniformity implies an 1819 /// instruction will be scalar, the reverse is not true. In general, a 1820 /// scalarized instruction will be represented by VF scalar values in the 1821 /// vectorized loop, each corresponding to an iteration of the original 1822 /// scalar loop. 1823 void collectLoopUniforms(ElementCount VF); 1824 1825 /// Collect the instructions that are scalar after vectorization. An 1826 /// instruction is scalar if it is known to be uniform or will be scalarized 1827 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1828 /// to the list if they are used by a load/store instruction that is marked as 1829 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1830 /// VF values in the vectorized loop, each corresponding to an iteration of 1831 /// the original scalar loop. 1832 void collectLoopScalars(ElementCount VF); 1833 1834 /// Keeps cost model vectorization decision and cost for instructions. 1835 /// Right now it is used for memory instructions only. 1836 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1837 std::pair<InstWidening, InstructionCost>>; 1838 1839 DecisionList WideningDecisions; 1840 1841 /// Returns true if \p V is expected to be vectorized and it needs to be 1842 /// extracted. 1843 bool needsExtract(Value *V, ElementCount VF) const { 1844 Instruction *I = dyn_cast<Instruction>(V); 1845 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1846 TheLoop->isLoopInvariant(I)) 1847 return false; 1848 1849 // Assume we can vectorize V (and hence we need extraction) if the 1850 // scalars are not computed yet. This can happen, because it is called 1851 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1852 // the scalars are collected. That should be a safe assumption in most 1853 // cases, because we check if the operands have vectorizable types 1854 // beforehand in LoopVectorizationLegality. 1855 return Scalars.find(VF) == Scalars.end() || 1856 !isScalarAfterVectorization(I, VF); 1857 }; 1858 1859 /// Returns a range containing only operands needing to be extracted. 1860 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1861 ElementCount VF) const { 1862 return SmallVector<Value *, 4>(make_filter_range( 1863 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1864 } 1865 1866 /// Determines if we have the infrastructure to vectorize loop \p L and its 1867 /// epilogue, assuming the main loop is vectorized by \p VF. 1868 bool isCandidateForEpilogueVectorization(const Loop &L, 1869 const ElementCount VF) const; 1870 1871 /// Returns true if epilogue vectorization is considered profitable, and 1872 /// false otherwise. 1873 /// \p VF is the vectorization factor chosen for the original loop. 1874 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1875 1876 public: 1877 /// The loop that we evaluate. 1878 Loop *TheLoop; 1879 1880 /// Predicated scalar evolution analysis. 1881 PredicatedScalarEvolution &PSE; 1882 1883 /// Loop Info analysis. 
1884 LoopInfo *LI; 1885 1886 /// Vectorization legality. 1887 LoopVectorizationLegality *Legal; 1888 1889 /// Vector target information. 1890 const TargetTransformInfo &TTI; 1891 1892 /// Target Library Info. 1893 const TargetLibraryInfo *TLI; 1894 1895 /// Demanded bits analysis. 1896 DemandedBits *DB; 1897 1898 /// Assumption cache. 1899 AssumptionCache *AC; 1900 1901 /// Interface to emit optimization remarks. 1902 OptimizationRemarkEmitter *ORE; 1903 1904 const Function *TheFunction; 1905 1906 /// Loop Vectorize Hint. 1907 const LoopVectorizeHints *Hints; 1908 1909 /// The interleave access information contains groups of interleaved accesses 1910 /// with the same stride and close to each other. 1911 InterleavedAccessInfo &InterleaveInfo; 1912 1913 /// Values to ignore in the cost model. 1914 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1915 1916 /// Values to ignore in the cost model when VF > 1. 1917 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1918 1919 /// All element types found in the loop. 1920 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1921 1922 /// Profitable vector factors. 1923 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1924 }; 1925 } // end namespace llvm 1926 1927 /// Helper struct to manage generating runtime checks for vectorization. 1928 /// 1929 /// The runtime checks are created up-front in temporary blocks to allow better 1930 /// estimating the cost and un-linked from the existing IR. After deciding to 1931 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1932 /// temporary blocks are completely removed. 1933 class GeneratedRTChecks { 1934 /// Basic block which contains the generated SCEV checks, if any. 1935 BasicBlock *SCEVCheckBlock = nullptr; 1936 1937 /// The value representing the result of the generated SCEV checks. If it is 1938 /// nullptr, either no SCEV checks have been generated or they have been used. 1939 Value *SCEVCheckCond = nullptr; 1940 1941 /// Basic block which contains the generated memory runtime checks, if any. 1942 BasicBlock *MemCheckBlock = nullptr; 1943 1944 /// The value representing the result of the generated memory runtime checks. 1945 /// If it is nullptr, either no memory runtime checks have been generated or 1946 /// they have been used. 1947 Value *MemRuntimeCheckCond = nullptr; 1948 1949 DominatorTree *DT; 1950 LoopInfo *LI; 1951 1952 SCEVExpander SCEVExp; 1953 SCEVExpander MemCheckExp; 1954 1955 public: 1956 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1957 const DataLayout &DL) 1958 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1959 MemCheckExp(SE, DL, "scev.check") {} 1960 1961 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1962 /// accurately estimate the cost of the runtime checks. The blocks are 1963 /// un-linked from the IR and is added back during vector code generation. If 1964 /// there is no vector code generation, the check blocks are removed 1965 /// completely. 1966 void Create(Loop *L, const LoopAccessInfo &LAI, 1967 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1968 1969 BasicBlock *LoopHeader = L->getHeader(); 1970 BasicBlock *Preheader = L->getLoopPreheader(); 1971 1972 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1973 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1974 // may be used by SCEVExpander. The blocks will be un-linked from their 1975 // predecessors and removed from LI & DT at the end of the function. 
1976 if (!UnionPred.isAlwaysTrue()) { 1977 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1978 nullptr, "vector.scevcheck"); 1979 1980 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1981 &UnionPred, SCEVCheckBlock->getTerminator()); 1982 } 1983 1984 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1985 if (RtPtrChecking.Need) { 1986 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1987 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1988 "vector.memcheck"); 1989 1990 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1991 if (DiffChecks) { 1992 MemRuntimeCheckCond = addDiffRuntimeChecks( 1993 MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, 1994 [VF](IRBuilderBase &B, unsigned Bits) { 1995 return getRuntimeVF(B, B.getIntNTy(Bits), VF); 1996 }, 1997 IC); 1998 } else { 1999 MemRuntimeCheckCond = 2000 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 2001 RtPtrChecking.getChecks(), MemCheckExp); 2002 } 2003 assert(MemRuntimeCheckCond && 2004 "no RT checks generated although RtPtrChecking " 2005 "claimed checks are required"); 2006 } 2007 2008 if (!MemCheckBlock && !SCEVCheckBlock) 2009 return; 2010 2011 // Unhook the temporary block with the checks, update various places 2012 // accordingly. 2013 if (SCEVCheckBlock) 2014 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2015 if (MemCheckBlock) 2016 MemCheckBlock->replaceAllUsesWith(Preheader); 2017 2018 if (SCEVCheckBlock) { 2019 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2020 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2021 Preheader->getTerminator()->eraseFromParent(); 2022 } 2023 if (MemCheckBlock) { 2024 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2025 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2026 Preheader->getTerminator()->eraseFromParent(); 2027 } 2028 2029 DT->changeImmediateDominator(LoopHeader, Preheader); 2030 if (MemCheckBlock) { 2031 DT->eraseNode(MemCheckBlock); 2032 LI->removeBlock(MemCheckBlock); 2033 } 2034 if (SCEVCheckBlock) { 2035 DT->eraseNode(SCEVCheckBlock); 2036 LI->removeBlock(SCEVCheckBlock); 2037 } 2038 } 2039 2040 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2041 /// unused. 2042 ~GeneratedRTChecks() { 2043 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2044 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2045 if (!SCEVCheckCond) 2046 SCEVCleaner.markResultUsed(); 2047 2048 if (!MemRuntimeCheckCond) 2049 MemCheckCleaner.markResultUsed(); 2050 2051 if (MemRuntimeCheckCond) { 2052 auto &SE = *MemCheckExp.getSE(); 2053 // Memory runtime check generation creates compares that use expanded 2054 // values. Remove them before running the SCEVExpanderCleaners. 2055 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2056 if (MemCheckExp.isInsertedInstruction(&I)) 2057 continue; 2058 SE.forgetValue(&I); 2059 I.eraseFromParent(); 2060 } 2061 } 2062 MemCheckCleaner.cleanup(); 2063 SCEVCleaner.cleanup(); 2064 2065 if (SCEVCheckCond) 2066 SCEVCheckBlock->eraseFromParent(); 2067 if (MemRuntimeCheckCond) 2068 MemCheckBlock->eraseFromParent(); 2069 } 2070 2071 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2072 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2073 /// depending on the generated condition. 
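  /// A rough sketch of the resulting control flow (assuming a SCEV check
  /// block was generated):
  ///
  ///   Pred --> SCEVCheckBlock --(check failed)--> Bypass
  ///                  \-----(check passed)-----> LoopVectorPreHeader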
2074 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2075 BasicBlock *LoopVectorPreHeader, 2076 BasicBlock *LoopExitBlock) { 2077 if (!SCEVCheckCond) 2078 return nullptr; 2079 2080 Value *Cond = SCEVCheckCond; 2081 // Mark the check as used, to prevent it from being removed during cleanup. 2082 SCEVCheckCond = nullptr; 2083 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2084 if (C->isZero()) 2085 return nullptr; 2086 2087 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2088 2089 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2090 // Create new preheader for vector loop. 2091 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2092 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2093 2094 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2095 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2096 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2097 SCEVCheckBlock); 2098 2099 DT->addNewBlock(SCEVCheckBlock, Pred); 2100 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2101 2102 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2103 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2104 return SCEVCheckBlock; 2105 } 2106 2107 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2108 /// the branches to branch to the vector preheader or \p Bypass, depending on 2109 /// the generated condition. 2110 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2111 BasicBlock *LoopVectorPreHeader) { 2112 // Check if we generated code that checks in runtime if arrays overlap. 2113 if (!MemRuntimeCheckCond) 2114 return nullptr; 2115 2116 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2117 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2118 MemCheckBlock); 2119 2120 DT->addNewBlock(MemCheckBlock, Pred); 2121 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2122 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2123 2124 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2125 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2126 2127 ReplaceInstWithInst( 2128 MemCheckBlock->getTerminator(), 2129 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2130 MemCheckBlock->getTerminator()->setDebugLoc( 2131 Pred->getTerminator()->getDebugLoc()); 2132 2133 // Mark the check as used, to prevent it from being removed during cleanup. 2134 MemRuntimeCheckCond = nullptr; 2135 return MemCheckBlock; 2136 } 2137 }; 2138 2139 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2140 // vectorization. The loop needs to be annotated with #pragma omp simd 2141 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2142 // vector length information is not provided, vectorization is not considered 2143 // explicit. Interleave hints are not allowed either. These limitations will be 2144 // relaxed in the future. 2145 // Please, note that we are currently forced to abuse the pragma 'clang 2146 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2147 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2148 // provides *explicit vectorization hints* (LV can bypass legal checks and 2149 // assume that vectorization is legal). However, both hints are implemented 2150 // using the same metadata (llvm.loop.vectorize, processed by 2151 // LoopVectorizeHints). This will be fixed in the future when the native IR 2152 // representation for pragma 'omp simd' is introduced. 
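// For illustration, an outer loop that would be treated as explicitly
// vectorized might be annotated as follows (the concrete pragma spelling is
// shown only as an example):
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // outer loop with an explicit VF of 4
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
//
// Without an explicit vector length (simdlen/vectorize_width) the hint does
// not make the vectorization explicit, and interleave hints are rejected.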
2153 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2154 OptimizationRemarkEmitter *ORE) { 2155 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2156 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2157 2158 // Only outer loops with an explicit vectorization hint are supported. 2159 // Unannotated outer loops are ignored. 2160 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2161 return false; 2162 2163 Function *Fn = OuterLp->getHeader()->getParent(); 2164 if (!Hints.allowVectorization(Fn, OuterLp, 2165 true /*VectorizeOnlyWhenForced*/)) { 2166 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2167 return false; 2168 } 2169 2170 if (Hints.getInterleave() > 1) { 2171 // TODO: Interleave support is future work. 2172 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2173 "outer loops.\n"); 2174 Hints.emitRemarkWithHints(); 2175 return false; 2176 } 2177 2178 return true; 2179 } 2180 2181 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2182 OptimizationRemarkEmitter *ORE, 2183 SmallVectorImpl<Loop *> &V) { 2184 // Collect inner loops and outer loops without irreducible control flow. For 2185 // now, only collect outer loops that have explicit vectorization hints. If we 2186 // are stress testing the VPlan H-CFG construction, we collect the outermost 2187 // loop of every loop nest. 2188 if (L.isInnermost() || VPlanBuildStressTest || 2189 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2190 LoopBlocksRPO RPOT(&L); 2191 RPOT.perform(LI); 2192 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2193 V.push_back(&L); 2194 // TODO: Collect inner loops inside marked outer loops in case 2195 // vectorization fails for the outer loop. Do not invoke 2196 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2197 // already known to be reducible. We can use an inherited attribute for 2198 // that. 2199 return; 2200 } 2201 } 2202 for (Loop *InnerL : L) 2203 collectSupportedLoops(*InnerL, LI, ORE, V); 2204 } 2205 2206 namespace { 2207 2208 /// The LoopVectorize Pass. 2209 struct LoopVectorize : public FunctionPass { 2210 /// Pass identification, replacement for typeid 2211 static char ID; 2212 2213 LoopVectorizePass Impl; 2214 2215 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2216 bool VectorizeOnlyWhenForced = false) 2217 : FunctionPass(ID), 2218 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2219 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2220 } 2221 2222 bool runOnFunction(Function &F) override { 2223 if (skipFunction(F)) 2224 return false; 2225 2226 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2227 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2228 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2229 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2230 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2231 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2232 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2233 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2234 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2235 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2236 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2237 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2238 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2239 2240 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2241 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2242 2243 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2244 GetLAA, *ORE, PSI).MadeAnyChange; 2245 } 2246 2247 void getAnalysisUsage(AnalysisUsage &AU) const override { 2248 AU.addRequired<AssumptionCacheTracker>(); 2249 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2250 AU.addRequired<DominatorTreeWrapperPass>(); 2251 AU.addRequired<LoopInfoWrapperPass>(); 2252 AU.addRequired<ScalarEvolutionWrapperPass>(); 2253 AU.addRequired<TargetTransformInfoWrapperPass>(); 2254 AU.addRequired<AAResultsWrapperPass>(); 2255 AU.addRequired<LoopAccessLegacyAnalysis>(); 2256 AU.addRequired<DemandedBitsWrapperPass>(); 2257 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2258 AU.addRequired<InjectTLIMappingsLegacy>(); 2259 2260 // We currently do not preserve loopinfo/dominator analyses with outer loop 2261 // vectorization. Until this is addressed, mark these analyses as preserved 2262 // only for non-VPlan-native path. 2263 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2264 if (!EnableVPlanNativePath) { 2265 AU.addPreserved<LoopInfoWrapperPass>(); 2266 AU.addPreserved<DominatorTreeWrapperPass>(); 2267 } 2268 2269 AU.addPreserved<BasicAAWrapperPass>(); 2270 AU.addPreserved<GlobalsAAWrapperPass>(); 2271 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2272 } 2273 }; 2274 2275 } // end anonymous namespace 2276 2277 //===----------------------------------------------------------------------===// 2278 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2279 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2280 //===----------------------------------------------------------------------===// 2281 2282 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2283 // We need to place the broadcast of invariant variables outside the loop, 2284 // but only if it's proven safe to do so. Else, broadcast will be inside 2285 // vector loop body. 2286 Instruction *Instr = dyn_cast<Instruction>(V); 2287 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2288 (!Instr || 2289 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2290 // Place the code for broadcasting invariant variables in the new preheader. 2291 IRBuilder<>::InsertPointGuard Guard(Builder); 2292 if (SafeToHoist) 2293 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2294 2295 // Broadcast the scalar into all locations in the vector. 2296 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2297 2298 return Shuf; 2299 } 2300 2301 /// This function adds 2302 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2303 /// to each vector element of Val. The sequence starts at StartIndex. 2304 /// \p Opcode is relevant for FP induction variable. 
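/// A worked example (illustrative): for VF = 4, StartIdx = 0 and Step = 2 on
/// an i32 induction, the emitted IR is equivalent to
///   %induction = add <4 x i32> %Val, <i32 0, i32 2, i32 4, i32 6>
/// i.e. lane L of the result is Val[L] + (StartIdx + L) * Step. For FP
/// inductions the same sequence is emitted with fmul and the fadd/fsub
/// selected by \p BinOp.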
2305 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2306 Instruction::BinaryOps BinOp, ElementCount VF, 2307 IRBuilderBase &Builder) { 2308 assert(VF.isVector() && "only vector VFs are supported"); 2309 2310 // Create and check the types. 2311 auto *ValVTy = cast<VectorType>(Val->getType()); 2312 ElementCount VLen = ValVTy->getElementCount(); 2313 2314 Type *STy = Val->getType()->getScalarType(); 2315 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2316 "Induction Step must be an integer or FP"); 2317 assert(Step->getType() == STy && "Step has wrong type"); 2318 2319 SmallVector<Constant *, 8> Indices; 2320 2321 // Create a vector of consecutive numbers from zero to VF. 2322 VectorType *InitVecValVTy = ValVTy; 2323 if (STy->isFloatingPointTy()) { 2324 Type *InitVecValSTy = 2325 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2326 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2327 } 2328 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2329 2330 // Splat the StartIdx 2331 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2332 2333 if (STy->isIntegerTy()) { 2334 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2335 Step = Builder.CreateVectorSplat(VLen, Step); 2336 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2337 // FIXME: The newly created binary instructions should contain nsw/nuw 2338 // flags, which can be found from the original scalar operations. 2339 Step = Builder.CreateMul(InitVec, Step); 2340 return Builder.CreateAdd(Val, Step, "induction"); 2341 } 2342 2343 // Floating point induction. 2344 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2345 "Binary Opcode should be specified for FP induction"); 2346 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2347 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2348 2349 Step = Builder.CreateVectorSplat(VLen, Step); 2350 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2351 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2352 } 2353 2354 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2355 /// variable on which to base the steps, \p Step is the size of the step. 2356 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2357 const InductionDescriptor &ID, VPValue *Def, 2358 VPTransformState &State) { 2359 IRBuilderBase &Builder = State.Builder; 2360 // We shouldn't have to build scalar steps if we aren't vectorizing. 2361 assert(State.VF.isVector() && "VF should be greater than one"); 2362 // Get the value type and ensure it and the step have the same integer type. 2363 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2364 assert(ScalarIVTy == Step->getType() && 2365 "Val and Step should have the same type"); 2366 2367 // We build scalar steps for both integer and floating-point induction 2368 // variables. Here, we determine the kind of arithmetic we will perform. 2369 Instruction::BinaryOps AddOp; 2370 Instruction::BinaryOps MulOp; 2371 if (ScalarIVTy->isIntegerTy()) { 2372 AddOp = Instruction::Add; 2373 MulOp = Instruction::Mul; 2374 } else { 2375 AddOp = ID.getInductionOpcode(); 2376 MulOp = Instruction::FMul; 2377 } 2378 2379 // Determine the number of scalars we need to generate for each unroll 2380 // iteration. 2381 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2382 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2383 // Compute the scalar steps and save the results in State. 
2384 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2385 ScalarIVTy->getScalarSizeInBits()); 2386 Type *VecIVTy = nullptr; 2387 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2388 if (!FirstLaneOnly && State.VF.isScalable()) { 2389 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2390 UnitStepVec = 2391 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2392 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2393 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2394 } 2395 2396 for (unsigned Part = 0; Part < State.UF; ++Part) { 2397 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2398 2399 if (!FirstLaneOnly && State.VF.isScalable()) { 2400 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2401 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2402 if (ScalarIVTy->isFloatingPointTy()) 2403 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2404 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2405 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2406 State.set(Def, Add, Part); 2407 // It's useful to record the lane values too for the known minimum number 2408 // of elements so we do those below. This improves the code quality when 2409 // trying to extract the first element, for example. 2410 } 2411 2412 if (ScalarIVTy->isFloatingPointTy()) 2413 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2414 2415 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2416 Value *StartIdx = Builder.CreateBinOp( 2417 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2418 // The step returned by `createStepForVF` is a runtime-evaluated value 2419 // when VF is scalable. Otherwise, it should be folded into a Constant. 2420 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2421 "Expected StartIdx to be folded to a constant when VF is not " 2422 "scalable"); 2423 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2424 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2425 State.set(Def, Add, VPIteration(Part, Lane)); 2426 } 2427 } 2428 } 2429 2430 // Generate code for the induction step. Note that induction steps are 2431 // required to be loop-invariant 2432 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2433 Instruction *InsertBefore, 2434 Loop *OrigLoop = nullptr) { 2435 const DataLayout &DL = SE.getDataLayout(); 2436 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2437 "Induction step should be loop invariant"); 2438 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2439 return E->getValue(); 2440 2441 SCEVExpander Exp(SE, DL, "induction"); 2442 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2443 } 2444 2445 /// Compute the transformed value of Index at offset StartValue using step 2446 /// StepValue. 2447 /// For integer induction, returns StartValue + Index * StepValue. 2448 /// For pointer induction, returns StartValue[Index * StepValue]. 2449 /// FIXME: The newly created binary instructions should contain nsw/nuw 2450 /// flags, which can be found from the original scalar operations. 2451 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2452 Value *StartValue, Value *Step, 2453 const InductionDescriptor &ID) { 2454 assert(Index->getType()->getScalarType() == Step->getType() && 2455 "Index scalar type does not match StepValue type"); 2456 2457 // Note: the IR at this point is broken. 
We cannot use SE to create any new 2458 // SCEV and then expand it, hoping that SCEV's simplification will give us 2459 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2460 // lead to various SCEV crashes. So all we can do is to use builder and rely 2461 // on InstCombine for future simplifications. Here we handle some trivial 2462 // cases only. 2463 auto CreateAdd = [&B](Value *X, Value *Y) { 2464 assert(X->getType() == Y->getType() && "Types don't match!"); 2465 if (auto *CX = dyn_cast<ConstantInt>(X)) 2466 if (CX->isZero()) 2467 return Y; 2468 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2469 if (CY->isZero()) 2470 return X; 2471 return B.CreateAdd(X, Y); 2472 }; 2473 2474 // We allow X to be a vector type, in which case Y will potentially be 2475 // splatted into a vector with the same element count. 2476 auto CreateMul = [&B](Value *X, Value *Y) { 2477 assert(X->getType()->getScalarType() == Y->getType() && 2478 "Types don't match!"); 2479 if (auto *CX = dyn_cast<ConstantInt>(X)) 2480 if (CX->isOne()) 2481 return Y; 2482 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2483 if (CY->isOne()) 2484 return X; 2485 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2486 if (XVTy && !isa<VectorType>(Y->getType())) 2487 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2488 return B.CreateMul(X, Y); 2489 }; 2490 2491 switch (ID.getKind()) { 2492 case InductionDescriptor::IK_IntInduction: { 2493 assert(!isa<VectorType>(Index->getType()) && 2494 "Vector indices not supported for integer inductions yet"); 2495 assert(Index->getType() == StartValue->getType() && 2496 "Index type does not match StartValue type"); 2497 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2498 return B.CreateSub(StartValue, Index); 2499 auto *Offset = CreateMul(Index, Step); 2500 return CreateAdd(StartValue, Offset); 2501 } 2502 case InductionDescriptor::IK_PtrInduction: { 2503 assert(isa<Constant>(Step) && 2504 "Expected constant step for pointer induction"); 2505 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2506 } 2507 case InductionDescriptor::IK_FpInduction: { 2508 assert(!isa<VectorType>(Index->getType()) && 2509 "Vector indices not supported for FP inductions yet"); 2510 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2511 auto InductionBinOp = ID.getInductionBinOp(); 2512 assert(InductionBinOp && 2513 (InductionBinOp->getOpcode() == Instruction::FAdd || 2514 InductionBinOp->getOpcode() == Instruction::FSub) && 2515 "Original bin op should be defined for FP induction"); 2516 2517 Value *MulExp = B.CreateFMul(Step, Index); 2518 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2519 "induction"); 2520 } 2521 case InductionDescriptor::IK_NoInduction: 2522 return nullptr; 2523 } 2524 llvm_unreachable("invalid enum"); 2525 } 2526 2527 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2528 const VPIteration &Instance, 2529 VPTransformState &State) { 2530 Value *ScalarInst = State.get(Def, Instance); 2531 Value *VectorValue = State.get(Def, Instance.Part); 2532 VectorValue = Builder.CreateInsertElement( 2533 VectorValue, ScalarInst, 2534 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2535 State.set(Def, VectorValue, Instance.Part); 2536 } 2537 2538 // Return whether we allow using masked interleave-groups (for dealing with 2539 // strided loads/stores that reside in predicated blocks, or for dealing 2540 // with gaps). 
2541 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2542 // If an override option has been passed in for interleaved accesses, use it. 2543 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2544 return EnableMaskedInterleavedMemAccesses; 2545 2546 return TTI.enableMaskedInterleavedAccessVectorization(); 2547 } 2548 2549 // Try to vectorize the interleave group that \p Instr belongs to. 2550 // 2551 // E.g. Translate following interleaved load group (factor = 3): 2552 // for (i = 0; i < N; i+=3) { 2553 // R = Pic[i]; // Member of index 0 2554 // G = Pic[i+1]; // Member of index 1 2555 // B = Pic[i+2]; // Member of index 2 2556 // ... // do something to R, G, B 2557 // } 2558 // To: 2559 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2560 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2561 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2562 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2563 // 2564 // Or translate following interleaved store group (factor = 3): 2565 // for (i = 0; i < N; i+=3) { 2566 // ... do something to R, G, B 2567 // Pic[i] = R; // Member of index 0 2568 // Pic[i+1] = G; // Member of index 1 2569 // Pic[i+2] = B; // Member of index 2 2570 // } 2571 // To: 2572 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2573 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2574 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2575 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2576 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2577 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2578 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2579 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2580 VPValue *BlockInMask) { 2581 Instruction *Instr = Group->getInsertPos(); 2582 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2583 2584 // Prepare for the vector type of the interleaved load/store. 2585 Type *ScalarTy = getLoadStoreType(Instr); 2586 unsigned InterleaveFactor = Group->getFactor(); 2587 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2588 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2589 2590 // Prepare for the new pointers. 2591 SmallVector<Value *, 2> AddrParts; 2592 unsigned Index = Group->getIndex(Instr); 2593 2594 // TODO: extend the masked interleaved-group support to reversed access. 2595 assert((!BlockInMask || !Group->isReverse()) && 2596 "Reversed masked interleave-group not supported."); 2597 2598 // If the group is reverse, adjust the index to refer to the last vector lane 2599 // instead of the first. We adjust the index from the first vector lane, 2600 // rather than directly getting the pointer for lane VF - 1, because the 2601 // pointer operand of the interleaved access is supposed to be uniform. For 2602 // uniform instructions, we're only required to generate a value for the 2603 // first vector lane in each unroll iteration. 2604 if (Group->isReverse()) 2605 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2606 2607 for (unsigned Part = 0; Part < UF; Part++) { 2608 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2609 setDebugLocFromInst(AddrPart); 2610 2611 // Notice current instruction could be any index. Need to adjust the address 2612 // to the member of index 0. 2613 // 2614 // E.g. 
a = A[i+1]; // Member of index 1 (Current instruction) 2615 // b = A[i]; // Member of index 0 2616 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2617 // 2618 // E.g. A[i+1] = a; // Member of index 1 2619 // A[i] = b; // Member of index 0 2620 // A[i+2] = c; // Member of index 2 (Current instruction) 2621 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2622 2623 bool InBounds = false; 2624 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2625 InBounds = gep->isInBounds(); 2626 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2627 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2628 2629 // Cast to the vector pointer type. 2630 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2631 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2632 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2633 } 2634 2635 setDebugLocFromInst(Instr); 2636 Value *PoisonVec = PoisonValue::get(VecTy); 2637 2638 Value *MaskForGaps = nullptr; 2639 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2640 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2641 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2642 } 2643 2644 // Vectorize the interleaved load group. 2645 if (isa<LoadInst>(Instr)) { 2646 // For each unroll part, create a wide load for the group. 2647 SmallVector<Value *, 2> NewLoads; 2648 for (unsigned Part = 0; Part < UF; Part++) { 2649 Instruction *NewLoad; 2650 if (BlockInMask || MaskForGaps) { 2651 assert(useMaskedInterleavedAccesses(*TTI) && 2652 "masked interleaved groups are not allowed."); 2653 Value *GroupMask = MaskForGaps; 2654 if (BlockInMask) { 2655 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2656 Value *ShuffledMask = Builder.CreateShuffleVector( 2657 BlockInMaskPart, 2658 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2659 "interleaved.mask"); 2660 GroupMask = MaskForGaps 2661 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2662 MaskForGaps) 2663 : ShuffledMask; 2664 } 2665 NewLoad = 2666 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2667 GroupMask, PoisonVec, "wide.masked.vec"); 2668 } 2669 else 2670 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2671 Group->getAlign(), "wide.vec"); 2672 Group->addMetadata(NewLoad); 2673 NewLoads.push_back(NewLoad); 2674 } 2675 2676 // For each member in the group, shuffle out the appropriate data from the 2677 // wide loads. 2678 unsigned J = 0; 2679 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2680 Instruction *Member = Group->getMember(I); 2681 2682 // Skip the gaps in the group. 2683 if (!Member) 2684 continue; 2685 2686 auto StrideMask = 2687 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2688 for (unsigned Part = 0; Part < UF; Part++) { 2689 Value *StridedVec = Builder.CreateShuffleVector( 2690 NewLoads[Part], StrideMask, "strided.vec"); 2691 2692 // If this member has different type, cast the result type. 
2693 if (Member->getType() != ScalarTy) { 2694 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2695 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2696 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2697 } 2698 2699 if (Group->isReverse()) 2700 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2701 2702 State.set(VPDefs[J], StridedVec, Part); 2703 } 2704 ++J; 2705 } 2706 return; 2707 } 2708 2709 // The sub vector type for current instruction. 2710 auto *SubVT = VectorType::get(ScalarTy, VF); 2711 2712 // Vectorize the interleaved store group. 2713 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2714 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2715 "masked interleaved groups are not allowed."); 2716 assert((!MaskForGaps || !VF.isScalable()) && 2717 "masking gaps for scalable vectors is not yet supported."); 2718 for (unsigned Part = 0; Part < UF; Part++) { 2719 // Collect the stored vector from each member. 2720 SmallVector<Value *, 4> StoredVecs; 2721 for (unsigned i = 0; i < InterleaveFactor; i++) { 2722 assert((Group->getMember(i) || MaskForGaps) && 2723 "Fail to get a member from an interleaved store group"); 2724 Instruction *Member = Group->getMember(i); 2725 2726 // Skip the gaps in the group. 2727 if (!Member) { 2728 Value *Undef = PoisonValue::get(SubVT); 2729 StoredVecs.push_back(Undef); 2730 continue; 2731 } 2732 2733 Value *StoredVec = State.get(StoredValues[i], Part); 2734 2735 if (Group->isReverse()) 2736 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2737 2738 // If this member has different type, cast it to a unified type. 2739 2740 if (StoredVec->getType() != SubVT) 2741 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2742 2743 StoredVecs.push_back(StoredVec); 2744 } 2745 2746 // Concatenate all vectors into a wide vector. 2747 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2748 2749 // Interleave the elements in the wide vector. 2750 Value *IVec = Builder.CreateShuffleVector( 2751 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2752 "interleaved.vec"); 2753 2754 Instruction *NewStoreInstr; 2755 if (BlockInMask || MaskForGaps) { 2756 Value *GroupMask = MaskForGaps; 2757 if (BlockInMask) { 2758 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2759 Value *ShuffledMask = Builder.CreateShuffleVector( 2760 BlockInMaskPart, 2761 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2762 "interleaved.mask"); 2763 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2764 ShuffledMask, MaskForGaps) 2765 : ShuffledMask; 2766 } 2767 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2768 Group->getAlign(), GroupMask); 2769 } else 2770 NewStoreInstr = 2771 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2772 2773 Group->addMetadata(NewStoreInstr); 2774 } 2775 } 2776 2777 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2778 VPReplicateRecipe *RepRecipe, 2779 const VPIteration &Instance, 2780 bool IfPredicateInstr, 2781 VPTransformState &State) { 2782 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2783 2784 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2785 // the first lane and part. 2786 if (isa<NoAliasScopeDeclInst>(Instr)) 2787 if (!Instance.isFirstIteration()) 2788 return; 2789 2790 // Does this instruction return a value ? 
2791 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2792
2793 Instruction *Cloned = Instr->clone();
2794 if (!IsVoidRetTy)
2795 Cloned->setName(Instr->getName() + ".cloned");
2796
2797 // If the scalarized instruction contributes to the address computation of a
2798 // widened masked load/store which was in a basic block that needed predication
2799 // and is not predicated after vectorization, we can't propagate
2800 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2801 // instruction could feed a poison value to the base address of the widened
2802 // load/store.
2803 if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2804 Cloned->dropPoisonGeneratingFlags();
2805
2806 if (Instr->getDebugLoc())
2807 setDebugLocFromInst(Instr);
2808
2809 // Replace the operands of the cloned instruction with their scalar
2810 // equivalents in the new loop.
2811 for (auto &I : enumerate(RepRecipe->operands())) {
2812 auto InputInstance = Instance;
2813 VPValue *Operand = I.value();
2814 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2815 if (OperandR && OperandR->isUniform())
2816 InputInstance.Lane = VPLane::getFirstLane();
2817 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2818 }
2819 addNewMetadata(Cloned, Instr);
2820
2821 // Place the cloned scalar in the new loop.
2822 State.Builder.Insert(Cloned);
2823
2824 State.set(RepRecipe, Cloned, Instance);
2825
2826 // If we just cloned a new assumption, add it to the assumption cache.
2827 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2828 AC->registerAssumption(II);
2829
2830 // End if-block.
2831 if (IfPredicateInstr)
2832 PredicatedInstructions.push_back(Cloned);
2833 }
2834
2835 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2836 if (TripCount)
2837 return TripCount;
2838
2839 assert(InsertBlock);
2840 IRBuilder<> Builder(InsertBlock->getTerminator());
2841 // Find the loop boundaries.
2842 ScalarEvolution *SE = PSE.getSE();
2843 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2844 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2845 "Invalid loop count");
2846
2847 Type *IdxTy = Legal->getWidestInductionType();
2848 assert(IdxTy && "No type for induction");
2849
2850 // The exit count might have the type of i64 while the phi is i32. This can
2851 // happen if we have an induction variable that is sign extended before the
2852 // compare. The only way we get a backedge-taken count is if the
2853 // induction variable was signed and as such will not overflow. In such a case
2854 // truncation is legal.
2855 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2856 IdxTy->getPrimitiveSizeInBits())
2857 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2858 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2859
2860 // Get the total trip count from the count by adding 1.
2861 const SCEV *ExitCount = SE->getAddExpr(
2862 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2863
2864 const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2865
2866 // Expand the trip count and place the new instructions in the preheader.
2867 // Notice that the pre-header does not change, only the loop body.
2868 SCEVExpander Exp(*SE, DL, "induction");
2869
2870 // Count holds the overall loop count (N).
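// Illustrative note (hypothetical values, not from the original source): for a
// loop known to execute n > 0 times the backedge-taken count is n - 1, so the
// ExitCount expanded below evaluates to (n - 1) + 1 == n; e.g. a backedge-taken
// count of 99 yields a cached TripCount of 100.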
2871 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2872 InsertBlock->getTerminator()); 2873 2874 if (TripCount->getType()->isPointerTy()) 2875 TripCount = 2876 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2877 InsertBlock->getTerminator()); 2878 2879 return TripCount; 2880 } 2881 2882 Value * 2883 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2884 if (VectorTripCount) 2885 return VectorTripCount; 2886 2887 Value *TC = getOrCreateTripCount(InsertBlock); 2888 IRBuilder<> Builder(InsertBlock->getTerminator()); 2889 2890 Type *Ty = TC->getType(); 2891 // This is where we can make the step a runtime constant. 2892 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2893 2894 // If the tail is to be folded by masking, round the number of iterations N 2895 // up to a multiple of Step instead of rounding down. This is done by first 2896 // adding Step-1 and then rounding down. Note that it's ok if this addition 2897 // overflows: the vector induction variable will eventually wrap to zero given 2898 // that it starts at zero and its Step is a power of two; the loop will then 2899 // exit, with the last early-exit vector comparison also producing all-true. 2900 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2901 // is accounted for in emitIterationCountCheck that adds an overflow check. 2902 if (Cost->foldTailByMasking()) { 2903 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2904 "VF*UF must be a power of 2 when folding tail by masking"); 2905 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2906 TC = Builder.CreateAdd( 2907 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2908 } 2909 2910 // Now we need to generate the expression for the part of the loop that the 2911 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2912 // iterations are not required for correctness, or N - Step, otherwise. Step 2913 // is equal to the vectorization factor (number of SIMD elements) times the 2914 // unroll factor (number of SIMD instructions). 2915 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2916 2917 // There are cases where we *must* run at least one iteration in the remainder 2918 // loop. See the cost model for when this can happen. If the step evenly 2919 // divides the trip count, we set the remainder to be equal to the step. If 2920 // the step does not evenly divide the trip count, no adjustment is necessary 2921 // since there will already be scalar iterations. Note that the minimum 2922 // iterations check ensures that N >= Step. 2923 if (Cost->requiresScalarEpilogue(VF)) { 2924 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2925 R = Builder.CreateSelect(IsZero, Step, R); 2926 } 2927 2928 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2929 2930 return VectorTripCount; 2931 } 2932 2933 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2934 const DataLayout &DL) { 2935 // Verify that V is a vector type with same number of elements as DstVTy. 
2936 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2937 unsigned VF = DstFVTy->getNumElements(); 2938 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2939 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2940 Type *SrcElemTy = SrcVecTy->getElementType(); 2941 Type *DstElemTy = DstFVTy->getElementType(); 2942 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2943 "Vector elements must have same size"); 2944 2945 // Do a direct cast if element types are castable. 2946 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2947 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2948 } 2949 // V cannot be directly casted to desired vector type. 2950 // May happen when V is a floating point vector but DstVTy is a vector of 2951 // pointers or vice-versa. Handle this using a two-step bitcast using an 2952 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2953 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2954 "Only one type should be a pointer type"); 2955 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2956 "Only one type should be a floating point type"); 2957 Type *IntTy = 2958 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2959 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2960 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2961 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2962 } 2963 2964 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2965 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2966 // Reuse existing vector loop preheader for TC checks. 2967 // Note that new preheader block is generated for vector loop. 2968 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2969 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2970 2971 // Generate code to check if the loop's trip count is less than VF * UF, or 2972 // equal to it in case a scalar epilogue is required; this implies that the 2973 // vector trip count is zero. This check also covers the case where adding one 2974 // to the backedge-taken count overflowed leading to an incorrect trip count 2975 // of zero. In this case we will also jump to the scalar loop. 2976 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2977 : ICmpInst::ICMP_ULT; 2978 2979 // If tail is to be folded, vector loop takes care of all iterations. 2980 Type *CountTy = Count->getType(); 2981 Value *CheckMinIters = Builder.getFalse(); 2982 Value *Step = createStepForVF(Builder, CountTy, VF, UF); 2983 if (!Cost->foldTailByMasking()) 2984 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2985 else if (VF.isScalable()) { 2986 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2987 // an overflow to zero when updating induction variables and so an 2988 // additional overflow check is required before entering the vector loop. 2989 2990 // Get the maximum unsigned value for the type. 2991 Value *MaxUIntTripCount = 2992 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2993 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2994 2995 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2996 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); 2997 } 2998 // Create new preheader for vector loop. 
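// Illustrative recap of the check built above (hypothetical values, not from
// the original source): with VF = 4, UF = 2 and tail folding off, the step is
// 8; when no scalar epilogue is required, 'min.iters.check' uses ICMP_ULT, so
// a trip count of, say, 5 takes the bypass branch straight to the scalar loop.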
2999 LoopVectorPreHeader = 3000 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3001 "vector.ph"); 3002 3003 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3004 DT->getNode(Bypass)->getIDom()) && 3005 "TC check is expected to dominate Bypass"); 3006 3007 // Update dominator for Bypass & LoopExit (if needed). 3008 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3009 if (!Cost->requiresScalarEpilogue(VF)) 3010 // If there is an epilogue which must run, there's no edge from the 3011 // middle block to exit blocks and thus no need to update the immediate 3012 // dominator of the exit blocks. 3013 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3014 3015 ReplaceInstWithInst( 3016 TCCheckBlock->getTerminator(), 3017 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3018 LoopBypassBlocks.push_back(TCCheckBlock); 3019 } 3020 3021 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 3022 3023 BasicBlock *const SCEVCheckBlock = 3024 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 3025 if (!SCEVCheckBlock) 3026 return nullptr; 3027 3028 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3029 (OptForSizeBasedOnProfile && 3030 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3031 "Cannot SCEV check stride or overflow when optimizing for size"); 3032 3033 3034 // Update dominator only if this is first RT check. 3035 if (LoopBypassBlocks.empty()) { 3036 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3037 if (!Cost->requiresScalarEpilogue(VF)) 3038 // If there is an epilogue which must run, there's no edge from the 3039 // middle block to exit blocks and thus no need to update the immediate 3040 // dominator of the exit blocks. 3041 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3042 } 3043 3044 LoopBypassBlocks.push_back(SCEVCheckBlock); 3045 AddedSafetyChecks = true; 3046 return SCEVCheckBlock; 3047 } 3048 3049 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 3050 // VPlan-native path does not do any analysis for runtime checks currently. 3051 if (EnableVPlanNativePath) 3052 return nullptr; 3053 3054 BasicBlock *const MemCheckBlock = 3055 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 3056 3057 // Check if we generated code that checks in runtime if arrays overlap. We put 3058 // the checks into a separate block to make the more common case of few 3059 // elements faster. 3060 if (!MemCheckBlock) 3061 return nullptr; 3062 3063 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3064 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3065 "Cannot emit memory checks when optimizing for size, unless forced " 3066 "to vectorize."); 3067 ORE->emit([&]() { 3068 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3069 OrigLoop->getStartLoc(), 3070 OrigLoop->getHeader()) 3071 << "Code-size may be reduced by not forcing " 3072 "vectorization, or by source-code modifications " 3073 "eliminating the need for runtime checks " 3074 "(e.g., adding 'restrict')."; 3075 }); 3076 } 3077 3078 LoopBypassBlocks.push_back(MemCheckBlock); 3079 3080 AddedSafetyChecks = true; 3081 3082 // Only use noalias metadata when using memory checks guaranteeing no overlap 3083 // across all iterations. 
3084 if (!Legal->getLAI()->getRuntimePointerChecking()->getDiffChecks()) {
3085 // We currently don't use LoopVersioning for the actual loop cloning but we
3086 // still use it to add the noalias metadata.
3087 LVer = std::make_unique<LoopVersioning>(
3088 *Legal->getLAI(),
3089 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3090 DT, PSE.getSE());
3091 LVer->prepareNoAliasMetadata();
3092 }
3093 return MemCheckBlock;
3094 }
3095
3096 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3097 LoopScalarBody = OrigLoop->getHeader();
3098 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3099 assert(LoopVectorPreHeader && "Invalid loop structure");
3100 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3101 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3102 "multiple exit loop without required epilogue?");
3103
3104 LoopMiddleBlock =
3105 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3106 LI, nullptr, Twine(Prefix) + "middle.block");
3107 LoopScalarPreHeader =
3108 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3109 nullptr, Twine(Prefix) + "scalar.ph");
3110
3111 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3112
3113 // Set up the middle block terminator. Two cases:
3114 // 1) If we know that we must execute the scalar epilogue, emit an
3115 // unconditional branch.
3116 // 2) Otherwise, we must have a single unique exit block (due to how we
3117 // implement the multiple exit case). In this case, set up a conditional
3118 // branch from the middle block to the loop scalar preheader, and the
3119 // exit block. completeLoopSkeleton will update the condition to use an
3120 // iteration check, if required to decide whether to execute the remainder.
3121 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3122 BranchInst::Create(LoopScalarPreHeader) :
3123 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3124 Builder.getTrue());
3125 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3126 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3127
3128 // Update dominator for loop exit. During skeleton creation, only the vector
3129 // pre-header and the middle block are created. The vector loop is entirely
3130 // created during VPlan execution.
3131 if (!Cost->requiresScalarEpilogue(VF))
3132 // If there is an epilogue which must run, there's no edge from the
3133 // middle block to exit blocks and thus no need to update the immediate
3134 // dominator of the exit blocks.
3135 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3136 }
3137
3138 void InnerLoopVectorizer::createInductionResumeValues(
3139 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3140 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3141 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3142 "Inconsistent information about additional bypass.");
3143
3144 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3145 assert(VectorTripCount && "Expected valid arguments");
3146 // We are going to resume the execution of the scalar loop.
3147 // Go over all of the induction variables that we found and fix the
3148 // PHIs that are left in the scalar version of the loop.
3149 // The starting values of PHI nodes depend on the counter of the last
3150 // iteration in the vectorized loop.
3151 // If we come from a bypass edge then we need to start from the original
3152 // start value.
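// Illustrative sketch (hypothetical IR, not from the original source): for a
// primary induction starting at 0, the loop below creates roughly
//   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, <bypass blocks> ]
// in the scalar preheader, i.e. the scalar loop resumes at the vector trip
// count when the vector loop ran, and at the original start value otherwise.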
3153 Instruction *OldInduction = Legal->getPrimaryInduction(); 3154 for (auto &InductionEntry : Legal->getInductionVars()) { 3155 PHINode *OrigPhi = InductionEntry.first; 3156 InductionDescriptor II = InductionEntry.second; 3157 3158 // Create phi nodes to merge from the backedge-taken check block. 3159 PHINode *BCResumeVal = 3160 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3161 LoopScalarPreHeader->getTerminator()); 3162 // Copy original phi DL over to the new one. 3163 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3164 Value *&EndValue = IVEndValues[OrigPhi]; 3165 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3166 if (OrigPhi == OldInduction) { 3167 // We know what the end value is. 3168 EndValue = VectorTripCount; 3169 } else { 3170 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3171 3172 // Fast-math-flags propagate from the original induction instruction. 3173 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3174 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3175 3176 Type *StepType = II.getStep()->getType(); 3177 Instruction::CastOps CastOp = 3178 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3179 Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); 3180 Value *Step = 3181 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3182 EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3183 EndValue->setName("ind.end"); 3184 3185 // Compute the end value for the additional bypass (if applicable). 3186 if (AdditionalBypass.first) { 3187 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3188 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3189 StepType, true); 3190 Value *Step = 3191 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3192 VTC = 3193 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); 3194 EndValueFromAdditionalBypass = 3195 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3196 EndValueFromAdditionalBypass->setName("ind.end"); 3197 } 3198 } 3199 // The new PHI merges the original incoming value, in case of a bypass, 3200 // or the value at the end of the vectorized loop. 3201 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3202 3203 // Fix the scalar body counter (PHI node). 3204 // The old induction's phi node in the scalar body needs the truncated 3205 // value. 3206 for (BasicBlock *BB : LoopBypassBlocks) 3207 BCResumeVal->addIncoming(II.getStartValue(), BB); 3208 3209 if (AdditionalBypass.first) 3210 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3211 EndValueFromAdditionalBypass); 3212 3213 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3214 } 3215 } 3216 3217 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { 3218 // The trip counts should be cached by now. 3219 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3220 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3221 3222 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3223 3224 // Add a check in the middle block to see if we have completed 3225 // all of the iterations in the first vector loop. Three cases: 3226 // 1) If we require a scalar epilogue, there is no conditional branch as 3227 // we unconditionally branch to the scalar preheader. Do nothing. 3228 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 
3229 // Thus if tail is to be folded, we know we don't need to run the 3230 // remainder and we can use the previous value for the condition (true). 3231 // 3) Otherwise, construct a runtime check. 3232 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3233 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3234 Count, VectorTripCount, "cmp.n", 3235 LoopMiddleBlock->getTerminator()); 3236 3237 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3238 // of the corresponding compare because they may have ended up with 3239 // different line numbers and we want to avoid awkward line stepping while 3240 // debugging. Eg. if the compare has got a line number inside the loop. 3241 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3242 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3243 } 3244 3245 #ifdef EXPENSIVE_CHECKS 3246 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3247 #endif 3248 3249 return LoopVectorPreHeader; 3250 } 3251 3252 std::pair<BasicBlock *, Value *> 3253 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3254 /* 3255 In this function we generate a new loop. The new loop will contain 3256 the vectorized instructions while the old loop will continue to run the 3257 scalar remainder. 3258 3259 [ ] <-- loop iteration number check. 3260 / | 3261 / v 3262 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3263 | / | 3264 | / v 3265 || [ ] <-- vector pre header. 3266 |/ | 3267 | v 3268 | [ ] \ 3269 | [ ]_| <-- vector loop (created during VPlan execution). 3270 | | 3271 | v 3272 \ -[ ] <--- middle-block. 3273 \/ | 3274 /\ v 3275 | ->[ ] <--- new preheader. 3276 | | 3277 (opt) v <-- edge from middle to exit iff epilogue is not required. 3278 | [ ] \ 3279 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3280 \ | 3281 \ v 3282 >[ ] <-- exit block(s). 3283 ... 3284 */ 3285 3286 // Get the metadata of the original loop before it gets modified. 3287 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3288 3289 // Workaround! Compute the trip count of the original loop and cache it 3290 // before we start modifying the CFG. This code has a systemic problem 3291 // wherein it tries to run analysis over partially constructed IR; this is 3292 // wrong, and not simply for SCEV. The trip count of the original loop 3293 // simply happens to be prone to hitting this in practice. In theory, we 3294 // can hit the same issue for any SCEV, or ValueTracking query done during 3295 // mutation. See PR49900. 3296 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 3297 3298 // Create an empty vector loop, and prepare basic blocks for the runtime 3299 // checks. 3300 createVectorLoopSkeleton(""); 3301 3302 // Now, compare the new count to zero. If it is zero skip the vector loop and 3303 // jump to the scalar loop. This check also covers the case where the 3304 // backedge-taken count is uint##_max: adding one to it will overflow leading 3305 // to an incorrect trip count of zero. In this (rare) case we will also jump 3306 // to the scalar loop. 3307 emitIterationCountCheck(LoopScalarPreHeader); 3308 3309 // Generate the code to check any assumptions that we've made for SCEV 3310 // expressions. 3311 emitSCEVChecks(LoopScalarPreHeader); 3312 3313 // Generate the code that checks in runtime if arrays overlap. We put the 3314 // checks into a separate block to make the more common case of few elements 3315 // faster. 
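// Illustrative example (hypothetical source loop, not from the original
// comments): for 'a[i] = b[i] + 1' where 'a' and 'b' may alias, the emitted
// checks verify that the accessed ranges [a, a+n) and [b, b+n) do not overlap
// and branch to the scalar loop when they might.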
3316 emitMemRuntimeChecks(LoopScalarPreHeader); 3317 3318 // Emit phis for the new starting index of the scalar loop. 3319 createInductionResumeValues(); 3320 3321 return {completeLoopSkeleton(OrigLoopID), nullptr}; 3322 } 3323 3324 // Fix up external users of the induction variable. At this point, we are 3325 // in LCSSA form, with all external PHIs that use the IV having one input value, 3326 // coming from the remainder loop. We need those PHIs to also have a correct 3327 // value for the IV when arriving directly from the middle block. 3328 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3329 const InductionDescriptor &II, 3330 Value *VectorTripCount, Value *EndValue, 3331 BasicBlock *MiddleBlock, 3332 BasicBlock *VectorHeader, VPlan &Plan) { 3333 // There are two kinds of external IV usages - those that use the value 3334 // computed in the last iteration (the PHI) and those that use the penultimate 3335 // value (the value that feeds into the phi from the loop latch). 3336 // We allow both, but they, obviously, have different values. 3337 3338 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3339 3340 DenseMap<Value *, Value *> MissingVals; 3341 3342 // An external user of the last iteration's value should see the value that 3343 // the remainder loop uses to initialize its own IV. 3344 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3345 for (User *U : PostInc->users()) { 3346 Instruction *UI = cast<Instruction>(U); 3347 if (!OrigLoop->contains(UI)) { 3348 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3349 MissingVals[UI] = EndValue; 3350 } 3351 } 3352 3353 // An external user of the penultimate value need to see EndValue - Step. 3354 // The simplest way to get this is to recompute it from the constituent SCEVs, 3355 // that is Start + (Step * (CRD - 1)). 3356 for (User *U : OrigPhi->users()) { 3357 auto *UI = cast<Instruction>(U); 3358 if (!OrigLoop->contains(UI)) { 3359 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3360 3361 IRBuilder<> B(MiddleBlock->getTerminator()); 3362 3363 // Fast-math-flags propagate from the original induction instruction. 3364 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3365 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3366 3367 Value *CountMinusOne = B.CreateSub( 3368 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3369 Value *CMO = 3370 !II.getStep()->getType()->isIntegerTy() 3371 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3372 II.getStep()->getType()) 3373 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3374 CMO->setName("cast.cmo"); 3375 3376 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3377 VectorHeader->getTerminator()); 3378 Value *Escape = 3379 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3380 Escape->setName("ind.escape"); 3381 MissingVals[UI] = Escape; 3382 } 3383 } 3384 3385 for (auto &I : MissingVals) { 3386 PHINode *PHI = cast<PHINode>(I.first); 3387 // One corner case we have to handle is two IVs "chasing" each-other, 3388 // that is %IV2 = phi [...], [ %IV1, %latch ] 3389 // In this case, if IV1 has an external use, we need to avoid adding both 3390 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3391 // don't already have an incoming value for the middle block. 
3392 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3393 PHI->addIncoming(I.second, MiddleBlock); 3394 Plan.removeLiveOut(PHI); 3395 } 3396 } 3397 } 3398 3399 namespace { 3400 3401 struct CSEDenseMapInfo { 3402 static bool canHandle(const Instruction *I) { 3403 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3404 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3405 } 3406 3407 static inline Instruction *getEmptyKey() { 3408 return DenseMapInfo<Instruction *>::getEmptyKey(); 3409 } 3410 3411 static inline Instruction *getTombstoneKey() { 3412 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3413 } 3414 3415 static unsigned getHashValue(const Instruction *I) { 3416 assert(canHandle(I) && "Unknown instruction!"); 3417 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3418 I->value_op_end())); 3419 } 3420 3421 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3422 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3423 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3424 return LHS == RHS; 3425 return LHS->isIdenticalTo(RHS); 3426 } 3427 }; 3428 3429 } // end anonymous namespace 3430 3431 ///Perform cse of induction variable instructions. 3432 static void cse(BasicBlock *BB) { 3433 // Perform simple cse. 3434 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3435 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3436 if (!CSEDenseMapInfo::canHandle(&In)) 3437 continue; 3438 3439 // Check if we can replace this instruction with any of the 3440 // visited instructions. 3441 if (Instruction *V = CSEMap.lookup(&In)) { 3442 In.replaceAllUsesWith(V); 3443 In.eraseFromParent(); 3444 continue; 3445 } 3446 3447 CSEMap[&In] = &In; 3448 } 3449 } 3450 3451 InstructionCost 3452 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3453 bool &NeedToScalarize) const { 3454 Function *F = CI->getCalledFunction(); 3455 Type *ScalarRetTy = CI->getType(); 3456 SmallVector<Type *, 4> Tys, ScalarTys; 3457 for (auto &ArgOp : CI->args()) 3458 ScalarTys.push_back(ArgOp->getType()); 3459 3460 // Estimate cost of scalarized vector call. The source operands are assumed 3461 // to be vectors, so we need to extract individual elements from there, 3462 // execute VF scalar calls, and then gather the result into the vector return 3463 // value. 3464 InstructionCost ScalarCallCost = 3465 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3466 if (VF.isScalar()) 3467 return ScalarCallCost; 3468 3469 // Compute corresponding vector type for return value and arguments. 3470 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3471 for (Type *ScalarTy : ScalarTys) 3472 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3473 3474 // Compute costs of unpacking argument values for the scalar calls and 3475 // packing the return values to a vector. 3476 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3477 3478 InstructionCost Cost = 3479 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3480 3481 // If we can't emit a vector call for this function, then the currently found 3482 // cost is the cost we need to return. 3483 NeedToScalarize = true; 3484 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3485 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3486 3487 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3488 return Cost; 3489 3490 // If the corresponding vector cost is cheaper, return its cost. 
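// Worked example (hypothetical costs, not from the original source): with
// VF = 4, a scalar call cost of 10 and a scalarization overhead of 8, the
// scalarized estimate above is 4 * 10 + 8 = 48; if a vector variant exists
// and costs 20, the comparison below picks it and clears NeedToScalarize.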
3491 InstructionCost VectorCallCost = 3492 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3493 if (VectorCallCost < Cost) { 3494 NeedToScalarize = false; 3495 Cost = VectorCallCost; 3496 } 3497 return Cost; 3498 } 3499 3500 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3501 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3502 return Elt; 3503 return VectorType::get(Elt, VF); 3504 } 3505 3506 InstructionCost 3507 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3508 ElementCount VF) const { 3509 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3510 assert(ID && "Expected intrinsic call!"); 3511 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3512 FastMathFlags FMF; 3513 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3514 FMF = FPMO->getFastMathFlags(); 3515 3516 SmallVector<const Value *> Arguments(CI->args()); 3517 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3518 SmallVector<Type *> ParamTys; 3519 std::transform(FTy->param_begin(), FTy->param_end(), 3520 std::back_inserter(ParamTys), 3521 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3522 3523 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3524 dyn_cast<IntrinsicInst>(CI)); 3525 return TTI.getIntrinsicInstrCost(CostAttrs, 3526 TargetTransformInfo::TCK_RecipThroughput); 3527 } 3528 3529 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3530 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3531 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3532 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3533 } 3534 3535 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3536 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3537 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3538 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3539 } 3540 3541 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3542 // For every instruction `I` in MinBWs, truncate the operands, create a 3543 // truncated version of `I` and reextend its result. InstCombine runs 3544 // later and will remove any ext/trunc pairs. 3545 SmallPtrSet<Value *, 4> Erased; 3546 for (const auto &KV : Cost->getMinimalBitwidths()) { 3547 // If the value wasn't vectorized, we must maintain the original scalar 3548 // type. The absence of the value from State indicates that it 3549 // wasn't vectorized. 3550 // FIXME: Should not rely on getVPValue at this point. 3551 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3552 if (!State.hasAnyVectorValue(Def)) 3553 continue; 3554 for (unsigned Part = 0; Part < UF; ++Part) { 3555 Value *I = State.get(Def, Part); 3556 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3557 continue; 3558 Type *OriginalTy = I->getType(); 3559 Type *ScalarTruncatedTy = 3560 IntegerType::get(OriginalTy->getContext(), KV.second); 3561 auto *TruncatedTy = VectorType::get( 3562 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3563 if (TruncatedTy == OriginalTy) 3564 continue; 3565 3566 IRBuilder<> B(cast<Instruction>(I)); 3567 auto ShrinkOperand = [&](Value *V) -> Value * { 3568 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3569 if (ZI->getSrcTy() == TruncatedTy) 3570 return ZI->getOperand(0); 3571 return B.CreateZExtOrTrunc(V, TruncatedTy); 3572 }; 3573 3574 // The actual instruction modification depends on the instruction type, 3575 // unfortunately. 
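// Illustrative example (hypothetical MinBWs entry, not from the original
// source): an i32 add whose result is only ever consumed as an i8 has its
// operands truncated to <VF x i8>, is re-emitted as an i8 add below, and the
// result is zero-extended back to <VF x i32>; InstCombine later removes any
// redundant trunc/ext pairs.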
3576 Value *NewI = nullptr; 3577 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3578 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3579 ShrinkOperand(BO->getOperand(1))); 3580 3581 // Any wrapping introduced by shrinking this operation shouldn't be 3582 // considered undefined behavior. So, we can't unconditionally copy 3583 // arithmetic wrapping flags to NewI. 3584 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3585 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3586 NewI = 3587 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3588 ShrinkOperand(CI->getOperand(1))); 3589 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3590 NewI = B.CreateSelect(SI->getCondition(), 3591 ShrinkOperand(SI->getTrueValue()), 3592 ShrinkOperand(SI->getFalseValue())); 3593 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3594 switch (CI->getOpcode()) { 3595 default: 3596 llvm_unreachable("Unhandled cast!"); 3597 case Instruction::Trunc: 3598 NewI = ShrinkOperand(CI->getOperand(0)); 3599 break; 3600 case Instruction::SExt: 3601 NewI = B.CreateSExtOrTrunc( 3602 CI->getOperand(0), 3603 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3604 break; 3605 case Instruction::ZExt: 3606 NewI = B.CreateZExtOrTrunc( 3607 CI->getOperand(0), 3608 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3609 break; 3610 } 3611 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3612 auto Elements0 = 3613 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3614 auto *O0 = B.CreateZExtOrTrunc( 3615 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3616 auto Elements1 = 3617 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3618 auto *O1 = B.CreateZExtOrTrunc( 3619 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3620 3621 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3622 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3623 // Don't do anything with the operands, just extend the result. 3624 continue; 3625 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3626 auto Elements = 3627 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3628 auto *O0 = B.CreateZExtOrTrunc( 3629 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3630 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3631 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3632 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3633 auto Elements = 3634 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3635 auto *O0 = B.CreateZExtOrTrunc( 3636 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3637 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3638 } else { 3639 // If we don't know what to do, be conservative and don't do anything. 3640 continue; 3641 } 3642 3643 // Lastly, extend the result. 3644 NewI->takeName(cast<Instruction>(I)); 3645 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3646 I->replaceAllUsesWith(Res); 3647 cast<Instruction>(I)->eraseFromParent(); 3648 Erased.insert(I); 3649 State.reset(Def, Res, Part); 3650 } 3651 } 3652 3653 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3654 for (const auto &KV : Cost->getMinimalBitwidths()) { 3655 // If the value wasn't vectorized, we must maintain the original scalar 3656 // type. The absence of the value from State indicates that it 3657 // wasn't vectorized. 3658 // FIXME: Should not rely on getVPValue at this point. 
3659 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3660 if (!State.hasAnyVectorValue(Def))
3661 continue;
3662 for (unsigned Part = 0; Part < UF; ++Part) {
3663 Value *I = State.get(Def, Part);
3664 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3665 if (Inst && Inst->use_empty()) {
3666 Value *NewI = Inst->getOperand(0);
3667 Inst->eraseFromParent();
3668 State.reset(Def, NewI, Part);
3669 }
3670 }
3671 }
3672 }
3673
3674 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3675 VPlan &Plan) {
3676 // Insert truncates and extends for any truncated instructions as hints to
3677 // InstCombine.
3678 if (VF.isVector())
3679 truncateToMinimalBitwidths(State);
3680
3681 // Fix widened non-induction PHIs by setting up the PHI operands.
3682 if (OrigPHIsToFix.size()) {
3683 assert(EnableVPlanNativePath &&
3684 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3685 fixNonInductionPHIs(State);
3686 }
3687
3688 // At this point every instruction in the original loop is widened to a
3689 // vector form. Now we need to fix the recurrences in the loop. These PHI
3690 // nodes are currently empty because we did not want to introduce cycles.
3691 // This is the second stage of vectorizing recurrences.
3692 fixCrossIterationPHIs(State);
3693
3694 // Forget the original basic block.
3695 PSE.getSE()->forgetLoop(OrigLoop);
3696
3697 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitBasicBlock();
3698 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3699 if (Cost->requiresScalarEpilogue(VF)) {
3700 // No edge from the middle block to the unique exit block has been inserted
3701 // and there is nothing to fix from vector loop; phis should have incoming
3702 // from scalar loop only.
3703 Plan.clearLiveOuts();
3704 } else {
3705 // If we inserted an edge from the middle block to the unique exit block,
3706 // update uses outside the loop (phis) to account for the newly inserted
3707 // edge.
3708
3709 // Fix-up external users of the induction variables.
3710 for (auto &Entry : Legal->getInductionVars())
3711 fixupIVUsers(Entry.first, Entry.second,
3712 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3713 IVEndValues[Entry.first], LoopMiddleBlock,
3714 VectorLoop->getHeader(), Plan);
3715 }
3716
3717 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3718 // in the exit block, so update the builder.
3719 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3720 for (auto &KV : Plan.getLiveOuts())
3721 KV.second->fixPhi(Plan, State);
3722
3723 for (Instruction *PI : PredicatedInstructions)
3724 sinkScalarOperands(&*PI);
3725
3726 // Remove redundant induction instructions.
3727 cse(VectorLoop->getHeader());
3728
3729 // Set/update profile weights for the vector and remainder loops as original
3730 // loop iterations are now distributed among them. Note that the original loop,
3731 // represented by LoopScalarBody, becomes the remainder loop after vectorization.
3732 //
3733 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3734 // end up getting a slightly roughened result but that should be OK since
3735 // profile is not inherently precise anyway. Note also that a possible bypass of
3736 // vector code caused by legality checks is ignored, assigning all the weight
3737 // to the vector loop, optimistically.
3738 // 3739 // For scalable vectorization we can't know at compile time how many iterations 3740 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3741 // vscale of '1'. 3742 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3743 LI->getLoopFor(LoopScalarBody), 3744 VF.getKnownMinValue() * UF); 3745 } 3746 3747 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3748 // In order to support recurrences we need to be able to vectorize Phi nodes. 3749 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3750 // stage #2: We now need to fix the recurrences by adding incoming edges to 3751 // the currently empty PHI nodes. At this point every instruction in the 3752 // original loop is widened to a vector form so we can use them to construct 3753 // the incoming edges. 3754 VPBasicBlock *Header = 3755 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3756 for (VPRecipeBase &R : Header->phis()) { 3757 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3758 fixReduction(ReductionPhi, State); 3759 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3760 fixFirstOrderRecurrence(FOR, State); 3761 } 3762 } 3763 3764 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3765 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3766 // This is the second phase of vectorizing first-order recurrences. An 3767 // overview of the transformation is described below. Suppose we have the 3768 // following loop. 3769 // 3770 // for (int i = 0; i < n; ++i) 3771 // b[i] = a[i] - a[i - 1]; 3772 // 3773 // There is a first-order recurrence on "a". For this loop, the shorthand 3774 // scalar IR looks like: 3775 // 3776 // scalar.ph: 3777 // s_init = a[-1] 3778 // br scalar.body 3779 // 3780 // scalar.body: 3781 // i = phi [0, scalar.ph], [i+1, scalar.body] 3782 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3783 // s2 = a[i] 3784 // b[i] = s2 - s1 3785 // br cond, scalar.body, ... 3786 // 3787 // In this example, s1 is a recurrence because it's value depends on the 3788 // previous iteration. In the first phase of vectorization, we created a 3789 // vector phi v1 for s1. We now complete the vectorization and produce the 3790 // shorthand vector IR shown below (for VF = 4, UF = 1). 3791 // 3792 // vector.ph: 3793 // v_init = vector(..., ..., ..., a[-1]) 3794 // br vector.body 3795 // 3796 // vector.body 3797 // i = phi [0, vector.ph], [i+4, vector.body] 3798 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3799 // v2 = a[i, i+1, i+2, i+3]; 3800 // v3 = vector(v1(3), v2(0, 1, 2)) 3801 // b[i, i+1, i+2, i+3] = v2 - v3 3802 // br cond, vector.body, middle.block 3803 // 3804 // middle.block: 3805 // x = v2(3) 3806 // br scalar.ph 3807 // 3808 // scalar.ph: 3809 // s_init = phi [x, middle.block], [a[-1], otherwise] 3810 // br scalar.body 3811 // 3812 // After execution completes the vector loop, we extract the next value of 3813 // the recurrence (x) to use as the initial value in the scalar loop. 3814 3815 // Extract the last vector element in the middle block. This will be the 3816 // initial value for the recurrence when jumping to the scalar loop. 
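// Illustrative tie-in to the sketch above (fixed VF = 4, UF = 1): the extract
// below reads lane RuntimeVF - 1 == 3 of the last unrolled part, i.e. v2(3),
// which is the value 'x' that middle.block feeds into s_init in the scalar
// preheader.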
3817 VPValue *PreviousDef = PhiR->getBackedgeValue();
3818 Value *Incoming = State.get(PreviousDef, UF - 1);
3819 auto *ExtractForScalar = Incoming;
3820 auto *IdxTy = Builder.getInt32Ty();
3821 if (VF.isVector()) {
3822 auto *One = ConstantInt::get(IdxTy, 1);
3823 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3824 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3825 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3826 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3827 "vector.recur.extract");
3828 }
3829 // Extract the second-to-last element in the middle block if the
3830 // Phi is used outside the loop. We need to extract the phi itself
3831 // and not the last element (the phi update in the current iteration). This
3832 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3833 // when the scalar loop is not run at all.
3834 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3835 if (VF.isVector()) {
3836 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3837 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3838 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3839 Incoming, Idx, "vector.recur.extract.for.phi");
3840 } else if (UF > 1)
3841 // When the loop is unrolled without vectorizing, initialize
3842 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
3843 // of `Incoming`. This is analogous to the vectorized case above: extracting
3844 // the second-to-last element when VF > 1.
3845 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3846
3847 // Fix the initial value of the original recurrence in the scalar loop.
3848 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3849 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3850 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3851 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3852 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3853 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3854 Start->addIncoming(Incoming, BB);
3855 }
3856
3857 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3858 Phi->setName("scalar.recur");
3859
3860 // Finally, fix users of the recurrence outside the loop. The users will need
3861 // either the last value of the scalar recurrence or the last value of the
3862 // vector recurrence we extracted in the middle block. Since the loop is in
3863 // LCSSA form, we just need to find all the phi nodes for the original scalar
3864 // recurrence in the exit block, and then add an edge for the middle block.
3865 // Note that LCSSA does not imply single entry when the original scalar loop
3866 // had multiple exiting edges (as we always run the last iteration in the
3867 // scalar epilogue); in that case, there is no edge from the middle block to
3868 // the exit block, and thus no phis which need to be updated.
3869 if (!Cost->requiresScalarEpilogue(VF))
3870 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3871 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3872 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3873 State.Plan->removeLiveOut(&LCSSAPhi);
3874 }
3875 }
3876
3877 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3878 VPTransformState &State) {
3879 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3880 // Get its reduction variable descriptor.
3881 assert(Legal->isReductionVariable(OrigPhi) && 3882 "Unable to find the reduction variable"); 3883 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3884 3885 RecurKind RK = RdxDesc.getRecurrenceKind(); 3886 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3887 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3888 setDebugLocFromInst(ReductionStartValue); 3889 3890 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3891 // This is the vector-clone of the value that leaves the loop. 3892 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 3893 3894 // Wrap flags are in general invalid after vectorization, clear them. 3895 clearReductionWrapFlags(PhiR, State); 3896 3897 // Before each round, move the insertion point right between 3898 // the PHIs and the values we are going to write. 3899 // This allows us to write both PHINodes and the extractelement 3900 // instructions. 3901 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3902 3903 setDebugLocFromInst(LoopExitInst); 3904 3905 Type *PhiTy = OrigPhi->getType(); 3906 3907 VPBasicBlock *LatchVPBB = 3908 PhiR->getParent()->getEnclosingLoopRegion()->getExitBasicBlock(); 3909 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; 3910 // If tail is folded by masking, the vector value to leave the loop should be 3911 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3912 // instead of the former. For an inloop reduction the reduction will already 3913 // be predicated, and does not need to be handled here. 3914 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3915 for (unsigned Part = 0; Part < UF; ++Part) { 3916 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3917 Value *Sel = nullptr; 3918 for (User *U : VecLoopExitInst->users()) { 3919 if (isa<SelectInst>(U)) { 3920 assert(!Sel && "Reduction exit feeding two selects"); 3921 Sel = U; 3922 } else 3923 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3924 } 3925 assert(Sel && "Reduction exit feeds no select"); 3926 State.reset(LoopExitInstDef, Sel, Part); 3927 3928 // If the target can create a predicated operator for the reduction at no 3929 // extra cost in the loop (for example a predicated vadd), it can be 3930 // cheaper for the select to remain in the loop than be sunk out of it, 3931 // and so use the select value for the phi instead of the old 3932 // LoopExitValue. 3933 if (PreferPredicatedReductionSelect || 3934 TTI->preferPredicatedReductionSelect( 3935 RdxDesc.getOpcode(), PhiTy, 3936 TargetTransformInfo::ReductionFlags())) { 3937 auto *VecRdxPhi = 3938 cast<PHINode>(State.get(PhiR, Part)); 3939 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3940 } 3941 } 3942 } 3943 3944 // If the vector reduction can be performed in a smaller type, we truncate 3945 // then extend the loop exit value to enable InstCombine to evaluate the 3946 // entire expression in the smaller type. 3947 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3948 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3949 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3950 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3951 VectorParts RdxParts(UF); 3952 for (unsigned Part = 0; Part < UF; ++Part) { 3953 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3954 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3955 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 3956 : Builder.CreateZExt(Trunc, VecTy); 3957 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3958 if (U != Trunc) { 3959 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3960 RdxParts[Part] = Extnd; 3961 } 3962 } 3963 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3964 for (unsigned Part = 0; Part < UF; ++Part) { 3965 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3966 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3967 } 3968 } 3969 3970 // Reduce all of the unrolled parts into a single vector. 3971 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3972 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3973 3974 // The middle block terminator has already been assigned a DebugLoc here (the 3975 // OrigLoop's single latch terminator). We want the whole middle block to 3976 // appear to execute on this line because: (a) it is all compiler generated, 3977 // (b) these instructions are always executed after evaluating the latch 3978 // conditional branch, and (c) other passes may add new predecessors which 3979 // terminate on this line. This is the easiest way to ensure we don't 3980 // accidentally cause an extra step back into the loop while debugging. 3981 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3982 if (PhiR->isOrdered()) 3983 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3984 else { 3985 // Floating-point operations should have some FMF to enable the reduction. 3986 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3987 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3988 for (unsigned Part = 1; Part < UF; ++Part) { 3989 Value *RdxPart = State.get(LoopExitInstDef, Part); 3990 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 3991 ReducedPartRdx = Builder.CreateBinOp( 3992 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 3993 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 3994 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 3995 ReducedPartRdx, RdxPart); 3996 else 3997 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 3998 } 3999 } 4000 4001 // Create the reduction after the loop. Note that inloop reductions create the 4002 // target reduction in the loop using a Reduction recipe. 4003 if (VF.isVector() && !PhiR->isInLoop()) { 4004 ReducedPartRdx = 4005 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 4006 // If the reduction can be performed in a smaller type, we need to extend 4007 // the reduction to the wider type before we branch to the original loop. 4008 if (PhiTy != RdxDesc.getRecurrenceType()) 4009 ReducedPartRdx = RdxDesc.isSigned() 4010 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4011 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4012 } 4013 4014 PHINode *ResumePhi = 4015 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 4016 4017 // Create a phi node that merges control-flow from the backedge-taken check 4018 // block and the middle block. 4019 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4020 LoopScalarPreHeader->getTerminator()); 4021 4022 // If we are fixing reductions in the epilogue loop then we should already 4023 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 4024 // we carry over the incoming values correctly. 
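// Illustrative example (hypothetical integer add reduction, UF = 2, not from
// the original source): the two unrolled parts were combined above with a
// single 'bin.rdx' add, createTargetReduction then reduced the resulting
// vector to a scalar (e.g. via llvm.vector.reduce.add), and the loop below
// wires that scalar into 'bc.merge.rdx' for the scalar preheader.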
4025 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 4026 if (Incoming == LoopMiddleBlock) 4027 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming); 4028 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming)) 4029 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 4030 Incoming); 4031 else 4032 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 4033 } 4034 4035 // Set the resume value for this reduction 4036 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi}); 4037 4038 // If there were stores of the reduction value to a uniform memory address 4039 // inside the loop, create the final store here. 4040 if (StoreInst *SI = RdxDesc.IntermediateStore) { 4041 StoreInst *NewSI = 4042 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand()); 4043 propagateMetadata(NewSI, SI); 4044 4045 // If the reduction value is used in other places, 4046 // then let the code below create PHI's for that. 4047 } 4048 4049 // Now, we need to fix the users of the reduction variable 4050 // inside and outside of the scalar remainder loop. 4051 4052 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4053 // in the exit blocks. See comment on analogous loop in 4054 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4055 if (!Cost->requiresScalarEpilogue(VF)) 4056 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4057 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) { 4058 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4059 State.Plan->removeLiveOut(&LCSSAPhi); 4060 } 4061 4062 // Fix the scalar loop reduction variable with the incoming reduction sum 4063 // from the vector body and from the backedge value. 4064 int IncomingEdgeBlockIdx = 4065 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4066 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4067 // Pick the other block. 4068 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4069 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4070 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4071 } 4072 4073 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 4074 VPTransformState &State) { 4075 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 4076 RecurKind RK = RdxDesc.getRecurrenceKind(); 4077 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4078 return; 4079 4080 SmallVector<VPValue *, 8> Worklist; 4081 SmallPtrSet<VPValue *, 8> Visited; 4082 Worklist.push_back(PhiR); 4083 Visited.insert(PhiR); 4084 4085 while (!Worklist.empty()) { 4086 VPValue *Cur = Worklist.pop_back_val(); 4087 for (unsigned Part = 0; Part < UF; ++Part) { 4088 Value *V = State.get(Cur, Part); 4089 if (!isa<OverflowingBinaryOperator>(V)) 4090 break; 4091 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4092 } 4093 4094 for (VPUser *U : Cur->users()) { 4095 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 4096 if (!UserRecipe) 4097 continue; 4098 for (VPValue *V : UserRecipe->definedValues()) 4099 if (Visited.insert(V).second) 4100 Worklist.push_back(V); 4101 } 4102 } 4103 } 4104 4105 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4106 // The basic block and loop containing the predicated instruction. 4107 auto *PredBB = PredInst->getParent(); 4108 auto *VectorLoop = LI->getLoopFor(PredBB); 4109 4110 // Initialize a worklist with the operands of the predicated instruction. 
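// Illustrative sketch (hypothetical IR, not from this file). Before sinking,
// a scalarized address computation that feeds only the predicated store may
// still sit in the main vector body:
//
//   vector.body:
//     %gep = getelementptr inbounds i32, i32* %a, i64 %idx
//     br i1 %mask.0, label %pred.store.if, label %pred.store.continue
//   pred.store.if:
//     store i32 %val.0, i32* %gep
//
// Because %gep's only use is inside pred.store.if, it can be moved there so
// that masked-off lanes never execute it. The worklist initialized below
// drives this iterative search.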
4111 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4112 4113 // Holds instructions that we need to analyze again. An instruction may be 4114 // reanalyzed if we don't yet know if we can sink it or not. 4115 SmallVector<Instruction *, 8> InstsToReanalyze; 4116 4117 // Returns true if a given use occurs in the predicated block. Phi nodes use 4118 // their operands in their corresponding predecessor blocks. 4119 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4120 auto *I = cast<Instruction>(U.getUser()); 4121 BasicBlock *BB = I->getParent(); 4122 if (auto *Phi = dyn_cast<PHINode>(I)) 4123 BB = Phi->getIncomingBlock( 4124 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4125 return BB == PredBB; 4126 }; 4127 4128 // Iteratively sink the scalarized operands of the predicated instruction 4129 // into the block we created for it. When an instruction is sunk, it's 4130 // operands are then added to the worklist. The algorithm ends after one pass 4131 // through the worklist doesn't sink a single instruction. 4132 bool Changed; 4133 do { 4134 // Add the instructions that need to be reanalyzed to the worklist, and 4135 // reset the changed indicator. 4136 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4137 InstsToReanalyze.clear(); 4138 Changed = false; 4139 4140 while (!Worklist.empty()) { 4141 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4142 4143 // We can't sink an instruction if it is a phi node, is not in the loop, 4144 // or may have side effects. 4145 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4146 I->mayHaveSideEffects()) 4147 continue; 4148 4149 // If the instruction is already in PredBB, check if we can sink its 4150 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4151 // sinking the scalar instruction I, hence it appears in PredBB; but it 4152 // may have failed to sink I's operands (recursively), which we try 4153 // (again) here. 4154 if (I->getParent() == PredBB) { 4155 Worklist.insert(I->op_begin(), I->op_end()); 4156 continue; 4157 } 4158 4159 // It's legal to sink the instruction if all its uses occur in the 4160 // predicated block. Otherwise, there's nothing to do yet, and we may 4161 // need to reanalyze the instruction. 4162 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4163 InstsToReanalyze.push_back(I); 4164 continue; 4165 } 4166 4167 // Move the instruction to the beginning of the predicated block, and add 4168 // it's operands to the worklist. 4169 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4170 Worklist.insert(I->op_begin(), I->op_end()); 4171 4172 // The sinking may have enabled other instructions to be sunk, so we will 4173 // need to iterate. 4174 Changed = true; 4175 } 4176 } while (Changed); 4177 } 4178 4179 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4180 for (PHINode *OrigPhi : OrigPHIsToFix) { 4181 VPWidenPHIRecipe *VPPhi = 4182 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4183 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4184 // Make sure the builder has a valid insert point. 
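// The phi was created without incoming values by widenPHIInstruction() (the
// VPlan-native path); now that all vector blocks exist, each VPlan
// (incoming value, incoming block) pair can be translated into IR. As a
// purely hypothetical illustration, a widened header phi ends up as:
//
//   %vec.phi = phi <4 x i32> [ %init, %vector.ph ], [ %next, %vector.latch ]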
4185 Builder.SetInsertPoint(NewPhi); 4186 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4187 VPValue *Inc = VPPhi->getIncomingValue(i); 4188 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4189 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4190 } 4191 } 4192 } 4193 4194 bool InnerLoopVectorizer::useOrderedReductions( 4195 const RecurrenceDescriptor &RdxDesc) { 4196 return Cost->useOrderedReductions(RdxDesc); 4197 } 4198 4199 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4200 VPWidenPHIRecipe *PhiR, 4201 VPTransformState &State) { 4202 assert(EnableVPlanNativePath && 4203 "Non-native vplans are not expected to have VPWidenPHIRecipes."); 4204 // Currently we enter here in the VPlan-native path for non-induction 4205 // PHIs where all control flow is uniform. We simply widen these PHIs. 4206 // Create a vector phi with no operands - the vector phi operands will be 4207 // set at the end of vector code generation. 4208 Type *VecTy = (State.VF.isScalar()) 4209 ? PN->getType() 4210 : VectorType::get(PN->getType(), State.VF); 4211 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4212 State.set(PhiR, VecPhi, 0); 4213 OrigPHIsToFix.push_back(cast<PHINode>(PN)); 4214 } 4215 4216 /// A helper function for checking whether an integer division-related 4217 /// instruction may divide by zero (in which case it must be predicated if 4218 /// executed conditionally in the scalar code). 4219 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4220 /// Non-zero divisors that are non compile-time constants will not be 4221 /// converted into multiplication, so we will still end up scalarizing 4222 /// the division, but can do so w/o predication. 4223 static bool mayDivideByZero(Instruction &I) { 4224 assert((I.getOpcode() == Instruction::UDiv || 4225 I.getOpcode() == Instruction::SDiv || 4226 I.getOpcode() == Instruction::URem || 4227 I.getOpcode() == Instruction::SRem) && 4228 "Unexpected instruction"); 4229 Value *Divisor = I.getOperand(1); 4230 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4231 return !CInt || CInt->isZero(); 4232 } 4233 4234 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4235 VPUser &ArgOperands, 4236 VPTransformState &State) { 4237 assert(!isa<DbgInfoIntrinsic>(I) && 4238 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4239 setDebugLocFromInst(&I); 4240 4241 Module *M = I.getParent()->getParent()->getParent(); 4242 auto *CI = cast<CallInst>(&I); 4243 4244 SmallVector<Type *, 4> Tys; 4245 for (Value *ArgOperand : CI->args()) 4246 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4247 4248 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4249 4250 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4251 // version of the instruction. 4252 // Is it beneficial to perform intrinsic call compared to lib call? 4253 bool NeedToScalarize = false; 4254 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4255 InstructionCost IntrinsicCost = ID ? 
Cost->getVectorIntrinsicCost(CI, VF) : 0; 4256 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4257 assert((UseVectorIntrinsic || !NeedToScalarize) && 4258 "Instruction should be scalarized elsewhere."); 4259 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4260 "Either the intrinsic cost or vector call cost must be valid"); 4261 4262 for (unsigned Part = 0; Part < UF; ++Part) { 4263 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4264 SmallVector<Value *, 4> Args; 4265 for (auto &I : enumerate(ArgOperands.operands())) { 4266 // Some intrinsics have a scalar argument - don't replace it with a 4267 // vector. 4268 Value *Arg; 4269 if (!UseVectorIntrinsic || 4270 !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) 4271 Arg = State.get(I.value(), Part); 4272 else 4273 Arg = State.get(I.value(), VPIteration(0, 0)); 4274 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) 4275 TysForDecl.push_back(Arg->getType()); 4276 Args.push_back(Arg); 4277 } 4278 4279 Function *VectorF; 4280 if (UseVectorIntrinsic) { 4281 // Use vector version of the intrinsic. 4282 if (VF.isVector()) 4283 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4284 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4285 assert(VectorF && "Can't retrieve vector intrinsic."); 4286 } else { 4287 // Use vector version of the function call. 4288 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4289 #ifndef NDEBUG 4290 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4291 "Can't create vector function."); 4292 #endif 4293 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4294 } 4295 SmallVector<OperandBundleDef, 1> OpBundles; 4296 CI->getOperandBundlesAsDefs(OpBundles); 4297 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4298 4299 if (isa<FPMathOperator>(V)) 4300 V->copyFastMathFlags(CI); 4301 4302 State.set(Def, V, Part); 4303 addMetadata(V, &I); 4304 } 4305 } 4306 4307 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4308 // We should not collect Scalars more than once per VF. Right now, this 4309 // function is called from collectUniformsAndScalars(), which already does 4310 // this check. Collecting Scalars for VF=1 does not make any sense. 4311 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4312 "This function should not be visited twice for the same VF"); 4313 4314 // This avoids any chances of creating a REPLICATE recipe during planning 4315 // since that would result in generation of scalarized code during execution, 4316 // which is not supported for scalable vectors. 4317 if (VF.isScalable()) { 4318 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4319 return; 4320 } 4321 4322 SmallSetVector<Instruction *, 8> Worklist; 4323 4324 // These sets are used to seed the analysis with pointers used by memory 4325 // accesses that will remain scalar. 4326 SmallSetVector<Instruction *, 8> ScalarPtrs; 4327 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4328 auto *Latch = TheLoop->getLoopLatch(); 4329 4330 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4331 // The pointer operands of loads and stores will be scalar as long as the 4332 // memory access is not a gather or scatter operation. The value operand of a 4333 // store will remain scalar if the store is scalarized. 
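// For example (illustrative only), in a loop reading A[i] the widening
// decision is CM_Widen and only the lane-0 address is needed, so the GEP
// feeding the load counts as a scalar use; in a loop reading A[B[i]] the
// decision is CM_GatherScatter and a whole vector of addresses is required,
// so that GEP does not:
//
//   for (i = 0; i < n; ++i) s += A[i];     // consecutive: pointer stays scalar
//   for (i = 0; i < n; ++i) s += A[B[i]];  // gather: pointer must be widened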
4334 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4335 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4336 assert(WideningDecision != CM_Unknown && 4337 "Widening decision should be ready at this moment"); 4338 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4339 if (Ptr == Store->getValueOperand()) 4340 return WideningDecision == CM_Scalarize; 4341 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4342 "Ptr is neither a value or pointer operand"); 4343 return WideningDecision != CM_GatherScatter; 4344 }; 4345 4346 // A helper that returns true if the given value is a bitcast or 4347 // getelementptr instruction contained in the loop. 4348 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4349 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4350 isa<GetElementPtrInst>(V)) && 4351 !TheLoop->isLoopInvariant(V); 4352 }; 4353 4354 // A helper that evaluates a memory access's use of a pointer. If the use will 4355 // be a scalar use and the pointer is only used by memory accesses, we place 4356 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4357 // PossibleNonScalarPtrs. 4358 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4359 // We only care about bitcast and getelementptr instructions contained in 4360 // the loop. 4361 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4362 return; 4363 4364 // If the pointer has already been identified as scalar (e.g., if it was 4365 // also identified as uniform), there's nothing to do. 4366 auto *I = cast<Instruction>(Ptr); 4367 if (Worklist.count(I)) 4368 return; 4369 4370 // If the use of the pointer will be a scalar use, and all users of the 4371 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4372 // place the pointer in PossibleNonScalarPtrs. 4373 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4374 return isa<LoadInst>(U) || isa<StoreInst>(U); 4375 })) 4376 ScalarPtrs.insert(I); 4377 else 4378 PossibleNonScalarPtrs.insert(I); 4379 }; 4380 4381 // We seed the scalars analysis with three classes of instructions: (1) 4382 // instructions marked uniform-after-vectorization and (2) bitcast, 4383 // getelementptr and (pointer) phi instructions used by memory accesses 4384 // requiring a scalar use. 4385 // 4386 // (1) Add to the worklist all instructions that have been identified as 4387 // uniform-after-vectorization. 4388 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4389 4390 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4391 // memory accesses requiring a scalar use. The pointer operands of loads and 4392 // stores will be scalar as long as the memory accesses is not a gather or 4393 // scatter operation. The value operand of a store will remain scalar if the 4394 // store is scalarized. 4395 for (auto *BB : TheLoop->blocks()) 4396 for (auto &I : *BB) { 4397 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4398 evaluatePtrUse(Load, Load->getPointerOperand()); 4399 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4400 evaluatePtrUse(Store, Store->getPointerOperand()); 4401 evaluatePtrUse(Store, Store->getValueOperand()); 4402 } 4403 } 4404 for (auto *I : ScalarPtrs) 4405 if (!PossibleNonScalarPtrs.count(I)) { 4406 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4407 Worklist.insert(I); 4408 } 4409 4410 // Insert the forced scalars. 
4411 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4412 // induction variable when the PHI user is scalarized. 4413 auto ForcedScalar = ForcedScalars.find(VF); 4414 if (ForcedScalar != ForcedScalars.end()) 4415 for (auto *I : ForcedScalar->second) 4416 Worklist.insert(I); 4417 4418 // Expand the worklist by looking through any bitcasts and getelementptr 4419 // instructions we've already identified as scalar. This is similar to the 4420 // expansion step in collectLoopUniforms(); however, here we're only 4421 // expanding to include additional bitcasts and getelementptr instructions. 4422 unsigned Idx = 0; 4423 while (Idx != Worklist.size()) { 4424 Instruction *Dst = Worklist[Idx++]; 4425 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4426 continue; 4427 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4428 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4429 auto *J = cast<Instruction>(U); 4430 return !TheLoop->contains(J) || Worklist.count(J) || 4431 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4432 isScalarUse(J, Src)); 4433 })) { 4434 Worklist.insert(Src); 4435 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4436 } 4437 } 4438 4439 // An induction variable will remain scalar if all users of the induction 4440 // variable and induction variable update remain scalar. 4441 for (auto &Induction : Legal->getInductionVars()) { 4442 auto *Ind = Induction.first; 4443 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4444 4445 // If tail-folding is applied, the primary induction variable will be used 4446 // to feed a vector compare. 4447 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4448 continue; 4449 4450 // Returns true if \p Indvar is a pointer induction that is used directly by 4451 // load/store instruction \p I. 4452 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4453 Instruction *I) { 4454 return Induction.second.getKind() == 4455 InductionDescriptor::IK_PtrInduction && 4456 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4457 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4458 }; 4459 4460 // Determine if all users of the induction variable are scalar after 4461 // vectorization. 4462 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4463 auto *I = cast<Instruction>(U); 4464 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4465 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4466 }); 4467 if (!ScalarInd) 4468 continue; 4469 4470 // Determine if all users of the induction variable update instruction are 4471 // scalar after vectorization. 4472 auto ScalarIndUpdate = 4473 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4474 auto *I = cast<Instruction>(U); 4475 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4476 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4477 }); 4478 if (!ScalarIndUpdate) 4479 continue; 4480 4481 // The induction variable and its update instruction will remain scalar. 
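// For instance (hypothetical example), in
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// the in-loop users of i are the consecutive GEPs (already scalar pointer
// uses) and the increment, and i.next is only used by the phi and the latch
// compare (itself uniform and hence already in the worklist), so both i and
// i.next are kept scalar instead of materializing a dead wide induction.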
4482 Worklist.insert(Ind); 4483 Worklist.insert(IndUpdate); 4484 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4485 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4486 << "\n"); 4487 } 4488 4489 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4490 } 4491 4492 bool LoopVectorizationCostModel::isScalarWithPredication( 4493 Instruction *I, ElementCount VF) const { 4494 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4495 return false; 4496 switch(I->getOpcode()) { 4497 default: 4498 break; 4499 case Instruction::Load: 4500 case Instruction::Store: { 4501 if (!Legal->isMaskRequired(I)) 4502 return false; 4503 auto *Ptr = getLoadStorePointerOperand(I); 4504 auto *Ty = getLoadStoreType(I); 4505 Type *VTy = Ty; 4506 if (VF.isVector()) 4507 VTy = VectorType::get(Ty, VF); 4508 const Align Alignment = getLoadStoreAlignment(I); 4509 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4510 TTI.isLegalMaskedGather(VTy, Alignment)) 4511 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4512 TTI.isLegalMaskedScatter(VTy, Alignment)); 4513 } 4514 case Instruction::UDiv: 4515 case Instruction::SDiv: 4516 case Instruction::SRem: 4517 case Instruction::URem: 4518 return mayDivideByZero(*I); 4519 } 4520 return false; 4521 } 4522 4523 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4524 Instruction *I, ElementCount VF) { 4525 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4526 assert(getWideningDecision(I, VF) == CM_Unknown && 4527 "Decision should not be set yet."); 4528 auto *Group = getInterleavedAccessGroup(I); 4529 assert(Group && "Must have a group."); 4530 4531 // If the instruction's allocated size doesn't equal it's type size, it 4532 // requires padding and will be scalarized. 4533 auto &DL = I->getModule()->getDataLayout(); 4534 auto *ScalarTy = getLoadStoreType(I); 4535 if (hasIrregularType(ScalarTy, DL)) 4536 return false; 4537 4538 // If the group involves a non-integral pointer, we may not be able to 4539 // losslessly cast all values to a common type. 4540 unsigned InterleaveFactor = Group->getFactor(); 4541 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4542 for (unsigned i = 0; i < InterleaveFactor; i++) { 4543 Instruction *Member = Group->getMember(i); 4544 if (!Member) 4545 continue; 4546 auto *MemberTy = getLoadStoreType(Member); 4547 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4548 // Don't coerce non-integral pointers to integers or vice versa. 4549 if (MemberNI != ScalarNI) { 4550 // TODO: Consider adding special nullptr value case here 4551 return false; 4552 } else if (MemberNI && ScalarNI && 4553 ScalarTy->getPointerAddressSpace() != 4554 MemberTy->getPointerAddressSpace()) { 4555 return false; 4556 } 4557 } 4558 4559 // Check if masking is required. 4560 // A Group may need masking for one of two reasons: it resides in a block that 4561 // needs predication, or it was decided to use masking to deal with gaps 4562 // (either a gap at the end of a load-access that may result in a speculative 4563 // load, or any gaps in a store-access). 
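// A hypothetical example of the gap case: a factor-2 store group that only
// writes the even elements,
//
//   for (i = 0; i < n; ++i)
//     A[2 * i] = x;            // no member for A[2*i + 1]
//
// can only be widened as an interleaved store if the odd lanes of the wide
// store are masked off; otherwise it would clobber memory that the scalar
// loop never writes.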
4564 bool PredicatedAccessRequiresMasking = 4565 blockNeedsPredicationForAnyReason(I->getParent()) && 4566 Legal->isMaskRequired(I); 4567 bool LoadAccessWithGapsRequiresEpilogMasking = 4568 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4569 !isScalarEpilogueAllowed(); 4570 bool StoreAccessWithGapsRequiresMasking = 4571 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4572 if (!PredicatedAccessRequiresMasking && 4573 !LoadAccessWithGapsRequiresEpilogMasking && 4574 !StoreAccessWithGapsRequiresMasking) 4575 return true; 4576 4577 // If masked interleaving is required, we expect that the user/target had 4578 // enabled it, because otherwise it either wouldn't have been created or 4579 // it should have been invalidated by the CostModel. 4580 assert(useMaskedInterleavedAccesses(TTI) && 4581 "Masked interleave-groups for predicated accesses are not enabled."); 4582 4583 if (Group->isReverse()) 4584 return false; 4585 4586 auto *Ty = getLoadStoreType(I); 4587 const Align Alignment = getLoadStoreAlignment(I); 4588 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4589 : TTI.isLegalMaskedStore(Ty, Alignment); 4590 } 4591 4592 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4593 Instruction *I, ElementCount VF) { 4594 // Get and ensure we have a valid memory instruction. 4595 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4596 4597 auto *Ptr = getLoadStorePointerOperand(I); 4598 auto *ScalarTy = getLoadStoreType(I); 4599 4600 // In order to be widened, the pointer should be consecutive, first of all. 4601 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4602 return false; 4603 4604 // If the instruction is a store located in a predicated block, it will be 4605 // scalarized. 4606 if (isScalarWithPredication(I, VF)) 4607 return false; 4608 4609 // If the instruction's allocated size doesn't equal it's type size, it 4610 // requires padding and will be scalarized. 4611 auto &DL = I->getModule()->getDataLayout(); 4612 if (hasIrregularType(ScalarTy, DL)) 4613 return false; 4614 4615 return true; 4616 } 4617 4618 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4619 // We should not collect Uniforms more than once per VF. Right now, 4620 // this function is called from collectUniformsAndScalars(), which 4621 // already does this check. Collecting Uniforms for VF=1 does not make any 4622 // sense. 4623 4624 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4625 "This function should not be visited twice for the same VF"); 4626 4627 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4628 // not analyze again. Uniforms.count(VF) will return 1. 4629 Uniforms[VF].clear(); 4630 4631 // We now know that the loop is vectorizable! 4632 // Collect instructions inside the loop that will remain uniform after 4633 // vectorization. 4634 4635 // Global values, params and instructions outside of current loop are out of 4636 // scope. 4637 auto isOutOfScope = [&](Value *V) -> bool { 4638 Instruction *I = dyn_cast<Instruction>(V); 4639 return (!I || !TheLoop->contains(I)); 4640 }; 4641 4642 // Worklist containing uniform instructions demanding lane 0. 4643 SetVector<Instruction *> Worklist; 4644 BasicBlock *Latch = TheLoop->getLoopLatch(); 4645 4646 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 4647 // that are scalar with predication must not be considered uniform after 4648 // vectorization, because that would create an erroneous replicating region 4649 // where only a single instance out of VF should be formed. 4650 // TODO: optimize such seldom cases if found important, see PR40816. 4651 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4652 if (isOutOfScope(I)) { 4653 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4654 << *I << "\n"); 4655 return; 4656 } 4657 if (isScalarWithPredication(I, VF)) { 4658 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4659 << *I << "\n"); 4660 return; 4661 } 4662 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4663 Worklist.insert(I); 4664 }; 4665 4666 // Start with the conditional branch. If the branch condition is an 4667 // instruction contained in the loop that is only used by the branch, it is 4668 // uniform. 4669 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4670 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4671 addToWorklistIfAllowed(Cmp); 4672 4673 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4674 InstWidening WideningDecision = getWideningDecision(I, VF); 4675 assert(WideningDecision != CM_Unknown && 4676 "Widening decision should be ready at this moment"); 4677 4678 // A uniform memory op is itself uniform. We exclude uniform stores 4679 // here as they demand the last lane, not the first one. 4680 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4681 assert(WideningDecision == CM_Scalarize); 4682 return true; 4683 } 4684 4685 return (WideningDecision == CM_Widen || 4686 WideningDecision == CM_Widen_Reverse || 4687 WideningDecision == CM_Interleave); 4688 }; 4689 4690 4691 // Returns true if Ptr is the pointer operand of a memory access instruction 4692 // I, and I is known to not require scalarization. 4693 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4694 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4695 }; 4696 4697 // Holds a list of values which are known to have at least one uniform use. 4698 // Note that there may be other uses which aren't uniform. A "uniform use" 4699 // here is something which only demands lane 0 of the unrolled iterations; 4700 // it does not imply that all lanes produce the same value (e.g. this is not 4701 // the usual meaning of uniform) 4702 SetVector<Value *> HasUniformUse; 4703 4704 // Scan the loop for instructions which are either a) known to have only 4705 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4706 for (auto *BB : TheLoop->blocks()) 4707 for (auto &I : *BB) { 4708 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4709 switch (II->getIntrinsicID()) { 4710 case Intrinsic::sideeffect: 4711 case Intrinsic::experimental_noalias_scope_decl: 4712 case Intrinsic::assume: 4713 case Intrinsic::lifetime_start: 4714 case Intrinsic::lifetime_end: 4715 if (TheLoop->hasLoopInvariantOperands(&I)) 4716 addToWorklistIfAllowed(&I); 4717 break; 4718 default: 4719 break; 4720 } 4721 } 4722 4723 // ExtractValue instructions must be uniform, because the operands are 4724 // known to be loop-invariant. 
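// For example (illustration only), extracting a field of an aggregate that
// was produced before the loop yields the same scalar on every iteration:
//
//   %agg = call { i32, i32 } @f()              ; defined outside the loop
//   ...
//   %lo = extractvalue { i32, i32 } %agg, 0    ; uniform inside the loop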
4725 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4726 assert(isOutOfScope(EVI->getAggregateOperand()) && 4727 "Expected aggregate value to be loop invariant"); 4728 addToWorklistIfAllowed(EVI); 4729 continue; 4730 } 4731 4732 // If there's no pointer operand, there's nothing to do. 4733 auto *Ptr = getLoadStorePointerOperand(&I); 4734 if (!Ptr) 4735 continue; 4736 4737 // A uniform memory op is itself uniform. We exclude uniform stores 4738 // here as they demand the last lane, not the first one. 4739 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4740 addToWorklistIfAllowed(&I); 4741 4742 if (isUniformDecision(&I, VF)) { 4743 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4744 HasUniformUse.insert(Ptr); 4745 } 4746 } 4747 4748 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4749 // demanding) users. Since loops are assumed to be in LCSSA form, this 4750 // disallows uses outside the loop as well. 4751 for (auto *V : HasUniformUse) { 4752 if (isOutOfScope(V)) 4753 continue; 4754 auto *I = cast<Instruction>(V); 4755 auto UsersAreMemAccesses = 4756 llvm::all_of(I->users(), [&](User *U) -> bool { 4757 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4758 }); 4759 if (UsersAreMemAccesses) 4760 addToWorklistIfAllowed(I); 4761 } 4762 4763 // Expand Worklist in topological order: whenever a new instruction 4764 // is added , its users should be already inside Worklist. It ensures 4765 // a uniform instruction will only be used by uniform instructions. 4766 unsigned idx = 0; 4767 while (idx != Worklist.size()) { 4768 Instruction *I = Worklist[idx++]; 4769 4770 for (auto OV : I->operand_values()) { 4771 // isOutOfScope operands cannot be uniform instructions. 4772 if (isOutOfScope(OV)) 4773 continue; 4774 // First order recurrence Phi's should typically be considered 4775 // non-uniform. 4776 auto *OP = dyn_cast<PHINode>(OV); 4777 if (OP && Legal->isFirstOrderRecurrence(OP)) 4778 continue; 4779 // If all the users of the operand are uniform, then add the 4780 // operand into the uniform worklist. 4781 auto *OI = cast<Instruction>(OV); 4782 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4783 auto *J = cast<Instruction>(U); 4784 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4785 })) 4786 addToWorklistIfAllowed(OI); 4787 } 4788 } 4789 4790 // For an instruction to be added into Worklist above, all its users inside 4791 // the loop should also be in Worklist. However, this condition cannot be 4792 // true for phi nodes that form a cyclic dependence. We must process phi 4793 // nodes separately. An induction variable will remain uniform if all users 4794 // of the induction variable and induction variable update remain uniform. 4795 // The code below handles both pointer and non-pointer induction variables. 4796 for (auto &Induction : Legal->getInductionVars()) { 4797 auto *Ind = Induction.first; 4798 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4799 4800 // Determine if all users of the induction variable are uniform after 4801 // vectorization. 4802 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4803 auto *I = cast<Instruction>(U); 4804 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4805 isVectorizedMemAccessUse(I, Ind); 4806 }); 4807 if (!UniformInd) 4808 continue; 4809 4810 // Determine if all users of the induction variable update instruction are 4811 // uniform after vectorization. 
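// As a hypothetical illustration: if i is only used to form the address of a
// consecutive load or store (a use demanding just lane 0 of each unrolled
// part) and by its own increment, then i and i.next stay uniform and only a
// scalar induction is needed; but if some user consumes the per-lane values,
// e.g. A[i] = i, the induction must be widened as well.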
4812 auto UniformIndUpdate = 4813 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4814 auto *I = cast<Instruction>(U); 4815 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4816 isVectorizedMemAccessUse(I, IndUpdate); 4817 }); 4818 if (!UniformIndUpdate) 4819 continue; 4820 4821 // The induction variable and its update instruction will remain uniform. 4822 addToWorklistIfAllowed(Ind); 4823 addToWorklistIfAllowed(IndUpdate); 4824 } 4825 4826 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4827 } 4828 4829 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4830 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4831 4832 if (Legal->getRuntimePointerChecking()->Need) { 4833 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4834 "runtime pointer checks needed. Enable vectorization of this " 4835 "loop with '#pragma clang loop vectorize(enable)' when " 4836 "compiling with -Os/-Oz", 4837 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4838 return true; 4839 } 4840 4841 if (!PSE.getPredicate().isAlwaysTrue()) { 4842 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4843 "runtime SCEV checks needed. Enable vectorization of this " 4844 "loop with '#pragma clang loop vectorize(enable)' when " 4845 "compiling with -Os/-Oz", 4846 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4847 return true; 4848 } 4849 4850 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4851 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4852 reportVectorizationFailure("Runtime stride check for small trip count", 4853 "runtime stride == 1 checks needed. Enable vectorization of " 4854 "this loop without such check by compiling with -Os/-Oz", 4855 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4856 return true; 4857 } 4858 4859 return false; 4860 } 4861 4862 ElementCount 4863 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4864 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4865 return ElementCount::getScalable(0); 4866 4867 if (Hints->isScalableVectorizationDisabled()) { 4868 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4869 "ScalableVectorizationDisabled", ORE, TheLoop); 4870 return ElementCount::getScalable(0); 4871 } 4872 4873 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4874 4875 auto MaxScalableVF = ElementCount::getScalable( 4876 std::numeric_limits<ElementCount::ScalarTy>::max()); 4877 4878 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4879 // FIXME: While for scalable vectors this is currently sufficient, this should 4880 // be replaced by a more detailed mechanism that filters out specific VFs, 4881 // instead of invalidating vectorization for a whole set of VFs based on the 4882 // MaxVF. 4883 4884 // Disable scalable vectorization if the loop contains unsupported reductions. 4885 if (!canVectorizeReductions(MaxScalableVF)) { 4886 reportVectorizationInfo( 4887 "Scalable vectorization not supported for the reduction " 4888 "operations found in this loop.", 4889 "ScalableVFUnfeasible", ORE, TheLoop); 4890 return ElementCount::getScalable(0); 4891 } 4892 4893 // Disable scalable vectorization if the loop contains any instructions 4894 // with element types not supported for scalable vectors. 
4895 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4896 return !Ty->isVoidTy() && 4897 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4898 })) { 4899 reportVectorizationInfo("Scalable vectorization is not supported " 4900 "for all element types found in this loop.", 4901 "ScalableVFUnfeasible", ORE, TheLoop); 4902 return ElementCount::getScalable(0); 4903 } 4904 4905 if (Legal->isSafeForAnyVectorWidth()) 4906 return MaxScalableVF; 4907 4908 // Limit MaxScalableVF by the maximum safe dependence distance. 4909 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4910 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4911 MaxVScale = 4912 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4913 MaxScalableVF = ElementCount::getScalable( 4914 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 4915 if (!MaxScalableVF) 4916 reportVectorizationInfo( 4917 "Max legal vector width too small, scalable vectorization " 4918 "unfeasible.", 4919 "ScalableVFUnfeasible", ORE, TheLoop); 4920 4921 return MaxScalableVF; 4922 } 4923 4924 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4925 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4926 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4927 unsigned SmallestType, WidestType; 4928 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4929 4930 // Get the maximum safe dependence distance in bits computed by LAA. 4931 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4932 // the memory accesses that is most restrictive (involved in the smallest 4933 // dependence distance). 4934 unsigned MaxSafeElements = 4935 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4936 4937 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4938 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4939 4940 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4941 << ".\n"); 4942 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4943 << ".\n"); 4944 4945 // First analyze the UserVF, fall back if the UserVF should be ignored. 4946 if (UserVF) { 4947 auto MaxSafeUserVF = 4948 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4949 4950 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4951 // If `VF=vscale x N` is safe, then so is `VF=N` 4952 if (UserVF.isScalable()) 4953 return FixedScalableVFPair( 4954 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4955 else 4956 return UserVF; 4957 } 4958 4959 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4960 4961 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4962 // is better to ignore the hint and let the compiler choose a suitable VF. 
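// Worked example (hypothetical numbers): if LAA limits the loop to 256 safe
// bits and the widest type is i64, MaxSafeFixedVF is 4. A user hint of VF=8
// is then clamped to 4 below, whereas a hint of VF=vscale x 8 is dropped
// entirely and the compiler picks its own factor.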
4963 if (!UserVF.isScalable()) { 4964 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4965 << " is unsafe, clamping to max safe VF=" 4966 << MaxSafeFixedVF << ".\n"); 4967 ORE->emit([&]() { 4968 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4969 TheLoop->getStartLoc(), 4970 TheLoop->getHeader()) 4971 << "User-specified vectorization factor " 4972 << ore::NV("UserVectorizationFactor", UserVF) 4973 << " is unsafe, clamping to maximum safe vectorization factor " 4974 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4975 }); 4976 return MaxSafeFixedVF; 4977 } 4978 4979 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4980 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4981 << " is ignored because scalable vectors are not " 4982 "available.\n"); 4983 ORE->emit([&]() { 4984 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4985 TheLoop->getStartLoc(), 4986 TheLoop->getHeader()) 4987 << "User-specified vectorization factor " 4988 << ore::NV("UserVectorizationFactor", UserVF) 4989 << " is ignored because the target does not support scalable " 4990 "vectors. The compiler will pick a more suitable value."; 4991 }); 4992 } else { 4993 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4994 << " is unsafe. Ignoring scalable UserVF.\n"); 4995 ORE->emit([&]() { 4996 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4997 TheLoop->getStartLoc(), 4998 TheLoop->getHeader()) 4999 << "User-specified vectorization factor " 5000 << ore::NV("UserVectorizationFactor", UserVF) 5001 << " is unsafe. Ignoring the hint to let the compiler pick a " 5002 "more suitable value."; 5003 }); 5004 } 5005 } 5006 5007 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5008 << " / " << WidestType << " bits.\n"); 5009 5010 FixedScalableVFPair Result(ElementCount::getFixed(1), 5011 ElementCount::getScalable(0)); 5012 if (auto MaxVF = 5013 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5014 MaxSafeFixedVF, FoldTailByMasking)) 5015 Result.FixedVF = MaxVF; 5016 5017 if (auto MaxVF = 5018 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 5019 MaxSafeScalableVF, FoldTailByMasking)) 5020 if (MaxVF.isScalable()) { 5021 Result.ScalableVF = MaxVF; 5022 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5023 << "\n"); 5024 } 5025 5026 return Result; 5027 } 5028 5029 FixedScalableVFPair 5030 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5031 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5032 // TODO: It may by useful to do since it's still likely to be dynamically 5033 // uniform if the target can skip. 5034 reportVectorizationFailure( 5035 "Not inserting runtime ptr check for divergent target", 5036 "runtime pointer checks needed. 
Not enabled for divergent target",
5037 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5038 return FixedScalableVFPair::getNone();
5039 }
5040
5041 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5042 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5043 if (TC == 1) {
5044 reportVectorizationFailure("Single iteration (non) loop",
5045 "loop trip count is one, irrelevant for vectorization",
5046 "SingleIterationLoop", ORE, TheLoop);
5047 return FixedScalableVFPair::getNone();
5048 }
5049
5050 switch (ScalarEpilogueStatus) {
5051 case CM_ScalarEpilogueAllowed:
5052 return computeFeasibleMaxVF(TC, UserVF, false);
5053 case CM_ScalarEpilogueNotAllowedUsePredicate:
5054 LLVM_FALLTHROUGH;
5055 case CM_ScalarEpilogueNotNeededUsePredicate:
5056 LLVM_DEBUG(
5057 dbgs() << "LV: vector predicate hint/switch found.\n"
5058 << "LV: Not allowing scalar epilogue, creating predicated "
5059 << "vector loop.\n");
5060 break;
5061 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5062 // fallthrough as a special case of OptForSize
5063 case CM_ScalarEpilogueNotAllowedOptSize:
5064 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5065 LLVM_DEBUG(
5066 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5067 else
5068 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5069 << "count.\n");
5070
5071 // Bail if runtime checks are required, which are not good when optimising
5072 // for size.
5073 if (runtimeChecksRequired())
5074 return FixedScalableVFPair::getNone();
5075
5076 break;
5077 }
5078
5079 // The only loops we can vectorize without a scalar epilogue are loops with
5080 // a bottom-test and a single exiting block. We'd have to handle the fact
5081 // that not every instruction executes on the last iteration. This will
5082 // require a lane mask which varies through the vector loop body. (TODO)
5083 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5084 // If there was a tail-folding hint/switch, but we can't fold the tail by
5085 // masking, fallback to a vectorization with a scalar epilogue.
5086 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5087 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5088 "scalar epilogue instead.\n");
5089 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5090 return computeFeasibleMaxVF(TC, UserVF, false);
5091 }
5092 return FixedScalableVFPair::getNone();
5093 }
5094
5095 // Now try the tail folding
5096
5097 // Invalidate interleave groups that require an epilogue if we can't mask
5098 // the interleave-group.
5099 if (!useMaskedInterleavedAccesses(TTI)) {
5100 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5101 "No decisions should have been taken at this point");
5102 // Note: There is no need to invalidate any cost modeling decisions here, as
5103 // none were taken so far.
5104 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5105 }
5106
5107 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5108 // Avoid tail folding if the trip count is known to be a multiple of any VF
5109 // we chose.
5110 // FIXME: The condition below pessimises the case for fixed-width vectors,
5111 // when scalable VFs are also candidates for vectorization.
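// Worked example (hypothetical numbers): with a compile-time trip count of
// 64, MaxFixedVF = 8 and no user-specified interleave count, 64 urem 8 == 0,
// so no tail remains and tail folding is skipped. With a trip count of 70,
// the non-zero remainder forces either tail folding (below) or a scalar
// epilogue.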
5112 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5113 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5114 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5115 "MaxFixedVF must be a power of 2"); 5116 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5117 : MaxFixedVF.getFixedValue(); 5118 ScalarEvolution *SE = PSE.getSE(); 5119 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5120 const SCEV *ExitCount = SE->getAddExpr( 5121 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5122 const SCEV *Rem = SE->getURemExpr( 5123 SE->applyLoopGuards(ExitCount, TheLoop), 5124 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5125 if (Rem->isZero()) { 5126 // Accept MaxFixedVF if we do not have a tail. 5127 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5128 return MaxFactors; 5129 } 5130 } 5131 5132 // If we don't know the precise trip count, or if the trip count that we 5133 // found modulo the vectorization factor is not zero, try to fold the tail 5134 // by masking. 5135 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5136 if (Legal->prepareToFoldTailByMasking()) { 5137 FoldTailByMasking = true; 5138 return MaxFactors; 5139 } 5140 5141 // If there was a tail-folding hint/switch, but we can't fold the tail by 5142 // masking, fallback to a vectorization with a scalar epilogue. 5143 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5144 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5145 "scalar epilogue instead.\n"); 5146 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5147 return MaxFactors; 5148 } 5149 5150 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5151 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5152 return FixedScalableVFPair::getNone(); 5153 } 5154 5155 if (TC == 0) { 5156 reportVectorizationFailure( 5157 "Unable to calculate the loop count due to complex control flow", 5158 "unable to calculate the loop count due to complex control flow", 5159 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5160 return FixedScalableVFPair::getNone(); 5161 } 5162 5163 reportVectorizationFailure( 5164 "Cannot optimize for size and vectorize at the same time.", 5165 "cannot optimize for size and vectorize at the same time. " 5166 "Enable vectorization of this loop with '#pragma clang loop " 5167 "vectorize(enable)' when compiling with -Os/-Oz", 5168 "NoTailLoopWithOptForSize", ORE, TheLoop); 5169 return FixedScalableVFPair::getNone(); 5170 } 5171 5172 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5173 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5174 const ElementCount &MaxSafeVF, bool FoldTailByMasking) { 5175 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5176 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5177 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5178 : TargetTransformInfo::RGK_FixedWidthVector); 5179 5180 // Convenience function to return the minimum of two ElementCounts. 5181 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5182 assert((LHS.isScalable() == RHS.isScalable()) && 5183 "Scalable flags must match"); 5184 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5185 }; 5186 5187 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5188 // Note that both WidestRegister and WidestType may not be a powers of 2. 
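// For example (illustrative numbers only): with 256-bit vector registers and
// a widest loop type of i32, PowerOf2Floor(256 / 32) gives 8 lanes, so the
// starting point is 8 fixed elements (or <vscale x 8> when computing the
// scalable maximum) before clamping against MaxSafeVF below.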
5189 auto MaxVectorElementCount = ElementCount::get( 5190 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5191 ComputeScalableMaxVF); 5192 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5193 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5194 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5195 5196 if (!MaxVectorElementCount) { 5197 LLVM_DEBUG(dbgs() << "LV: The target has no " 5198 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5199 << " vector registers.\n"); 5200 return ElementCount::getFixed(1); 5201 } 5202 5203 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5204 if (ConstTripCount && 5205 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5206 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5207 // If loop trip count (TC) is known at compile time there is no point in 5208 // choosing VF greater than TC (as done in the loop below). Select maximum 5209 // power of two which doesn't exceed TC. 5210 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5211 // when the TC is less than or equal to the known number of lanes. 5212 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5213 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5214 "exceeding the constant trip count: " 5215 << ClampedConstTripCount << "\n"); 5216 return ElementCount::getFixed(ClampedConstTripCount); 5217 } 5218 5219 TargetTransformInfo::RegisterKind RegKind = 5220 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5221 : TargetTransformInfo::RGK_FixedWidthVector; 5222 ElementCount MaxVF = MaxVectorElementCount; 5223 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5224 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5225 auto MaxVectorElementCountMaxBW = ElementCount::get( 5226 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5227 ComputeScalableMaxVF); 5228 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5229 5230 // Collect all viable vectorization factors larger than the default MaxVF 5231 // (i.e. MaxVectorElementCount). 5232 SmallVector<ElementCount, 8> VFs; 5233 for (ElementCount VS = MaxVectorElementCount * 2; 5234 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5235 VFs.push_back(VS); 5236 5237 // For each VF calculate its register usage. 5238 auto RUs = calculateRegisterUsage(VFs); 5239 5240 // Select the largest VF which doesn't require more registers than existing 5241 // ones. 5242 for (int i = RUs.size() - 1; i >= 0; --i) { 5243 bool Selected = true; 5244 for (auto &pair : RUs[i].MaxLocalUsers) { 5245 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5246 if (pair.second > TargetNumRegisters) 5247 Selected = false; 5248 } 5249 if (Selected) { 5250 MaxVF = VFs[i]; 5251 break; 5252 } 5253 } 5254 if (ElementCount MinVF = 5255 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5256 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5257 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5258 << ") with target's minimum: " << MinVF << '\n'); 5259 MaxVF = MinVF; 5260 } 5261 } 5262 5263 // Invalidate any widening decisions we might have made, in case the loop 5264 // requires prediction (decided later), but we have already made some 5265 // load/store widening decisions. 
5266 invalidateCostModelingDecisions(); 5267 } 5268 return MaxVF; 5269 } 5270 5271 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5272 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5273 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5274 auto Min = Attr.getVScaleRangeMin(); 5275 auto Max = Attr.getVScaleRangeMax(); 5276 if (Max && Min == Max) 5277 return Max; 5278 } 5279 5280 return TTI.getVScaleForTuning(); 5281 } 5282 5283 bool LoopVectorizationCostModel::isMoreProfitable( 5284 const VectorizationFactor &A, const VectorizationFactor &B) const { 5285 InstructionCost CostA = A.Cost; 5286 InstructionCost CostB = B.Cost; 5287 5288 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5289 5290 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5291 MaxTripCount) { 5292 // If we are folding the tail and the trip count is a known (possibly small) 5293 // constant, the trip count will be rounded up to an integer number of 5294 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5295 // which we compare directly. When not folding the tail, the total cost will 5296 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5297 // approximated with the per-lane cost below instead of using the tripcount 5298 // as here. 5299 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5300 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5301 return RTCostA < RTCostB; 5302 } 5303 5304 // Improve estimate for the vector width if it is scalable. 5305 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5306 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5307 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5308 if (A.Width.isScalable()) 5309 EstimatedWidthA *= VScale.getValue(); 5310 if (B.Width.isScalable()) 5311 EstimatedWidthB *= VScale.getValue(); 5312 } 5313 5314 // Assume vscale may be larger than 1 (or the value being tuned for), 5315 // so that scalable vectorization is slightly favorable over fixed-width 5316 // vectorization. 5317 if (A.Width.isScalable() && !B.Width.isScalable()) 5318 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5319 5320 // To avoid the need for FP division: 5321 // (CostA / A.Width) < (CostB / B.Width) 5322 // <=> (CostA * B.Width) < (CostB * A.Width) 5323 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5324 } 5325 5326 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5327 const ElementCountSet &VFCandidates) { 5328 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5329 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5330 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5331 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5332 "Expected Scalar VF to be a candidate"); 5333 5334 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5335 VectorizationFactor ChosenFactor = ScalarCost; 5336 5337 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5338 if (ForceVectorization && VFCandidates.size() > 1) { 5339 // Ignore scalar width, because the user explicitly wants vectorization. 5340 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5341 // evaluation. 
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  SmallVector<InstructionVFPair> InvalidCosts;
  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
    VectorizationFactor Candidate(i, C.first);

#ifndef NDEBUG
    unsigned AssumedMinimumVscale = 1;
    if (Optional<unsigned> VScale = getVScaleForTuning())
      AssumedMinimumVscale = VScale.getValue();
    unsigned Width =
        Candidate.Width.isScalable()
            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
            : Candidate.Width.getFixedValue();
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (Candidate.Cost / Width));
    if (i.isScalable())
      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                        << AssumedMinimumVscale << ")");
    LLVM_DEBUG(dbgs() << ".\n");
#endif

    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable, add it to the ProfitableVFs list.
    if (isMoreProfitable(Candidate, ScalarCost))
      ProfitableVFs.push_back(Candidate);

    if (isMoreProfitable(Candidate, ChosenFactor))
      ChosenFactor = Candidate;
  }

  // Emit a report of VFs with invalid costs in the loop.
  if (!InvalidCosts.empty()) {
    // Group the remarks per instruction, keeping the instruction order from
    // InvalidCosts.
    std::map<Instruction *, unsigned> Numbering;
    unsigned I = 0;
    for (auto &Pair : InvalidCosts)
      if (!Numbering.count(Pair.first))
        Numbering[Pair.first] = I++;

    // Sort the list, first on instruction(number) then on VF.
    llvm::sort(InvalidCosts,
               [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
                 if (Numbering[A.first] != Numbering[B.first])
                   return Numbering[A.first] < Numbering[B.first];
                 ElementCountComparator ECC;
                 return ECC(A.second, B.second);
               });

    // For a list of ordered instruction-vf pairs:
    //   [(load, vf1), (load, vf2), (store, vf1)]
    // Group the instructions together to emit separate remarks for:
    //   load  (vf1, vf2)
    //   store (vf1)
    auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
    auto Subset = ArrayRef<InstructionVFPair>();
    do {
      if (Subset.empty())
        Subset = Tail.take_front(1);

      Instruction *I = Subset.front().first;

      // If the next instruction is different, or if there are no other pairs,
      // emit a remark for the collated subset, e.g. turn
      //   [(load, vf1), (load, vf2)]
      // into
      //   remark: invalid costs for 'load' at VF=(vf1, vf2)
      if (Subset == Tail || Tail[Subset.size()].first != I) {
        std::string OutString;
        raw_string_ostream OS(OutString);
        assert(!Subset.empty() && "Unexpected empty range");
        OS << "Instruction with invalid costs prevented vectorization at VF=(";
        for (auto &Pair : Subset)
          OS << (Pair.second == Subset.front().second ? "" : ", ")
             << Pair.second;
        OS << "):";
        if (auto *CI = dyn_cast<CallInst>(I))
          OS << " call to " << CI->getCalledFunction()->getName();
        else
          OS << " " << I->getOpcodeName();
        OS.flush();
        reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
        Tail = Tail.drop_front(Subset.size());
        Subset = {};
      } else
        // Grow the subset by one element.
        Subset = Tail.take_front(Subset.size() + 1);
    } while (!Tail.empty());
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}

bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
    const Loop &L, ElementCount VF) const {
  // Cross-iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(L.getHeader()->phis(),
             [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
    for (User *U : PostInc->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
    // Look for uses of the penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
  }

  // Induction variables that are widened require special handling that is
  // currently not supported.
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // The epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  // FIXME: We should consider changing the threshold for scalable
  // vectors to take VScaleForTuning into account.
5511 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5512 return true; 5513 return false; 5514 } 5515 5516 VectorizationFactor 5517 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5518 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5519 VectorizationFactor Result = VectorizationFactor::Disabled(); 5520 if (!EnableEpilogueVectorization) { 5521 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5522 return Result; 5523 } 5524 5525 if (!isScalarEpilogueAllowed()) { 5526 LLVM_DEBUG( 5527 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5528 "allowed.\n";); 5529 return Result; 5530 } 5531 5532 // Not really a cost consideration, but check for unsupported cases here to 5533 // simplify the logic. 5534 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5535 LLVM_DEBUG( 5536 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5537 "not a supported candidate.\n";); 5538 return Result; 5539 } 5540 5541 if (EpilogueVectorizationForceVF > 1) { 5542 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5543 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5544 if (LVP.hasPlanWithVF(ForcedEC)) 5545 return {ForcedEC, 0}; 5546 else { 5547 LLVM_DEBUG( 5548 dbgs() 5549 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5550 return Result; 5551 } 5552 } 5553 5554 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5555 TheLoop->getHeader()->getParent()->hasMinSize()) { 5556 LLVM_DEBUG( 5557 dbgs() 5558 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5559 return Result; 5560 } 5561 5562 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5563 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5564 "this loop\n"); 5565 return Result; 5566 } 5567 5568 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5569 // the main loop handles 8 lanes per iteration. We could still benefit from 5570 // vectorizing the epilogue loop with VF=4. 5571 ElementCount EstimatedRuntimeVF = MainLoopVF; 5572 if (MainLoopVF.isScalable()) { 5573 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5574 if (Optional<unsigned> VScale = getVScaleForTuning()) 5575 EstimatedRuntimeVF *= VScale.getValue(); 5576 } 5577 5578 for (auto &NextVF : ProfitableVFs) 5579 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5580 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5581 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5582 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5583 LVP.hasPlanWithVF(NextVF.Width)) 5584 Result = NextVF; 5585 5586 if (Result != VectorizationFactor::Disabled()) 5587 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5588 << Result.Width << "\n";); 5589 return Result; 5590 } 5591 5592 std::pair<unsigned, unsigned> 5593 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5594 unsigned MinWidth = -1U; 5595 unsigned MaxWidth = 8; 5596 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5597 // For in-loop reductions, no element types are added to ElementTypesInLoop 5598 // if there are no loads/stores in the loop. In this case, check through the 5599 // reduction variables to determine the maximum width. 
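  // As an illustration (hypothetical loops): a loop that loads i8 values and
  // stores i32 results yields {MinWidth, MaxWidth} = {8, 32} from the loop
  // over ElementTypesInLoop, while a loop whose only relevant operation is an
  // in-loop i16 reduction with no loads or stores (and no narrowing casts on
  // its inputs) falls into the reduction-based path and reports a maximum
  // width of 16.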
5600 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5601 // Reset MaxWidth so that we can find the smallest type used by recurrences 5602 // in the loop. 5603 MaxWidth = -1U; 5604 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5605 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5606 // When finding the min width used by the recurrence we need to account 5607 // for casts on the input operands of the recurrence. 5608 MaxWidth = std::min<unsigned>( 5609 MaxWidth, std::min<unsigned>( 5610 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5611 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5612 } 5613 } else { 5614 for (Type *T : ElementTypesInLoop) { 5615 MinWidth = std::min<unsigned>( 5616 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5617 MaxWidth = std::max<unsigned>( 5618 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5619 } 5620 } 5621 return {MinWidth, MaxWidth}; 5622 } 5623 5624 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5625 ElementTypesInLoop.clear(); 5626 // For each block. 5627 for (BasicBlock *BB : TheLoop->blocks()) { 5628 // For each instruction in the loop. 5629 for (Instruction &I : BB->instructionsWithoutDebug()) { 5630 Type *T = I.getType(); 5631 5632 // Skip ignored values. 5633 if (ValuesToIgnore.count(&I)) 5634 continue; 5635 5636 // Only examine Loads, Stores and PHINodes. 5637 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5638 continue; 5639 5640 // Examine PHI nodes that are reduction variables. Update the type to 5641 // account for the recurrence type. 5642 if (auto *PN = dyn_cast<PHINode>(&I)) { 5643 if (!Legal->isReductionVariable(PN)) 5644 continue; 5645 const RecurrenceDescriptor &RdxDesc = 5646 Legal->getReductionVars().find(PN)->second; 5647 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5648 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5649 RdxDesc.getRecurrenceType(), 5650 TargetTransformInfo::ReductionFlags())) 5651 continue; 5652 T = RdxDesc.getRecurrenceType(); 5653 } 5654 5655 // Examine the stored values. 5656 if (auto *ST = dyn_cast<StoreInst>(&I)) 5657 T = ST->getValueOperand()->getType(); 5658 5659 assert(T->isSized() && 5660 "Expected the load/store/recurrence type to be sized"); 5661 5662 ElementTypesInLoop.insert(T); 5663 } 5664 } 5665 } 5666 5667 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5668 unsigned LoopCost) { 5669 // -- The interleave heuristics -- 5670 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5671 // There are many micro-architectural considerations that we can't predict 5672 // at this level. For example, frontend pressure (on decode or fetch) due to 5673 // code size, or the number and capabilities of the execution ports. 5674 // 5675 // We use the following heuristics to select the interleave count: 5676 // 1. If the code has reductions, then we interleave to break the cross 5677 // iteration dependency. 5678 // 2. If the loop is really small, then we interleave to reduce the loop 5679 // overhead. 5680 // 3. We don't interleave if we think that we will spill registers to memory 5681 // due to the increased register pressure. 5682 5683 if (!isScalarEpilogueAllowed()) 5684 return 1; 5685 5686 // We used the distance for the interleave count. 
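  // That is, if there is a finite maximum safe dependence distance (say, 64
  // bytes), that distance has already been used to bound how many elements
  // may be processed per wide iteration, and interleaving would multiply that
  // number by IC, so interleaving is conservatively disabled here.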
5687 if (Legal->getMaxSafeDepDistBytes() != -1U) 5688 return 1; 5689 5690 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5691 const bool HasReductions = !Legal->getReductionVars().empty(); 5692 // Do not interleave loops with a relatively small known or estimated trip 5693 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5694 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5695 // because with the above conditions interleaving can expose ILP and break 5696 // cross iteration dependences for reductions. 5697 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5698 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5699 return 1; 5700 5701 // If we did not calculate the cost for VF (because the user selected the VF) 5702 // then we calculate the cost of VF here. 5703 if (LoopCost == 0) { 5704 InstructionCost C = expectedCost(VF).first; 5705 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5706 LoopCost = *C.getValue(); 5707 5708 // Loop body is free and there is no need for interleaving. 5709 if (LoopCost == 0) 5710 return 1; 5711 } 5712 5713 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5714 // We divide by these constants so assume that we have at least one 5715 // instruction that uses at least one register. 5716 for (auto& pair : R.MaxLocalUsers) { 5717 pair.second = std::max(pair.second, 1U); 5718 } 5719 5720 // We calculate the interleave count using the following formula. 5721 // Subtract the number of loop invariants from the number of available 5722 // registers. These registers are used by all of the interleaved instances. 5723 // Next, divide the remaining registers by the number of registers that is 5724 // required by the loop, in order to estimate how many parallel instances 5725 // fit without causing spills. All of this is rounded down if necessary to be 5726 // a power of two. We want power of two interleave count to simplify any 5727 // addressing operations or alignment considerations. 5728 // We also want power of two interleave counts to ensure that the induction 5729 // variable of the vector loop wraps to zero, when tail is folded by masking; 5730 // this currently happens when OptForSize, in which case IC is set to 1 above. 5731 unsigned IC = UINT_MAX; 5732 5733 for (auto& pair : R.MaxLocalUsers) { 5734 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5735 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5736 << " registers of " 5737 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5738 if (VF.isScalar()) { 5739 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5740 TargetNumRegisters = ForceTargetNumScalarRegs; 5741 } else { 5742 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5743 TargetNumRegisters = ForceTargetNumVectorRegs; 5744 } 5745 unsigned MaxLocalUsers = pair.second; 5746 unsigned LoopInvariantRegs = 0; 5747 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5748 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5749 5750 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5751 // Don't count the induction variable as interleaved. 5752 if (EnableIndVarRegisterHeur) { 5753 TmpIC = 5754 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5755 std::max(1U, (MaxLocalUsers - 1))); 5756 } 5757 5758 IC = std::min(IC, TmpIC); 5759 } 5760 5761 // Clamp the interleave ranges to reasonable counts. 
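  // For example (illustrative numbers only): if the register-based estimate
  // above produced IC = 8, a target maximum interleave factor of 4 combined
  // with a known trip count of 12 at VF = 4 clamps it to min(12 / 4, 4) = 3.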
5762 unsigned MaxInterleaveCount = 5763 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5764 5765 // Check if the user has overridden the max. 5766 if (VF.isScalar()) { 5767 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5768 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5769 } else { 5770 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5771 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5772 } 5773 5774 // If trip count is known or estimated compile time constant, limit the 5775 // interleave count to be less than the trip count divided by VF, provided it 5776 // is at least 1. 5777 // 5778 // For scalable vectors we can't know if interleaving is beneficial. It may 5779 // not be beneficial for small loops if none of the lanes in the second vector 5780 // iterations is enabled. However, for larger loops, there is likely to be a 5781 // similar benefit as for fixed-width vectors. For now, we choose to leave 5782 // the InterleaveCount as if vscale is '1', although if some information about 5783 // the vector is known (e.g. min vector size), we can make a better decision. 5784 if (BestKnownTC) { 5785 MaxInterleaveCount = 5786 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5787 // Make sure MaxInterleaveCount is greater than 0. 5788 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5789 } 5790 5791 assert(MaxInterleaveCount > 0 && 5792 "Maximum interleave count must be greater than 0"); 5793 5794 // Clamp the calculated IC to be between the 1 and the max interleave count 5795 // that the target and trip count allows. 5796 if (IC > MaxInterleaveCount) 5797 IC = MaxInterleaveCount; 5798 else 5799 // Make sure IC is greater than 0. 5800 IC = std::max(1u, IC); 5801 5802 assert(IC > 0 && "Interleave count must be greater than 0."); 5803 5804 // Interleave if we vectorized this loop and there is a reduction that could 5805 // benefit from interleaving. 5806 if (VF.isVector() && HasReductions) { 5807 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5808 return IC; 5809 } 5810 5811 // For any scalar loop that either requires runtime checks or predication we 5812 // are better off leaving this to the unroller. Note that if we've already 5813 // vectorized the loop we will have done the runtime check and so interleaving 5814 // won't require further checks. 5815 bool ScalarInterleavingRequiresPredication = 5816 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5817 return Legal->blockNeedsPredication(BB); 5818 })); 5819 bool ScalarInterleavingRequiresRuntimePointerCheck = 5820 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5821 5822 // We want to interleave small loops in order to reduce the loop overhead and 5823 // potentially expose ILP opportunities. 5824 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5825 << "LV: IC is " << IC << '\n' 5826 << "LV: VF is " << VF << '\n'); 5827 const bool AggressivelyInterleaveReductions = 5828 TTI.enableAggressiveInterleaving(HasReductions); 5829 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5830 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5831 // We assume that the cost overhead is 1 and we use the cost model 5832 // to estimate the cost of the loop and interleave until the cost of the 5833 // loop overhead is about 5% of the cost of the loop. 
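    // For instance, if SmallLoopCost were 20 and the loop body cost 7, the
    // computation below gives SmallIC = min(IC, PowerOf2Floor(20 / 7))
    //                                 = min(IC, 2).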
5834 unsigned SmallIC = 5835 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5836 5837 // Interleave until store/load ports (estimated by max interleave count) are 5838 // saturated. 5839 unsigned NumStores = Legal->getNumStores(); 5840 unsigned NumLoads = Legal->getNumLoads(); 5841 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5842 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5843 5844 // There is little point in interleaving for reductions containing selects 5845 // and compares when VF=1 since it may just create more overhead than it's 5846 // worth for loops with small trip counts. This is because we still have to 5847 // do the final reduction after the loop. 5848 bool HasSelectCmpReductions = 5849 HasReductions && 5850 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5851 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5852 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5853 RdxDesc.getRecurrenceKind()); 5854 }); 5855 if (HasSelectCmpReductions) { 5856 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5857 return 1; 5858 } 5859 5860 // If we have a scalar reduction (vector reductions are already dealt with 5861 // by this point), we can increase the critical path length if the loop 5862 // we're interleaving is inside another loop. For tree-wise reductions 5863 // set the limit to 2, and for ordered reductions it's best to disable 5864 // interleaving entirely. 5865 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5866 bool HasOrderedReductions = 5867 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5868 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5869 return RdxDesc.isOrdered(); 5870 }); 5871 if (HasOrderedReductions) { 5872 LLVM_DEBUG( 5873 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5874 return 1; 5875 } 5876 5877 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5878 SmallIC = std::min(SmallIC, F); 5879 StoresIC = std::min(StoresIC, F); 5880 LoadsIC = std::min(LoadsIC, F); 5881 } 5882 5883 if (EnableLoadStoreRuntimeInterleave && 5884 std::max(StoresIC, LoadsIC) > SmallIC) { 5885 LLVM_DEBUG( 5886 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5887 return std::max(StoresIC, LoadsIC); 5888 } 5889 5890 // If there are scalar reductions and TTI has enabled aggressive 5891 // interleaving for reductions, we will interleave to expose ILP. 5892 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5893 AggressivelyInterleaveReductions) { 5894 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5895 // Interleave no less than SmallIC but not as aggressive as the normal IC 5896 // to satisfy the rare situation when resources are too limited. 5897 return std::max(IC / 2, SmallIC); 5898 } else { 5899 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5900 return SmallIC; 5901 } 5902 } 5903 5904 // Interleave if this is a large loop (small loops are already dealt with by 5905 // this point) that could benefit from interleaving. 
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because, when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps an index to the instruction assigned to it.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the set of instructions that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
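  // For example, if the last in-loop user of a value %a was pushed as the
  // instruction at index 7, EndPoint[%a] holds 8 (the size of IdxToInstr just
  // after that push), so %a is removed from OpenIntervals only when the scan
  // below reaches index 8, i.e. it still counts as live across its final use.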
5984 for (auto &Interval : EndPoint) 5985 TransposeEnds[Interval.second].push_back(Interval.first); 5986 5987 SmallPtrSet<Instruction *, 8> OpenIntervals; 5988 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5989 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5990 5991 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5992 5993 auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { 5994 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5995 return 0; 5996 return TTI.getRegUsageForType(VectorType::get(Ty, VF)); 5997 }; 5998 5999 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6000 Instruction *I = IdxToInstr[i]; 6001 6002 // Remove all of the instructions that end at this location. 6003 InstrList &List = TransposeEnds[i]; 6004 for (Instruction *ToRemove : List) 6005 OpenIntervals.erase(ToRemove); 6006 6007 // Ignore instructions that are never used within the loop. 6008 if (!Ends.count(I)) 6009 continue; 6010 6011 // Skip ignored values. 6012 if (ValuesToIgnore.count(I)) 6013 continue; 6014 6015 // For each VF find the maximum usage of registers. 6016 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6017 // Count the number of live intervals. 6018 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6019 6020 if (VFs[j].isScalar()) { 6021 for (auto Inst : OpenIntervals) { 6022 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6023 if (RegUsage.find(ClassID) == RegUsage.end()) 6024 RegUsage[ClassID] = 1; 6025 else 6026 RegUsage[ClassID] += 1; 6027 } 6028 } else { 6029 collectUniformsAndScalars(VFs[j]); 6030 for (auto Inst : OpenIntervals) { 6031 // Skip ignored values for VF > 1. 6032 if (VecValuesToIgnore.count(Inst)) 6033 continue; 6034 if (isScalarAfterVectorization(Inst, VFs[j])) { 6035 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6036 if (RegUsage.find(ClassID) == RegUsage.end()) 6037 RegUsage[ClassID] = 1; 6038 else 6039 RegUsage[ClassID] += 1; 6040 } else { 6041 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6042 if (RegUsage.find(ClassID) == RegUsage.end()) 6043 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6044 else 6045 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6046 } 6047 } 6048 } 6049 6050 for (auto& pair : RegUsage) { 6051 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6052 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6053 else 6054 MaxUsages[j][pair.first] = pair.second; 6055 } 6056 } 6057 6058 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6059 << OpenIntervals.size() << '\n'); 6060 6061 // Add the current instruction to the list of open intervals. 6062 OpenIntervals.insert(I); 6063 } 6064 6065 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6066 SmallMapVector<unsigned, unsigned, 4> Invariant; 6067 6068 for (auto Inst : LoopInvariants) { 6069 unsigned Usage = 6070 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6071 unsigned ClassID = 6072 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6073 if (Invariant.find(ClassID) == Invariant.end()) 6074 Invariant[ClassID] = Usage; 6075 else 6076 Invariant[ClassID] += Usage; 6077 } 6078 6079 LLVM_DEBUG({ 6080 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6081 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6082 << " item\n"; 6083 for (const auto &pair : MaxUsages[i]) { 6084 dbgs() << "LV(REG): RegisterClass: " 6085 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6086 << " registers\n"; 6087 } 6088 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6089 << " item\n"; 6090 for (const auto &pair : Invariant) { 6091 dbgs() << "LV(REG): RegisterClass: " 6092 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6093 << " registers\n"; 6094 } 6095 }); 6096 6097 RU.LoopInvariantRegs = Invariant; 6098 RU.MaxLocalUsers = MaxUsages[i]; 6099 RUs[i] = RU; 6100 } 6101 6102 return RUs; 6103 } 6104 6105 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6106 ElementCount VF) { 6107 // TODO: Cost model for emulated masked load/store is completely 6108 // broken. This hack guides the cost model to use an artificially 6109 // high enough value to practically disable vectorization with such 6110 // operations, except where previously deployed legality hack allowed 6111 // using very low cost values. This is to avoid regressions coming simply 6112 // from moving "masked load/store" check from legality to cost model. 6113 // Masked Load/Gather emulation was previously never allowed. 6114 // Limited number of Masked Store/Scatter emulation was allowed. 6115 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction"); 6116 return isa<LoadInst>(I) || 6117 (isa<StoreInst>(I) && 6118 NumPredStores > NumberOfStoresToPredicate); 6119 } 6120 6121 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6122 // If we aren't vectorizing the loop, or if we've already collected the 6123 // instructions to scalarize, there's nothing to do. Collection may already 6124 // have occurred if we have a user-selected VF and are now computing the 6125 // expected cost for interleaving. 6126 if (VF.isScalar() || VF.isZero() || 6127 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6128 return; 6129 6130 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6131 // not profitable to scalarize any instructions, the presence of VF in the 6132 // map will indicate that we've analyzed it already. 6133 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6134 6135 // Find all the instructions that are scalar with predication in the loop and 6136 // determine if it would be better to not if-convert the blocks they are in. 6137 // If so, we also record the instructions to scalarize. 6138 for (BasicBlock *BB : TheLoop->blocks()) { 6139 if (!blockNeedsPredicationForAnyReason(BB)) 6140 continue; 6141 for (Instruction &I : *BB) 6142 if (isScalarWithPredication(&I, VF)) { 6143 ScalarCostsTy ScalarCosts; 6144 // Do not apply discount if scalable, because that would lead to 6145 // invalid scalarization costs. 6146 // Do not apply discount logic if hacked cost is needed 6147 // for emulated masked memrefs. 
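        // Rough illustration (made-up costs): a predicated udiv whose vector
        // cost is 16 but whose scalarized-and-predicated cost works out to 10
        // gets a discount of 6 from computePredInstDiscount(), so it and its
        // single-use operand chain are recorded in ScalarCostsVF.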
6148 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 6149 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6150 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6151 // Remember that BB will remain after vectorization. 6152 PredicatedBBsAfterVectorization.insert(BB); 6153 } 6154 } 6155 } 6156 6157 int LoopVectorizationCostModel::computePredInstDiscount( 6158 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6159 assert(!isUniformAfterVectorization(PredInst, VF) && 6160 "Instruction marked uniform-after-vectorization will be predicated"); 6161 6162 // Initialize the discount to zero, meaning that the scalar version and the 6163 // vector version cost the same. 6164 InstructionCost Discount = 0; 6165 6166 // Holds instructions to analyze. The instructions we visit are mapped in 6167 // ScalarCosts. Those instructions are the ones that would be scalarized if 6168 // we find that the scalar version costs less. 6169 SmallVector<Instruction *, 8> Worklist; 6170 6171 // Returns true if the given instruction can be scalarized. 6172 auto canBeScalarized = [&](Instruction *I) -> bool { 6173 // We only attempt to scalarize instructions forming a single-use chain 6174 // from the original predicated block that would otherwise be vectorized. 6175 // Although not strictly necessary, we give up on instructions we know will 6176 // already be scalar to avoid traversing chains that are unlikely to be 6177 // beneficial. 6178 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6179 isScalarAfterVectorization(I, VF)) 6180 return false; 6181 6182 // If the instruction is scalar with predication, it will be analyzed 6183 // separately. We ignore it within the context of PredInst. 6184 if (isScalarWithPredication(I, VF)) 6185 return false; 6186 6187 // If any of the instruction's operands are uniform after vectorization, 6188 // the instruction cannot be scalarized. This prevents, for example, a 6189 // masked load from being scalarized. 6190 // 6191 // We assume we will only emit a value for lane zero of an instruction 6192 // marked uniform after vectorization, rather than VF identical values. 6193 // Thus, if we scalarize an instruction that uses a uniform, we would 6194 // create uses of values corresponding to the lanes we aren't emitting code 6195 // for. This behavior can be changed by allowing getScalarValue to clone 6196 // the lane zero values for uniforms rather than asserting. 6197 for (Use &U : I->operands()) 6198 if (auto *J = dyn_cast<Instruction>(U.get())) 6199 if (isUniformAfterVectorization(J, VF)) 6200 return false; 6201 6202 // Otherwise, we can scalarize the instruction. 6203 return true; 6204 }; 6205 6206 // Compute the expected cost discount from scalarizing the entire expression 6207 // feeding the predicated instruction. We currently only consider expressions 6208 // that are single-use instruction chains. 6209 Worklist.push_back(PredInst); 6210 while (!Worklist.empty()) { 6211 Instruction *I = Worklist.pop_back_val(); 6212 6213 // If we've already analyzed the instruction, there's nothing to do. 6214 if (ScalarCosts.find(I) != ScalarCosts.end()) 6215 continue; 6216 6217 // Compute the cost of the vector instruction. Note that this cost already 6218 // includes the scalarization overhead of the predicated instruction. 6219 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6220 6221 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6222 // the instruction as if it wasn't if-converted and instead remained in the 6223 // predicated block. We will scale this cost by block probability after 6224 // computing the scalarization overhead. 6225 InstructionCost ScalarCost = 6226 VF.getFixedValue() * 6227 getInstructionCost(I, ElementCount::getFixed(1)).first; 6228 6229 // Compute the scalarization overhead of needed insertelement instructions 6230 // and phi nodes. 6231 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6232 ScalarCost += TTI.getScalarizationOverhead( 6233 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6234 APInt::getAllOnes(VF.getFixedValue()), true, false); 6235 ScalarCost += 6236 VF.getFixedValue() * 6237 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6238 } 6239 6240 // Compute the scalarization overhead of needed extractelement 6241 // instructions. For each of the instruction's operands, if the operand can 6242 // be scalarized, add it to the worklist; otherwise, account for the 6243 // overhead. 6244 for (Use &U : I->operands()) 6245 if (auto *J = dyn_cast<Instruction>(U.get())) { 6246 assert(VectorType::isValidElementType(J->getType()) && 6247 "Instruction has non-scalar type"); 6248 if (canBeScalarized(J)) 6249 Worklist.push_back(J); 6250 else if (needsExtract(J, VF)) { 6251 ScalarCost += TTI.getScalarizationOverhead( 6252 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6253 APInt::getAllOnes(VF.getFixedValue()), false, true); 6254 } 6255 } 6256 6257 // Scale the total scalar cost by block probability. 6258 ScalarCost /= getReciprocalPredBlockProb(); 6259 6260 // Compute the discount. A non-negative discount means the vector version 6261 // of the instruction costs more, and scalarizing would be beneficial. 6262 Discount += VectorCost - ScalarCost; 6263 ScalarCosts[I] = ScalarCost; 6264 } 6265 6266 return *Discount.getValue(); 6267 } 6268 6269 LoopVectorizationCostModel::VectorizationCostTy 6270 LoopVectorizationCostModel::expectedCost( 6271 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6272 VectorizationCostTy Cost; 6273 6274 // For each block. 6275 for (BasicBlock *BB : TheLoop->blocks()) { 6276 VectorizationCostTy BlockCost; 6277 6278 // For each instruction in the old loop. 6279 for (Instruction &I : BB->instructionsWithoutDebug()) { 6280 // Skip ignored values. 6281 if (ValuesToIgnore.count(&I) || 6282 (VF.isVector() && VecValuesToIgnore.count(&I))) 6283 continue; 6284 6285 VectorizationCostTy C = getInstructionCost(&I, VF); 6286 6287 // Check if we should override the cost. 6288 if (C.first.isValid() && 6289 ForceTargetInstructionCost.getNumOccurrences() > 0) 6290 C.first = InstructionCost(ForceTargetInstructionCost); 6291 6292 // Keep a list of instructions with invalid costs. 6293 if (Invalid && !C.first.isValid()) 6294 Invalid->emplace_back(&I, VF); 6295 6296 BlockCost.first += C.first; 6297 BlockCost.second |= C.second; 6298 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6299 << " for VF " << VF << " For instruction: " << I 6300 << '\n'); 6301 } 6302 6303 // If we are vectorizing a predicated block, it will have been 6304 // if-converted. This means that the block's instructions (aside from 6305 // stores and instructions that may divide by zero) will now be 6306 // unconditionally executed. For the scalar case, we may not always execute 6307 // the predicated block, if it is an if-else block. Thus, scale the block's 6308 // cost by the probability of executing it. 
blockNeedsPredication from 6309 // Legal is used so as to not include all blocks in tail folded loops. 6310 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6311 BlockCost.first /= getReciprocalPredBlockProb(); 6312 6313 Cost.first += BlockCost.first; 6314 Cost.second |= BlockCost.second; 6315 } 6316 6317 return Cost; 6318 } 6319 6320 /// Gets Address Access SCEV after verifying that the access pattern 6321 /// is loop invariant except the induction variable dependence. 6322 /// 6323 /// This SCEV can be sent to the Target in order to estimate the address 6324 /// calculation cost. 6325 static const SCEV *getAddressAccessSCEV( 6326 Value *Ptr, 6327 LoopVectorizationLegality *Legal, 6328 PredicatedScalarEvolution &PSE, 6329 const Loop *TheLoop) { 6330 6331 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6332 if (!Gep) 6333 return nullptr; 6334 6335 // We are looking for a gep with all loop invariant indices except for one 6336 // which should be an induction variable. 6337 auto SE = PSE.getSE(); 6338 unsigned NumOperands = Gep->getNumOperands(); 6339 for (unsigned i = 1; i < NumOperands; ++i) { 6340 Value *Opd = Gep->getOperand(i); 6341 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6342 !Legal->isInductionVariable(Opd)) 6343 return nullptr; 6344 } 6345 6346 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6347 return PSE.getSCEV(Ptr); 6348 } 6349 6350 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6351 return Legal->hasStride(I->getOperand(0)) || 6352 Legal->hasStride(I->getOperand(1)); 6353 } 6354 6355 InstructionCost 6356 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6357 ElementCount VF) { 6358 assert(VF.isVector() && 6359 "Scalarization cost of instruction implies vectorization."); 6360 if (VF.isScalable()) 6361 return InstructionCost::getInvalid(); 6362 6363 Type *ValTy = getLoadStoreType(I); 6364 auto SE = PSE.getSE(); 6365 6366 unsigned AS = getLoadStoreAddressSpace(I); 6367 Value *Ptr = getLoadStorePointerOperand(I); 6368 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6369 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6370 // that it is being called from this specific place. 6371 6372 // Figure out whether the access is strided and get the stride value 6373 // if it's known in compile time 6374 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6375 6376 // Get the cost of the scalar memory instruction and address computation. 6377 InstructionCost Cost = 6378 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6379 6380 // Don't pass *I here, since it is scalar but will actually be part of a 6381 // vectorized loop where the user of it is a vectorized instruction. 6382 const Align Alignment = getLoadStoreAlignment(I); 6383 Cost += VF.getKnownMinValue() * 6384 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6385 AS, TTI::TCK_RecipThroughput); 6386 6387 // Get the overhead of the extractelement and insertelement instructions 6388 // we might create due to scalarization. 6389 Cost += getScalarizationOverhead(I, VF); 6390 6391 // If we have a predicated load/store, it will need extra i1 extracts and 6392 // conditional branches, but may not be executed for each vector lane. Scale 6393 // the cost by the probability of executing the predicated block. 
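  // For example, assuming getReciprocalPredBlockProb() returns 2 (i.e. a 50%
  // chance of executing the block), a scalarization cost of 40 becomes 20
  // before the i1 extract and branch overhead below is added back on top.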
6394 if (isPredicatedInst(I, VF)) { 6395 Cost /= getReciprocalPredBlockProb(); 6396 6397 // Add the cost of an i1 extract and a branch 6398 auto *Vec_i1Ty = 6399 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6400 Cost += TTI.getScalarizationOverhead( 6401 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6402 /*Insert=*/false, /*Extract=*/true); 6403 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6404 6405 if (useEmulatedMaskMemRefHack(I, VF)) 6406 // Artificially setting to a high enough value to practically disable 6407 // vectorization with such operations. 6408 Cost = 3000000; 6409 } 6410 6411 return Cost; 6412 } 6413 6414 InstructionCost 6415 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6416 ElementCount VF) { 6417 Type *ValTy = getLoadStoreType(I); 6418 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6419 Value *Ptr = getLoadStorePointerOperand(I); 6420 unsigned AS = getLoadStoreAddressSpace(I); 6421 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6422 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6423 6424 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6425 "Stride should be 1 or -1 for consecutive memory access"); 6426 const Align Alignment = getLoadStoreAlignment(I); 6427 InstructionCost Cost = 0; 6428 if (Legal->isMaskRequired(I)) 6429 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6430 CostKind); 6431 else 6432 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6433 CostKind, I); 6434 6435 bool Reverse = ConsecutiveStride < 0; 6436 if (Reverse) 6437 Cost += 6438 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6439 return Cost; 6440 } 6441 6442 InstructionCost 6443 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6444 ElementCount VF) { 6445 assert(Legal->isUniformMemOp(*I)); 6446 6447 Type *ValTy = getLoadStoreType(I); 6448 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6449 const Align Alignment = getLoadStoreAlignment(I); 6450 unsigned AS = getLoadStoreAddressSpace(I); 6451 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6452 if (isa<LoadInst>(I)) { 6453 return TTI.getAddressComputationCost(ValTy) + 6454 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6455 CostKind) + 6456 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6457 } 6458 StoreInst *SI = cast<StoreInst>(I); 6459 6460 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6461 return TTI.getAddressComputationCost(ValTy) + 6462 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6463 CostKind) + 6464 (isLoopInvariantStoreValue 6465 ? 
              0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                       VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}

Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for one of the following patterns, and for the minimal
  // acceptable cost among them:
  //   reduce(mul(ext(A), ext(B))) or
  //   reduce(mul(A, B)) or
  //   reduce(ext(A)) or
  //   reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we
  // find the pattern of mul/ext and test the cost of the entire pattern vs
  // the cost of the components. If the reduction cost is lower, then we
  // return it for the reduction instruction and 0 for the other instructions
  // in the pattern. If it is not, we return None, specifying that the
  // original cost model should be used.
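  // A sketch of the kind of chain this can match (illustrative scalar IR):
  //   %ea  = sext i8 %a to i32
  //   %eb  = sext i8 %b to i32
  //   %mul = mul i32 %ea, %eb
  //   %add = add i32 %sum.phi, %mul
  // Costing %ea walks down through %mul to %add; if %add is part of an
  // in-loop reduction chain, the whole reduce(mul(ext, ext)) pattern may be
  // costed as one extended multiply-add reduction, in which case %add carries
  // that cost and the mul/ext instructions report a cost of 0.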
6547 Instruction *RetI = I; 6548 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6549 if (!RetI->hasOneUser()) 6550 return None; 6551 RetI = RetI->user_back(); 6552 } 6553 if (match(RetI, m_Mul(m_Value(), m_Value())) && 6554 RetI->user_back()->getOpcode() == Instruction::Add) { 6555 if (!RetI->hasOneUser()) 6556 return None; 6557 RetI = RetI->user_back(); 6558 } 6559 6560 // Test if the found instruction is a reduction, and if not return an invalid 6561 // cost specifying the parent to use the original cost modelling. 6562 if (!InLoopReductionImmediateChains.count(RetI)) 6563 return None; 6564 6565 // Find the reduction this chain is a part of and calculate the basic cost of 6566 // the reduction on its own. 6567 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6568 Instruction *ReductionPhi = LastChain; 6569 while (!isa<PHINode>(ReductionPhi)) 6570 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6571 6572 const RecurrenceDescriptor &RdxDesc = 6573 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6574 6575 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6576 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6577 6578 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6579 // normal fmul instruction to the cost of the fadd reduction. 6580 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6581 BaseCost += 6582 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6583 6584 // If we're using ordered reductions then we can just return the base cost 6585 // here, since getArithmeticReductionCost calculates the full ordered 6586 // reduction cost when FP reassociation is not allowed. 6587 if (useOrderedReductions(RdxDesc)) 6588 return BaseCost; 6589 6590 // Get the operand that was not the reduction chain and match it to one of the 6591 // patterns, returning the better cost if it is found. 6592 Instruction *RedOp = RetI->getOperand(1) == LastChain 6593 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6594 : dyn_cast<Instruction>(RetI->getOperand(1)); 6595 6596 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6597 6598 Instruction *Op0, *Op1; 6599 if (RedOp && 6600 match(RedOp, 6601 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6602 match(Op0, m_ZExtOrSExt(m_Value())) && 6603 Op0->getOpcode() == Op1->getOpcode() && 6604 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6605 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6606 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6607 6608 // Matched reduce(ext(mul(ext(A), ext(B))) 6609 // Note that the extend opcodes need to all match, or if A==B they will have 6610 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6611 // which is equally fine. 
6612 bool IsUnsigned = isa<ZExtInst>(Op0); 6613 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6614 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6615 6616 InstructionCost ExtCost = 6617 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6618 TTI::CastContextHint::None, CostKind, Op0); 6619 InstructionCost MulCost = 6620 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6621 InstructionCost Ext2Cost = 6622 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6623 TTI::CastContextHint::None, CostKind, RedOp); 6624 6625 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6626 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6627 CostKind); 6628 6629 if (RedCost.isValid() && 6630 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6631 return I == RetI ? RedCost : 0; 6632 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6633 !TheLoop->isLoopInvariant(RedOp)) { 6634 // Matched reduce(ext(A)) 6635 bool IsUnsigned = isa<ZExtInst>(RedOp); 6636 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6637 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6638 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6639 CostKind); 6640 6641 InstructionCost ExtCost = 6642 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6643 TTI::CastContextHint::None, CostKind, RedOp); 6644 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6645 return I == RetI ? RedCost : 0; 6646 } else if (RedOp && 6647 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6648 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6649 Op0->getOpcode() == Op1->getOpcode() && 6650 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6651 bool IsUnsigned = isa<ZExtInst>(Op0); 6652 Type *Op0Ty = Op0->getOperand(0)->getType(); 6653 Type *Op1Ty = Op1->getOperand(0)->getType(); 6654 Type *LargestOpTy = 6655 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6656 : Op0Ty; 6657 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6658 6659 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6660 // different sizes. We take the largest type as the ext to reduce, and add 6661 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6662 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6663 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6664 TTI::CastContextHint::None, CostKind, Op0); 6665 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6666 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6667 TTI::CastContextHint::None, CostKind, Op1); 6668 InstructionCost MulCost = 6669 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6670 6671 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6672 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6673 CostKind); 6674 InstructionCost ExtraExtCost = 0; 6675 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6676 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6677 ExtraExtCost = TTI.getCastInstrCost( 6678 ExtraExtOp->getOpcode(), ExtType, 6679 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6680 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6681 } 6682 6683 if (RedCost.isValid() && 6684 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6685 return I == RetI ? 
RedCost : 0; 6686 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6687 // Matched reduce(mul()) 6688 InstructionCost MulCost = 6689 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6690 6691 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6692 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6693 CostKind); 6694 6695 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6696 return I == RetI ? RedCost : 0; 6697 } 6698 } 6699 6700 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6701 } 6702 6703 InstructionCost 6704 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6705 ElementCount VF) { 6706 // Calculate scalar cost only. Vectorization cost should be ready at this 6707 // moment. 6708 if (VF.isScalar()) { 6709 Type *ValTy = getLoadStoreType(I); 6710 const Align Alignment = getLoadStoreAlignment(I); 6711 unsigned AS = getLoadStoreAddressSpace(I); 6712 6713 return TTI.getAddressComputationCost(ValTy) + 6714 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6715 TTI::TCK_RecipThroughput, I); 6716 } 6717 return getWideningCost(I, VF); 6718 } 6719 6720 LoopVectorizationCostModel::VectorizationCostTy 6721 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6722 ElementCount VF) { 6723 // If we know that this instruction will remain uniform, check the cost of 6724 // the scalar version. 6725 if (isUniformAfterVectorization(I, VF)) 6726 VF = ElementCount::getFixed(1); 6727 6728 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6729 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6730 6731 // Forced scalars do not have any scalarization overhead. 6732 auto ForcedScalar = ForcedScalars.find(VF); 6733 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6734 auto InstSet = ForcedScalar->second; 6735 if (InstSet.count(I)) 6736 return VectorizationCostTy( 6737 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6738 VF.getKnownMinValue()), 6739 false); 6740 } 6741 6742 Type *VectorTy; 6743 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6744 6745 bool TypeNotScalarized = false; 6746 if (VF.isVector() && VectorTy->isVectorTy()) { 6747 unsigned NumParts = TTI.getNumberOfParts(VectorTy); 6748 if (NumParts) 6749 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6750 else 6751 C = InstructionCost::getInvalid(); 6752 } 6753 return VectorizationCostTy(C, TypeNotScalarized); 6754 } 6755 6756 InstructionCost 6757 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6758 ElementCount VF) const { 6759 6760 // There is no mechanism yet to create a scalable scalarization loop, 6761 // so this is currently Invalid. 6762 if (VF.isScalable()) 6763 return InstructionCost::getInvalid(); 6764 6765 if (VF.isScalar()) 6766 return 0; 6767 6768 InstructionCost Cost = 0; 6769 Type *RetTy = ToVectorTy(I->getType(), VF); 6770 if (!RetTy->isVoidTy() && 6771 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6772 Cost += TTI.getScalarizationOverhead( 6773 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6774 false); 6775 6776 // Some targets keep addresses scalar. 6777 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6778 return Cost; 6779 6780 // Some targets support efficient element stores. 6781 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6782 return Cost; 6783 6784 // Collect operands to consider. 6785 CallInst *CI = dyn_cast<CallInst>(I); 6786 Instruction::op_range Ops = CI ? 
CI->args() : I->operands(); 6787 6788 // Skip operands that do not require extraction/scalarization and do not incur 6789 // any overhead. 6790 SmallVector<Type *> Tys; 6791 for (auto *V : filterExtractingOperands(Ops, VF)) 6792 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6793 return Cost + TTI.getOperandsScalarizationOverhead( 6794 filterExtractingOperands(Ops, VF), Tys); 6795 } 6796 6797 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6798 if (VF.isScalar()) 6799 return; 6800 NumPredStores = 0; 6801 for (BasicBlock *BB : TheLoop->blocks()) { 6802 // For each instruction in the old loop. 6803 for (Instruction &I : *BB) { 6804 Value *Ptr = getLoadStorePointerOperand(&I); 6805 if (!Ptr) 6806 continue; 6807 6808 // TODO: We should generate better code and update the cost model for 6809 // predicated uniform stores. Today they are treated as any other 6810 // predicated store (see added test cases in 6811 // invariant-store-vectorization.ll). 6812 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6813 NumPredStores++; 6814 6815 if (Legal->isUniformMemOp(I)) { 6816 // TODO: Avoid replicating loads and stores instead of 6817 // relying on instcombine to remove them. 6818 // Load: Scalar load + broadcast 6819 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6820 InstructionCost Cost; 6821 if (isa<StoreInst>(&I) && VF.isScalable() && 6822 isLegalGatherOrScatter(&I, VF)) { 6823 Cost = getGatherScatterCost(&I, VF); 6824 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6825 } else { 6826 assert((isa<LoadInst>(&I) || !VF.isScalable()) && 6827 "Cannot yet scalarize uniform stores"); 6828 Cost = getUniformMemOpCost(&I, VF); 6829 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6830 } 6831 continue; 6832 } 6833 6834 // We assume that widening is the best solution when possible. 6835 if (memoryInstructionCanBeWidened(&I, VF)) { 6836 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6837 int ConsecutiveStride = Legal->isConsecutivePtr( 6838 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6839 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6840 "Expected consecutive stride."); 6841 InstWidening Decision = 6842 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6843 setWideningDecision(&I, VF, Decision, Cost); 6844 continue; 6845 } 6846 6847 // Choose between Interleaving, Gather/Scatter or Scalarization. 6848 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6849 unsigned NumAccesses = 1; 6850 if (isAccessInterleaved(&I)) { 6851 auto Group = getInterleavedAccessGroup(&I); 6852 assert(Group && "Fail to get an interleaved access group."); 6853 6854 // Make one decision for the whole group. 6855 if (getWideningDecision(&I, VF) != CM_Unknown) 6856 continue; 6857 6858 NumAccesses = Group->getNumMembers(); 6859 if (interleavedAccessCanBeWidened(&I, VF)) 6860 InterleaveCost = getInterleaveGroupCost(&I, VF); 6861 } 6862 6863 InstructionCost GatherScatterCost = 6864 isLegalGatherOrScatter(&I, VF) 6865 ? getGatherScatterCost(&I, VF) * NumAccesses 6866 : InstructionCost::getInvalid(); 6867 6868 InstructionCost ScalarizationCost = 6869 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6870 6871 // Choose better solution for the current VF, 6872 // write down this decision and use it during vectorization. 
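      // For example (made-up costs): InterleaveCost = 8, GatherScatterCost = 8
      // and ScalarizationCost = 12 selects CM_Interleave below, since a tie
      // with gather/scatter favours interleaving (<=) while only a strictly
      // lower cost beats scalarization (<).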
6873 InstructionCost Cost; 6874 InstWidening Decision; 6875 if (InterleaveCost <= GatherScatterCost && 6876 InterleaveCost < ScalarizationCost) { 6877 Decision = CM_Interleave; 6878 Cost = InterleaveCost; 6879 } else if (GatherScatterCost < ScalarizationCost) { 6880 Decision = CM_GatherScatter; 6881 Cost = GatherScatterCost; 6882 } else { 6883 Decision = CM_Scalarize; 6884 Cost = ScalarizationCost; 6885 } 6886 // If the instruction belongs to an interleave group, the whole group 6887 // receives the same decision. The whole group receives the cost, but 6888 // the cost will actually be assigned to one instruction. 6889 if (auto Group = getInterleavedAccessGroup(&I)) 6890 setWideningDecision(Group, VF, Decision, Cost); 6891 else 6892 setWideningDecision(&I, VF, Decision, Cost); 6893 } 6894 } 6895 6896 // Make sure that any load of address and any other address computation 6897 // remains scalar unless there is gather/scatter support. This avoids 6898 // inevitable extracts into address registers, and also has the benefit of 6899 // activating LSR more, since that pass can't optimize vectorized 6900 // addresses. 6901 if (TTI.prefersVectorizedAddressing()) 6902 return; 6903 6904 // Start with all scalar pointer uses. 6905 SmallPtrSet<Instruction *, 8> AddrDefs; 6906 for (BasicBlock *BB : TheLoop->blocks()) 6907 for (Instruction &I : *BB) { 6908 Instruction *PtrDef = 6909 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6910 if (PtrDef && TheLoop->contains(PtrDef) && 6911 getWideningDecision(&I, VF) != CM_GatherScatter) 6912 AddrDefs.insert(PtrDef); 6913 } 6914 6915 // Add all instructions used to generate the addresses. 6916 SmallVector<Instruction *, 4> Worklist; 6917 append_range(Worklist, AddrDefs); 6918 while (!Worklist.empty()) { 6919 Instruction *I = Worklist.pop_back_val(); 6920 for (auto &Op : I->operands()) 6921 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6922 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6923 AddrDefs.insert(InstOp).second) 6924 Worklist.push_back(InstOp); 6925 } 6926 6927 for (auto *I : AddrDefs) { 6928 if (isa<LoadInst>(I)) { 6929 // Setting the desired widening decision should ideally be handled 6930 // by cost functions, but since this involves the task of finding out 6931 // if the loaded register is involved in an address computation, it is 6932 // instead changed here when we know this is the case. 6933 InstWidening Decision = getWideningDecision(I, VF); 6934 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6935 // Scalarize a widened load of address. 6936 setWideningDecision( 6937 I, VF, CM_Scalarize, 6938 (VF.getKnownMinValue() * 6939 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6940 else if (auto Group = getInterleavedAccessGroup(I)) { 6941 // Scalarize an interleave group of address loads. 6942 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6943 if (Instruction *Member = Group->getMember(I)) 6944 setWideningDecision( 6945 Member, VF, CM_Scalarize, 6946 (VF.getKnownMinValue() * 6947 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6948 } 6949 } 6950 } else 6951 // Make sure I gets scalarized and a cost estimate without 6952 // scalarization overhead.
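// (Entries added to ForcedScalars are costed later as the scalar cost
// multiplied by VF, with no insert/extract overhead added; see the
// ForcedScalars lookup near the top of getInstructionCost.)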
6953 ForcedScalars[VF].insert(I); 6954 } 6955 } 6956 6957 InstructionCost 6958 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 6959 Type *&VectorTy) { 6960 Type *RetTy = I->getType(); 6961 if (canTruncateToMinimalBitwidth(I, VF)) 6962 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6963 auto SE = PSE.getSE(); 6964 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6965 6966 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 6967 ElementCount VF) -> bool { 6968 if (VF.isScalar()) 6969 return true; 6970 6971 auto Scalarized = InstsToScalarize.find(VF); 6972 assert(Scalarized != InstsToScalarize.end() && 6973 "VF not yet analyzed for scalarization profitability"); 6974 return !Scalarized->second.count(I) && 6975 llvm::all_of(I->users(), [&](User *U) { 6976 auto *UI = cast<Instruction>(U); 6977 return !Scalarized->second.count(UI); 6978 }); 6979 }; 6980 (void) hasSingleCopyAfterVectorization; 6981 6982 if (isScalarAfterVectorization(I, VF)) { 6983 // With the exception of GEPs and PHIs, after scalarization there should 6984 // only be one copy of the instruction generated in the loop. This is 6985 // because the VF is either 1, or any instructions that need scalarizing 6986 // have already been dealt with by the the time we get here. As a result, 6987 // it means we don't have to multiply the instruction cost by VF. 6988 assert(I->getOpcode() == Instruction::GetElementPtr || 6989 I->getOpcode() == Instruction::PHI || 6990 (I->getOpcode() == Instruction::BitCast && 6991 I->getType()->isPointerTy()) || 6992 hasSingleCopyAfterVectorization(I, VF)); 6993 VectorTy = RetTy; 6994 } else 6995 VectorTy = ToVectorTy(RetTy, VF); 6996 6997 // TODO: We need to estimate the cost of intrinsic calls. 6998 switch (I->getOpcode()) { 6999 case Instruction::GetElementPtr: 7000 // We mark this instruction as zero-cost because the cost of GEPs in 7001 // vectorized code depends on whether the corresponding memory instruction 7002 // is scalarized or not. Therefore, we handle GEPs with the memory 7003 // instruction cost. 7004 return 0; 7005 case Instruction::Br: { 7006 // In cases of scalarized and predicated instructions, there will be VF 7007 // predicated blocks in the vectorized loop. Each branch around these 7008 // blocks requires also an extract of its vector compare i1 element. 7009 bool ScalarPredicatedBB = false; 7010 BranchInst *BI = cast<BranchInst>(I); 7011 if (VF.isVector() && BI->isConditional() && 7012 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7013 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7014 ScalarPredicatedBB = true; 7015 7016 if (ScalarPredicatedBB) { 7017 // Not possible to scalarize scalable vector with predicated instructions. 7018 if (VF.isScalable()) 7019 return InstructionCost::getInvalid(); 7020 // Return cost for branches around scalarized and predicated blocks. 7021 auto *Vec_i1Ty = 7022 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7023 return ( 7024 TTI.getScalarizationOverhead( 7025 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) + 7026 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 7027 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7028 // The back-edge branch will remain, as will all scalar branches. 7029 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7030 else 7031 // This branch will be eliminated by if-conversion. 
7032 return 0; 7033 // Note: We currently assume zero cost for an unconditional branch inside 7034 // a predicated block since it will become a fall-through, although we 7035 // may decide in the future to call TTI for all branches. 7036 } 7037 case Instruction::PHI: { 7038 auto *Phi = cast<PHINode>(I); 7039 7040 // First-order recurrences are replaced by vector shuffles inside the loop. 7041 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7042 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7043 return TTI.getShuffleCost( 7044 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7045 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7046 7047 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7048 // converted into select instructions. We require N - 1 selects per phi 7049 // node, where N is the number of incoming values. 7050 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7051 return (Phi->getNumIncomingValues() - 1) * 7052 TTI.getCmpSelInstrCost( 7053 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7054 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7055 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7056 7057 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7058 } 7059 case Instruction::UDiv: 7060 case Instruction::SDiv: 7061 case Instruction::URem: 7062 case Instruction::SRem: 7063 // If we have a predicated instruction, it may not be executed for each 7064 // vector lane. Get the scalarization cost and scale this amount by the 7065 // probability of executing the predicated block. If the instruction is not 7066 // predicated, we fall through to the next case. 7067 if (VF.isVector() && isScalarWithPredication(I, VF)) { 7068 InstructionCost Cost = 0; 7069 7070 // These instructions have a non-void type, so account for the phi nodes 7071 // that we will create. This cost is likely to be zero. The phi node 7072 // cost, if any, should be scaled by the block probability because it 7073 // models a copy at the end of each predicated block. 7074 Cost += VF.getKnownMinValue() * 7075 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7076 7077 // The cost of the non-predicated instruction. 7078 Cost += VF.getKnownMinValue() * 7079 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7080 7081 // The cost of insertelement and extractelement instructions needed for 7082 // scalarization. 7083 Cost += getScalarizationOverhead(I, VF); 7084 7085 // Scale the cost by the probability of executing the predicated blocks. 7086 // This assumes the predicated block for each vector lane is equally 7087 // likely. 7088 return Cost / getReciprocalPredBlockProb(); 7089 } 7090 LLVM_FALLTHROUGH; 7091 case Instruction::Add: 7092 case Instruction::FAdd: 7093 case Instruction::Sub: 7094 case Instruction::FSub: 7095 case Instruction::Mul: 7096 case Instruction::FMul: 7097 case Instruction::FDiv: 7098 case Instruction::FRem: 7099 case Instruction::Shl: 7100 case Instruction::LShr: 7101 case Instruction::AShr: 7102 case Instruction::And: 7103 case Instruction::Or: 7104 case Instruction::Xor: { 7105 // Since we will replace the stride by 1 the multiplication should go away. 
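// (E.g. an address computed as "Base + Idx * Stride", where Stride is a
// symbolic stride that the runtime stride checks specialize to 1, leaves
// behind a multiply by one that is expected to fold away, so it is treated
// as free here.)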
7106 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7107 return 0; 7108 7109 // Detect reduction patterns 7110 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7111 return *RedCost; 7112 7113 // Certain instructions can be cheaper to vectorize if they have a constant 7114 // second vector operand. One example of this is shifts on x86. 7115 Value *Op2 = I->getOperand(1); 7116 TargetTransformInfo::OperandValueProperties Op2VP; 7117 TargetTransformInfo::OperandValueKind Op2VK = 7118 TTI.getOperandInfo(Op2, Op2VP); 7119 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7120 Op2VK = TargetTransformInfo::OK_UniformValue; 7121 7122 SmallVector<const Value *, 4> Operands(I->operand_values()); 7123 return TTI.getArithmeticInstrCost( 7124 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7125 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7126 } 7127 case Instruction::FNeg: { 7128 return TTI.getArithmeticInstrCost( 7129 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7130 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7131 TargetTransformInfo::OP_None, I->getOperand(0), I); 7132 } 7133 case Instruction::Select: { 7134 SelectInst *SI = cast<SelectInst>(I); 7135 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7136 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7137 7138 const Value *Op0, *Op1; 7139 using namespace llvm::PatternMatch; 7140 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7141 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7142 // select x, y, false --> x & y 7143 // select x, true, y --> x | y 7144 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7145 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7146 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7147 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7148 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7149 Op1->getType()->getScalarSizeInBits() == 1); 7150 7151 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7152 return TTI.getArithmeticInstrCost( 7153 match(I, m_LogicalOr()) ?
Instruction::Or : Instruction::And, VectorTy, 7154 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7155 } 7156 7157 Type *CondTy = SI->getCondition()->getType(); 7158 if (!ScalarCond) 7159 CondTy = VectorType::get(CondTy, VF); 7160 7161 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7162 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7163 Pred = Cmp->getPredicate(); 7164 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7165 CostKind, I); 7166 } 7167 case Instruction::ICmp: 7168 case Instruction::FCmp: { 7169 Type *ValTy = I->getOperand(0)->getType(); 7170 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7171 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7172 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7173 VectorTy = ToVectorTy(ValTy, VF); 7174 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7175 cast<CmpInst>(I)->getPredicate(), CostKind, 7176 I); 7177 } 7178 case Instruction::Store: 7179 case Instruction::Load: { 7180 ElementCount Width = VF; 7181 if (Width.isVector()) { 7182 InstWidening Decision = getWideningDecision(I, Width); 7183 assert(Decision != CM_Unknown && 7184 "CM decision should be taken at this point"); 7185 if (Decision == CM_Scalarize) 7186 Width = ElementCount::getFixed(1); 7187 } 7188 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7189 return getMemoryInstructionCost(I, VF); 7190 } 7191 case Instruction::BitCast: 7192 if (I->getType()->isPointerTy()) 7193 return 0; 7194 LLVM_FALLTHROUGH; 7195 case Instruction::ZExt: 7196 case Instruction::SExt: 7197 case Instruction::FPToUI: 7198 case Instruction::FPToSI: 7199 case Instruction::FPExt: 7200 case Instruction::PtrToInt: 7201 case Instruction::IntToPtr: 7202 case Instruction::SIToFP: 7203 case Instruction::UIToFP: 7204 case Instruction::Trunc: 7205 case Instruction::FPTrunc: { 7206 // Computes the CastContextHint from a Load/Store instruction. 7207 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7208 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7209 "Expected a load or a store!"); 7210 7211 if (VF.isScalar() || !TheLoop->contains(I)) 7212 return TTI::CastContextHint::Normal; 7213 7214 switch (getWideningDecision(I, VF)) { 7215 case LoopVectorizationCostModel::CM_GatherScatter: 7216 return TTI::CastContextHint::GatherScatter; 7217 case LoopVectorizationCostModel::CM_Interleave: 7218 return TTI::CastContextHint::Interleave; 7219 case LoopVectorizationCostModel::CM_Scalarize: 7220 case LoopVectorizationCostModel::CM_Widen: 7221 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7222 : TTI::CastContextHint::Normal; 7223 case LoopVectorizationCostModel::CM_Widen_Reverse: 7224 return TTI::CastContextHint::Reversed; 7225 case LoopVectorizationCostModel::CM_Unknown: 7226 llvm_unreachable("Instr did not go through cost modelling?"); 7227 } 7228 7229 llvm_unreachable("Unhandled case!"); 7230 }; 7231 7232 unsigned Opcode = I->getOpcode(); 7233 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7234 // For Trunc, the context is the only user, which must be a StoreInst. 7235 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7236 if (I->hasOneUse()) 7237 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7238 CCH = ComputeCCH(Store); 7239 } 7240 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7241 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7242 Opcode == Instruction::FPExt) { 7243 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7244 CCH = ComputeCCH(Load); 7245 } 7246 7247 // We optimize the truncation of induction variables having constant 7248 // integer steps. The cost of these truncations is the same as the scalar 7249 // operation. 7250 if (isOptimizableIVTruncate(I, VF)) { 7251 auto *Trunc = cast<TruncInst>(I); 7252 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7253 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7254 } 7255 7256 // Detect reduction patterns 7257 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7258 return *RedCost; 7259 7260 Type *SrcScalarTy = I->getOperand(0)->getType(); 7261 Type *SrcVecTy = 7262 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7263 if (canTruncateToMinimalBitwidth(I, VF)) { 7264 // This cast is going to be shrunk. This may remove the cast or it might 7265 // turn it into slightly different cast. For example, if MinBW == 16, 7266 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7267 // 7268 // Calculate the modified src and dest types. 7269 Type *MinVecTy = VectorTy; 7270 if (Opcode == Instruction::Trunc) { 7271 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7272 VectorTy = 7273 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7274 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7275 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7276 VectorTy = 7277 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7278 } 7279 } 7280 7281 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7282 } 7283 case Instruction::Call: { 7284 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7285 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7286 return *RedCost; 7287 bool NeedToScalarize; 7288 CallInst *CI = cast<CallInst>(I); 7289 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7290 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7291 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7292 return std::min(CallCost, IntrinsicCost); 7293 } 7294 return CallCost; 7295 } 7296 case Instruction::ExtractValue: 7297 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7298 case Instruction::Alloca: 7299 // We cannot easily widen alloca to a scalable alloca, as 7300 // the result would need to be a vector of pointers. 7301 if (VF.isScalable()) 7302 return InstructionCost::getInvalid(); 7303 LLVM_FALLTHROUGH; 7304 default: 7305 // This opcode is unknown. Assume that it is the same as 'mul'. 7306 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7307 } // end of switch. 
7308 } 7309 7310 char LoopVectorize::ID = 0; 7311 7312 static const char lv_name[] = "Loop Vectorization"; 7313 7314 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7315 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7316 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7317 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7318 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7319 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7320 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7321 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7322 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7323 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7324 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7325 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7326 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7327 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7328 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7329 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7330 7331 namespace llvm { 7332 7333 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7334 7335 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7336 bool VectorizeOnlyWhenForced) { 7337 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7338 } 7339 7340 } // end namespace llvm 7341 7342 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7343 // Check if the pointer operand of a load or store instruction is 7344 // consecutive. 7345 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7346 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7347 return false; 7348 } 7349 7350 void LoopVectorizationCostModel::collectValuesToIgnore() { 7351 // Ignore ephemeral values. 7352 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7353 7354 // Find all stores to invariant variables. Since they are going to sink 7355 // outside the loop we do not need calculate cost for them. 7356 for (BasicBlock *BB : TheLoop->blocks()) 7357 for (Instruction &I : *BB) { 7358 StoreInst *SI; 7359 if ((SI = dyn_cast<StoreInst>(&I)) && 7360 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7361 ValuesToIgnore.insert(&I); 7362 } 7363 7364 // Ignore type-promoting instructions we identified during reduction 7365 // detection. 7366 for (auto &Reduction : Legal->getReductionVars()) { 7367 const RecurrenceDescriptor &RedDes = Reduction.second; 7368 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7369 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7370 } 7371 // Ignore type-casting instructions we identified during induction 7372 // detection. 7373 for (auto &Induction : Legal->getInductionVars()) { 7374 const InductionDescriptor &IndDes = Induction.second; 7375 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7376 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7377 } 7378 } 7379 7380 void LoopVectorizationCostModel::collectInLoopReductions() { 7381 for (auto &Reduction : Legal->getReductionVars()) { 7382 PHINode *Phi = Reduction.first; 7383 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7384 7385 // We don't collect reductions that are type promoted (yet). 7386 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7387 continue; 7388 7389 // If the target would prefer this reduction to happen "in-loop", then we 7390 // want to record it as such. 
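// ("In-loop" means the partial values are combined with a reduction
// operation inside the loop body on every iteration, instead of keeping a
// wide accumulator phi and reducing it once in the middle block after the
// loop.)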
7391 unsigned Opcode = RdxDesc.getOpcode(); 7392 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7393 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7394 TargetTransformInfo::ReductionFlags())) 7395 continue; 7396 7397 // Check that we can correctly put the reductions into the loop, by 7398 // finding the chain of operations that leads from the phi to the loop 7399 // exit value. 7400 SmallVector<Instruction *, 4> ReductionOperations = 7401 RdxDesc.getReductionOpChain(Phi, TheLoop); 7402 bool InLoop = !ReductionOperations.empty(); 7403 if (InLoop) { 7404 InLoopReductionChains[Phi] = ReductionOperations; 7405 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7406 Instruction *LastChain = Phi; 7407 for (auto *I : ReductionOperations) { 7408 InLoopReductionImmediateChains[I] = LastChain; 7409 LastChain = I; 7410 } 7411 } 7412 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7413 << " reduction for phi: " << *Phi << "\n"); 7414 } 7415 } 7416 7417 // TODO: we could return a pair of values that specify the max VF and 7418 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7419 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7420 // doesn't have a cost model that can choose which plan to execute if 7421 // more than one is generated. 7422 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7423 LoopVectorizationCostModel &CM) { 7424 unsigned WidestType; 7425 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7426 return WidestVectorRegBits / WidestType; 7427 } 7428 7429 VectorizationFactor 7430 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7431 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7432 ElementCount VF = UserVF; 7433 // Outer loop handling: They may require CFG and instruction level 7434 // transformations before even evaluating whether vectorization is profitable. 7435 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7436 // the vectorization pipeline. 7437 if (!OrigLoop->isInnermost()) { 7438 // If the user doesn't provide a vectorization factor, determine a 7439 // reasonable one. 7440 if (UserVF.isZero()) { 7441 VF = ElementCount::getFixed(determineVPlanVF( 7442 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7443 .getFixedSize(), 7444 CM)); 7445 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7446 7447 // Make sure we have a VF > 1 for stress testing. 7448 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7449 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7450 << "overriding computed VF.\n"); 7451 VF = ElementCount::getFixed(4); 7452 } 7453 } 7454 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7455 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7456 "VF needs to be a power of two"); 7457 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7458 << "VF " << VF << " to build VPlans.\n"); 7459 buildVPlans(VF, VF); 7460 7461 // For VPlan build stress testing, we bail out after VPlan construction. 7462 if (VPlanBuildStressTest) 7463 return VectorizationFactor::Disabled(); 7464 7465 return {VF, 0 /*Cost*/}; 7466 } 7467 7468 LLVM_DEBUG( 7469 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the " 7470 "VPlan-native path.\n"); 7471 return VectorizationFactor::Disabled(); 7472 } 7473 7474 bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { 7475 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7476 return (NumRuntimePointerChecks > 7477 VectorizerParams::RuntimeMemoryCheckThreshold && 7478 !Hints.allowReordering()) || 7479 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7480 } 7481 7482 Optional<VectorizationFactor> 7483 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7484 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7485 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7486 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7487 return None; 7488 7489 // Invalidate interleave groups if all blocks of loop will be predicated. 7490 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7491 !useMaskedInterleavedAccesses(*TTI)) { 7492 LLVM_DEBUG( 7493 dbgs() 7494 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7495 "which requires masked-interleaved support.\n"); 7496 if (CM.InterleaveInfo.invalidateGroups()) 7497 // Invalidating interleave groups also requires invalidating all decisions 7498 // based on them, which includes widening decisions and uniform and scalar 7499 // values. 7500 CM.invalidateCostModelingDecisions(); 7501 } 7502 7503 ElementCount MaxUserVF = 7504 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7505 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7506 if (!UserVF.isZero() && UserVFIsLegal) { 7507 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7508 "VF needs to be a power of two"); 7509 // Collect the instructions (and their associated costs) that will be more 7510 // profitable to scalarize. 7511 if (CM.selectUserVectorizationFactor(UserVF)) { 7512 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7513 CM.collectInLoopReductions(); 7514 buildVPlansWithVPRecipes(UserVF, UserVF); 7515 LLVM_DEBUG(printPlans(dbgs())); 7516 return {{UserVF, 0}}; 7517 } else 7518 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7519 "InvalidCost", ORE, OrigLoop); 7520 } 7521 7522 // Populate the set of Vectorization Factor Candidates. 7523 ElementCountSet VFCandidates; 7524 for (auto VF = ElementCount::getFixed(1); 7525 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7526 VFCandidates.insert(VF); 7527 for (auto VF = ElementCount::getScalable(1); 7528 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7529 VFCandidates.insert(VF); 7530 7531 for (const auto &VF : VFCandidates) { 7532 // Collect Uniform and Scalar instructions after vectorization with VF. 7533 CM.collectUniformsAndScalars(VF); 7534 7535 // Collect the instructions (and their associated costs) that will be more 7536 // profitable to scalarize. 7537 if (VF.isVector()) 7538 CM.collectInstsToScalarize(VF); 7539 } 7540 7541 CM.collectInLoopReductions(); 7542 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7543 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7544 7545 LLVM_DEBUG(printPlans(dbgs())); 7546 if (!MaxFactors.hasVector()) 7547 return VectorizationFactor::Disabled(); 7548 7549 // Select the optimal vectorization factor. 
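// The candidate set built above contains the scalar VF plus every
// power-of-two fixed and scalable VF up to the respective maximum;
// selectVectorizationFactor picks, roughly, the candidate with the best
// expected cost per lane.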
7550 return CM.selectVectorizationFactor(VFCandidates); 7551 } 7552 7553 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7554 assert(count_if(VPlans, 7555 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7556 1 && 7557 "Best VF has not a single VPlan."); 7558 7559 for (const VPlanPtr &Plan : VPlans) { 7560 if (Plan->hasVF(VF)) 7561 return *Plan.get(); 7562 } 7563 llvm_unreachable("No plan found!"); 7564 } 7565 7566 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7567 SmallVector<Metadata *, 4> MDs; 7568 // Reserve first location for self reference to the LoopID metadata node. 7569 MDs.push_back(nullptr); 7570 bool IsUnrollMetadata = false; 7571 MDNode *LoopID = L->getLoopID(); 7572 if (LoopID) { 7573 // First find existing loop unrolling disable metadata. 7574 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7575 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7576 if (MD) { 7577 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7578 IsUnrollMetadata = 7579 S && S->getString().startswith("llvm.loop.unroll.disable"); 7580 } 7581 MDs.push_back(LoopID->getOperand(i)); 7582 } 7583 } 7584 7585 if (!IsUnrollMetadata) { 7586 // Add runtime unroll disable metadata. 7587 LLVMContext &Context = L->getHeader()->getContext(); 7588 SmallVector<Metadata *, 1> DisableOperands; 7589 DisableOperands.push_back( 7590 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7591 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7592 MDs.push_back(DisableNode); 7593 MDNode *NewLoopID = MDNode::get(Context, MDs); 7594 // Set operand 0 to refer to the loop id itself. 7595 NewLoopID->replaceOperandWith(0, NewLoopID); 7596 L->setLoopID(NewLoopID); 7597 } 7598 } 7599 7600 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7601 VPlan &BestVPlan, 7602 InnerLoopVectorizer &ILV, 7603 DominatorTree *DT) { 7604 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7605 << '\n'); 7606 7607 // Perform the actual loop transformation. 7608 7609 // 1. Set up the skeleton for vectorization, including vector pre-header and 7610 // middle block. The vector loop is created during VPlan execution. 7611 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7612 Value *CanonicalIVStartValue; 7613 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7614 ILV.createVectorizedLoopSkeleton(); 7615 ILV.collectPoisonGeneratingRecipes(State); 7616 7617 ILV.printDebugTracesAtStart(); 7618 7619 //===------------------------------------------------===// 7620 // 7621 // Notice: any optimization or new instruction that go 7622 // into the code below should also be implemented in 7623 // the cost-model. 7624 // 7625 //===------------------------------------------------===// 7626 7627 // 2. Copy and widen instructions from the old loop into the new loop. 7628 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7629 ILV.getOrCreateVectorTripCount(nullptr), 7630 CanonicalIVStartValue, State); 7631 BestVPlan.execute(&State); 7632 7633 // Keep all loop hints from the original loop on the vector loop (we'll 7634 // replace the vectorizer-specific hints below). 
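// (For example, follow-up metadata of the form
// !{"llvm.loop.vectorize.followup_vectorized", ...} lets front ends request
// specific attributes on the vectorized loop; otherwise the original hints
// are cloned and the loop is marked as already vectorized below.)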
7635 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7636 7637 Optional<MDNode *> VectorizedLoopID = 7638 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7639 LLVMLoopVectorizeFollowupVectorized}); 7640 7641 VPBasicBlock *HeaderVPBB = 7642 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7643 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7644 if (VectorizedLoopID.hasValue()) 7645 L->setLoopID(VectorizedLoopID.getValue()); 7646 else { 7647 // Keep all loop hints from the original loop on the vector loop (we'll 7648 // replace the vectorizer-specific hints below). 7649 if (MDNode *LID = OrigLoop->getLoopID()) 7650 L->setLoopID(LID); 7651 7652 LoopVectorizeHints Hints(L, true, *ORE); 7653 Hints.setAlreadyVectorized(); 7654 } 7655 // Disable runtime unrolling when vectorizing the epilogue loop. 7656 if (CanonicalIVStartValue) 7657 AddRuntimeUnrollDisableMetaData(L); 7658 7659 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7660 // predication, updating analyses. 7661 ILV.fixVectorizedLoop(State, BestVPlan); 7662 7663 ILV.printDebugTracesAtEnd(); 7664 } 7665 7666 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7667 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7668 for (const auto &Plan : VPlans) 7669 if (PrintVPlansInDotFormat) 7670 Plan->printDOT(O); 7671 else 7672 Plan->print(O); 7673 } 7674 #endif 7675 7676 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7677 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7678 7679 // We create new control-flow for the vectorized loop, so the original exit 7680 // conditions will be dead after vectorization if it's only used by the 7681 // terminator 7682 SmallVector<BasicBlock*> ExitingBlocks; 7683 OrigLoop->getExitingBlocks(ExitingBlocks); 7684 for (auto *BB : ExitingBlocks) { 7685 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7686 if (!Cmp || !Cmp->hasOneUse()) 7687 continue; 7688 7689 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7690 if (!DeadInstructions.insert(Cmp).second) 7691 continue; 7692 7693 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7694 // TODO: can recurse through operands in general 7695 for (Value *Op : Cmp->operands()) { 7696 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7697 DeadInstructions.insert(cast<Instruction>(Op)); 7698 } 7699 } 7700 7701 // We create new "steps" for induction variable updates to which the original 7702 // induction variables map. An original update instruction will be dead if 7703 // all its users except the induction variable are dead. 7704 auto *Latch = OrigLoop->getLoopLatch(); 7705 for (auto &Induction : Legal->getInductionVars()) { 7706 PHINode *Ind = Induction.first; 7707 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7708 7709 // If the tail is to be folded by masking, the primary induction variable, 7710 // if exists, isn't dead: it will be used for masking. Don't kill it. 
7711 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7712 continue; 7713 7714 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7715 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7716 })) 7717 DeadInstructions.insert(IndUpdate); 7718 } 7719 } 7720 7721 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7722 7723 //===--------------------------------------------------------------------===// 7724 // EpilogueVectorizerMainLoop 7725 //===--------------------------------------------------------------------===// 7726 7727 /// This function is partially responsible for generating the control flow 7728 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7729 std::pair<BasicBlock *, Value *> 7730 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7731 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7732 7733 // Workaround! Compute the trip count of the original loop and cache it 7734 // before we start modifying the CFG. This code has a systemic problem 7735 // wherein it tries to run analysis over partially constructed IR; this is 7736 // wrong, and not simply for SCEV. The trip count of the original loop 7737 // simply happens to be prone to hitting this in practice. In theory, we 7738 // can hit the same issue for any SCEV, or ValueTracking query done during 7739 // mutation. See PR49900. 7740 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7741 createVectorLoopSkeleton(""); 7742 7743 // Generate the code to check the minimum iteration count of the vector 7744 // epilogue (see below). 7745 EPI.EpilogueIterationCountCheck = 7746 emitIterationCountCheck(LoopScalarPreHeader, true); 7747 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7748 7749 // Generate the code to check any assumptions that we've made for SCEV 7750 // expressions. 7751 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7752 7753 // Generate the code that checks at runtime if arrays overlap. We put the 7754 // checks into a separate block to make the more common case of few elements 7755 // faster. 7756 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7757 7758 // Generate the iteration count check for the main loop, *after* the check 7759 // for the epilogue loop, so that the path-length is shorter for the case 7760 // that goes directly through the vector epilogue. The longer-path length for 7761 // the main loop is compensated for, by the gain from vectorizing the larger 7762 // trip count. Note: the branch will get updated later on when we vectorize 7763 // the epilogue. 7764 EPI.MainLoopIterationCountCheck = 7765 emitIterationCountCheck(LoopScalarPreHeader, false); 7766 7767 // Generate the induction variable. 7768 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7769 7770 // Skip induction resume value creation here because they will be created in 7771 // the second pass. If we created them here, they wouldn't be used anyway, 7772 // because the vplan in the second pass still contains the inductions from the 7773 // original loop. 
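// At this point the first-pass skeleton looks roughly like:
//   iter.check -> [SCEV checks] -> [memory checks]
//     -> vector.main.loop.iter.check -> vector.ph -> (main vector loop)
// with each check block able to branch directly to the scalar preheader.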
7774 7775 return {completeLoopSkeleton(OrigLoopID), nullptr}; 7776 } 7777 7778 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7779 LLVM_DEBUG({ 7780 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7781 << "Main Loop VF:" << EPI.MainLoopVF 7782 << ", Main Loop UF:" << EPI.MainLoopUF 7783 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7784 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7785 }); 7786 } 7787 7788 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7789 DEBUG_WITH_TYPE(VerboseDebug, { 7790 dbgs() << "intermediate fn:\n" 7791 << *OrigLoop->getHeader()->getParent() << "\n"; 7792 }); 7793 } 7794 7795 BasicBlock * 7796 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7797 bool ForEpilogue) { 7798 assert(Bypass && "Expected valid bypass basic block."); 7799 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7800 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7801 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7802 // Reuse existing vector loop preheader for TC checks. 7803 // Note that new preheader block is generated for vector loop. 7804 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7805 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7806 7807 // Generate code to check if the loop's trip count is less than VF * UF of the 7808 // main vector loop. 7809 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7810 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7811 7812 Value *CheckMinIters = Builder.CreateICmp( 7813 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7814 "min.iters.check"); 7815 7816 if (!ForEpilogue) 7817 TCCheckBlock->setName("vector.main.loop.iter.check"); 7818 7819 // Create new preheader for vector loop. 7820 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7821 DT, LI, nullptr, "vector.ph"); 7822 7823 if (ForEpilogue) { 7824 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7825 DT->getNode(Bypass)->getIDom()) && 7826 "TC check is expected to dominate Bypass"); 7827 7828 // Update dominator for Bypass & LoopExit. 7829 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7830 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7831 // For loops with multiple exits, there's no edge from the middle block 7832 // to exit blocks (as the epilogue must run) and thus no need to update 7833 // the immediate dominator of the exit blocks. 7834 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7835 7836 LoopBypassBlocks.push_back(TCCheckBlock); 7837 7838 // Save the trip count so we don't have to regenerate it in the 7839 // vec.epilog.iter.check. This is safe to do because the trip count 7840 // generated here dominates the vector epilog iter check. 7841 EPI.TripCount = Count; 7842 } 7843 7844 ReplaceInstWithInst( 7845 TCCheckBlock->getTerminator(), 7846 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7847 7848 return TCCheckBlock; 7849 } 7850 7851 //===--------------------------------------------------------------------===// 7852 // EpilogueVectorizerEpilogueLoop 7853 //===--------------------------------------------------------------------===// 7854 7855 /// This function is partially responsible for generating the control flow 7856 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
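/// In this second pass the preheader produced by the first pass is renamed
/// to vec.epilog.iter.check, a fresh vec.epilog.ph preheader is split off,
/// and the check blocks saved in EPI are re-wired so that skipping the
/// epilogue branches to the scalar preheader.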
7857 std::pair<BasicBlock *, Value *> 7858 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7859 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7860 createVectorLoopSkeleton("vec.epilog."); 7861 7862 // Now, compare the remaining count and if there aren't enough iterations to 7863 // execute the vectorized epilogue skip to the scalar part. 7864 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7865 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7866 LoopVectorPreHeader = 7867 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7868 LI, nullptr, "vec.epilog.ph"); 7869 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7870 VecEpilogueIterationCountCheck); 7871 7872 // Adjust the control flow taking the state info from the main loop 7873 // vectorization into account. 7874 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7875 "expected this to be saved from the previous pass."); 7876 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7877 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7878 7879 DT->changeImmediateDominator(LoopVectorPreHeader, 7880 EPI.MainLoopIterationCountCheck); 7881 7882 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7883 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7884 7885 if (EPI.SCEVSafetyCheck) 7886 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7887 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7888 if (EPI.MemSafetyCheck) 7889 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7890 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7891 7892 DT->changeImmediateDominator( 7893 VecEpilogueIterationCountCheck, 7894 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7895 7896 DT->changeImmediateDominator(LoopScalarPreHeader, 7897 EPI.EpilogueIterationCountCheck); 7898 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7899 // If there is an epilogue which must run, there's no edge from the 7900 // middle block to exit blocks and thus no need to update the immediate 7901 // dominator of the exit blocks. 7902 DT->changeImmediateDominator(LoopExitBlock, 7903 EPI.EpilogueIterationCountCheck); 7904 7905 // Keep track of bypass blocks, as they feed start values to the induction 7906 // phis in the scalar loop preheader. 7907 if (EPI.SCEVSafetyCheck) 7908 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7909 if (EPI.MemSafetyCheck) 7910 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7911 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7912 7913 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 7914 // merge control-flow from the latch block and the middle block. Update the 7915 // incoming values here and move the Phi into the preheader. 
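// (The phis are collected into a temporary vector first; moving them while
// iterating over phis() directly would invalidate the iterator.)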
7916 SmallVector<PHINode *, 4> PhisInBlock; 7917 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis()) 7918 PhisInBlock.push_back(&Phi); 7919 7920 for (PHINode *Phi : PhisInBlock) { 7921 Phi->replaceIncomingBlockWith( 7922 VecEpilogueIterationCountCheck->getSinglePredecessor(), 7923 VecEpilogueIterationCountCheck); 7924 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); 7925 if (EPI.SCEVSafetyCheck) 7926 Phi->removeIncomingValue(EPI.SCEVSafetyCheck); 7927 if (EPI.MemSafetyCheck) 7928 Phi->removeIncomingValue(EPI.MemSafetyCheck); 7929 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); 7930 } 7931 7932 // Generate a resume induction for the vector epilogue and put it in the 7933 // vector epilogue preheader. 7934 Type *IdxTy = Legal->getWidestInductionType(); 7935 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 7936 LoopVectorPreHeader->getFirstNonPHI()); 7937 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 7938 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 7939 EPI.MainLoopIterationCountCheck); 7940 7941 // Generate induction resume values. These variables save the new starting 7942 // indexes for the scalar loop. They are used to test if there are any tail 7943 // iterations left once the vector loop has completed. 7944 // Note that when the vectorized epilogue is skipped due to iteration count 7945 // check, the resume value for the induction variable comes from 7946 // the trip count of the main vector loop, hence passing the AdditionalBypass 7947 // argument. 7948 createInductionResumeValues({VecEpilogueIterationCountCheck, 7949 EPI.VectorTripCount} /* AdditionalBypass */); 7950 7951 return {completeLoopSkeleton(OrigLoopID), EPResumeVal}; 7952 } 7953 7954 BasicBlock * 7955 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 7956 BasicBlock *Bypass, BasicBlock *Insert) { 7957 7958 assert(EPI.TripCount && 7959 "Expected trip count to have been saved in the first pass."); 7960 assert( 7961 (!isa<Instruction>(EPI.TripCount) || 7962 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 7963 "saved trip count does not dominate insertion point."); 7964 Value *TC = EPI.TripCount; 7965 IRBuilder<> Builder(Insert->getTerminator()); 7966 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 7967 7968 // Generate code to check if the loop's trip count is less than VF * UF of the 7969 // vector epilogue loop. 7970 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7971 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7972 7973 Value *CheckMinIters = 7974 Builder.CreateICmp(P, Count, 7975 createStepForVF(Builder, Count->getType(), 7976 EPI.EpilogueVF, EPI.EpilogueUF), 7977 "min.epilog.iters.check"); 7978 7979 ReplaceInstWithInst( 7980 Insert->getTerminator(), 7981 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7982 7983 LoopBypassBlocks.push_back(Insert); 7984 return Insert; 7985 } 7986 7987 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7988 LLVM_DEBUG({ 7989 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7990 << "Epilogue Loop VF:" << EPI.EpilogueVF 7991 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7992 }); 7993 } 7994 7995 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7996 DEBUG_WITH_TYPE(VerboseDebug, { 7997 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7998 }); 7999 } 8000 8001 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8002 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8003 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8004 bool PredicateAtRangeStart = Predicate(Range.Start); 8005 8006 for (ElementCount TmpVF = Range.Start * 2; 8007 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8008 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8009 Range.End = TmpVF; 8010 break; 8011 } 8012 8013 return PredicateAtRangeStart; 8014 } 8015 8016 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8017 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8018 /// of VF's starting at a given VF and extending it as much as possible. Each 8019 /// vectorization decision can potentially shorten this sub-range during 8020 /// buildVPlan(). 8021 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8022 ElementCount MaxVF) { 8023 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8024 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8025 VFRange SubRange = {VF, MaxVFPlusOne}; 8026 VPlans.push_back(buildVPlan(SubRange)); 8027 VF = SubRange.End; 8028 } 8029 } 8030 8031 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8032 VPlanPtr &Plan) { 8033 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8034 8035 // Look for cached value. 8036 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8037 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8038 if (ECEntryIt != EdgeMaskCache.end()) 8039 return ECEntryIt->second; 8040 8041 VPValue *SrcMask = createBlockInMask(Src, Plan); 8042 8043 // The terminator has to be a branch inst! 8044 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8045 assert(BI && "Unexpected terminator found"); 8046 8047 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8048 return EdgeMaskCache[Edge] = SrcMask; 8049 8050 // If source is an exiting block, we know the exit edge is dynamically dead 8051 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8052 // adding uses of an otherwise potentially dead instruction. 
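// (Such an exit edge can only be taken on an iteration that the vector loop
// does not execute, hence it is treated as never taken here and its
// condition does not need to feed the mask.)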
8053 if (OrigLoop->isLoopExiting(Src)) 8054 return EdgeMaskCache[Edge] = SrcMask; 8055 8056 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8057 assert(EdgeMask && "No Edge Mask found for condition"); 8058 8059 if (BI->getSuccessor(0) != Dst) 8060 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 8061 8062 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8063 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8064 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8065 // The select version does not introduce new UB if SrcMask is false and 8066 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8067 VPValue *False = Plan->getOrAddVPValue( 8068 ConstantInt::getFalse(BI->getCondition()->getType())); 8069 EdgeMask = 8070 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8071 } 8072 8073 return EdgeMaskCache[Edge] = EdgeMask; 8074 } 8075 8076 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8077 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8078 8079 // Look for cached value. 8080 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8081 if (BCEntryIt != BlockMaskCache.end()) 8082 return BCEntryIt->second; 8083 8084 // All-one mask is modelled as no-mask following the convention for masked 8085 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8086 VPValue *BlockMask = nullptr; 8087 8088 if (OrigLoop->getHeader() == BB) { 8089 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8090 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8091 8092 // Introduce the early-exit compare IV <= BTC to form header block mask. 8093 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8094 // constructing the desired canonical IV in the header block as its first 8095 // non-phi instructions. 8096 assert(CM.foldTailByMasking() && "must fold the tail"); 8097 VPBasicBlock *HeaderVPBB = 8098 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8099 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8100 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8101 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8102 8103 VPBuilder::InsertPointGuard Guard(Builder); 8104 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8105 if (CM.TTI.emitGetActiveLaneMask()) { 8106 VPValue *TC = Plan->getOrCreateTripCount(); 8107 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8108 } else { 8109 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8110 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8111 } 8112 return BlockMaskCache[BB] = BlockMask; 8113 } 8114 8115 // This is the block mask. We OR all incoming edges. 8116 for (auto *Predecessor : predecessors(BB)) { 8117 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8118 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8119 return BlockMaskCache[BB] = EdgeMask; 8120 8121 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8122 BlockMask = EdgeMask; 8123 continue; 8124 } 8125 8126 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8127 } 8128 8129 return BlockMaskCache[BB] = BlockMask; 8130 } 8131 8132 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8133 ArrayRef<VPValue *> Operands, 8134 VFRange &Range, 8135 VPlanPtr &Plan) { 8136 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8137 "Must be called with either a load or store"); 8138 8139 auto willWiden = [&](ElementCount VF) -> bool { 8140 LoopVectorizationCostModel::InstWidening Decision = 8141 CM.getWideningDecision(I, VF); 8142 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8143 "CM decision should be taken at this point."); 8144 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8145 return true; 8146 if (CM.isScalarAfterVectorization(I, VF) || 8147 CM.isProfitableToScalarize(I, VF)) 8148 return false; 8149 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8150 }; 8151 8152 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8153 return nullptr; 8154 8155 VPValue *Mask = nullptr; 8156 if (Legal->isMaskRequired(I)) 8157 Mask = createBlockInMask(I->getParent(), Plan); 8158 8159 // Determine if the pointer operand of the access is either consecutive or 8160 // reverse consecutive. 8161 LoopVectorizationCostModel::InstWidening Decision = 8162 CM.getWideningDecision(I, Range.Start); 8163 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8164 bool Consecutive = 8165 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8166 8167 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8168 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8169 Consecutive, Reverse); 8170 8171 StoreInst *Store = cast<StoreInst>(I); 8172 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8173 Mask, Consecutive, Reverse); 8174 } 8175 8176 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also 8177 /// insert a recipe to expand the step for the induction recipe. 8178 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( 8179 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, 8180 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, 8181 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { 8182 // Returns true if an instruction \p I should be scalarized instead of 8183 // vectorized for the chosen vectorization factor. 8184 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8185 return CM.isScalarAfterVectorization(I, VF) || 8186 CM.isProfitableToScalarize(I, VF); 8187 }; 8188 8189 bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange( 8190 [&](ElementCount VF) { 8191 // Returns true if we should generate a scalar version of \p IV. 
8192 if (ShouldScalarizeInstruction(PhiOrTrunc, VF)) 8193 return true; 8194 auto isScalarInst = [&](User *U) -> bool { 8195 auto *I = cast<Instruction>(U); 8196 return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF); 8197 }; 8198 return any_of(PhiOrTrunc->users(), isScalarInst); 8199 }, 8200 Range); 8201 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8202 [&](ElementCount VF) { 8203 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8204 }, 8205 Range); 8206 assert(IndDesc.getStartValue() == 8207 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8208 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8209 "step must be loop invariant"); 8210 8211 VPValue *Step = 8212 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8213 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8214 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8215 NeedsScalarIV, !NeedsScalarIVOnly); 8216 } 8217 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8218 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8219 NeedsScalarIV, !NeedsScalarIVOnly); 8220 } 8221 8222 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8223 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8224 8225 // Check if this is an integer or fp induction. If so, build the recipe that 8226 // produces its scalar and vector values. 8227 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8228 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8229 *PSE.getSE(), *OrigLoop, Range); 8230 8231 // Check if this is pointer induction. If so, build the recipe for it. 8232 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) 8233 return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, 8234 *PSE.getSE()); 8235 return nullptr; 8236 } 8237 8238 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8239 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8240 // Optimize the special case where the source is a constant integer 8241 // induction variable. Notice that we can only optimize the 'trunc' case 8242 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8243 // (c) other casts depend on pointer size. 8244 8245 // Determine whether \p K is a truncation based on an induction variable that 8246 // can be optimized. 8247 auto isOptimizableIVTruncate = 8248 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8249 return [=](ElementCount VF) -> bool { 8250 return CM.isOptimizableIVTruncate(K, VF); 8251 }; 8252 }; 8253 8254 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8255 isOptimizableIVTruncate(I), Range)) { 8256 8257 auto *Phi = cast<PHINode>(I->getOperand(0)); 8258 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8259 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8260 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8261 *PSE.getSE(), *OrigLoop, Range); 8262 } 8263 return nullptr; 8264 } 8265 8266 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8267 ArrayRef<VPValue *> Operands, 8268 VPlanPtr &Plan) { 8269 // If all incoming values are equal, the incoming VPValue can be used directly 8270 // instead of creating a new VPBlendRecipe. 
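// (E.g. a phi such as "%p = phi i32 [ %x, %bb1 ], [ %x, %bb2 ]" blends the
// same value on every edge, so %x can be used directly.)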
8271 VPValue *FirstIncoming = Operands[0]; 8272 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8273 return FirstIncoming == Inc; 8274 })) { 8275 return Operands[0]; 8276 } 8277 8278 unsigned NumIncoming = Phi->getNumIncomingValues(); 8279 // For in-loop reductions, we do not need to create an additional select. 8280 VPValue *InLoopVal = nullptr; 8281 for (unsigned In = 0; In < NumIncoming; In++) { 8282 PHINode *PhiOp = 8283 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8284 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8285 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8286 InLoopVal = Operands[In]; 8287 } 8288 } 8289 8290 assert((!InLoopVal || NumIncoming == 2) && 8291 "Found an in-loop reduction for PHI with unexpected number of " 8292 "incoming values"); 8293 if (InLoopVal) 8294 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8295 8296 // We know that all PHIs in non-header blocks are converted into selects, so 8297 // we don't have to worry about the insertion order and we can just use the 8298 // builder. At this point we generate the predication tree. There may be 8299 // duplications since this is a simple recursive scan, but future 8300 // optimizations will clean it up. 8301 SmallVector<VPValue *, 2> OperandsWithMask; 8302 8303 for (unsigned In = 0; In < NumIncoming; In++) { 8304 VPValue *EdgeMask = 8305 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8306 assert((EdgeMask || NumIncoming == 1) && 8307 "Multiple predecessors with one having a full mask"); 8308 OperandsWithMask.push_back(Operands[In]); 8309 if (EdgeMask) 8310 OperandsWithMask.push_back(EdgeMask); 8311 } 8312 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8313 } 8314 8315 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8316 ArrayRef<VPValue *> Operands, 8317 VFRange &Range) const { 8318 8319 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8320 [this, CI](ElementCount VF) { 8321 return CM.isScalarWithPredication(CI, VF); 8322 }, 8323 Range); 8324 8325 if (IsPredicated) 8326 return nullptr; 8327 8328 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8329 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8330 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8331 ID == Intrinsic::pseudoprobe || 8332 ID == Intrinsic::experimental_noalias_scope_decl)) 8333 return nullptr; 8334 8335 auto willWiden = [&](ElementCount VF) -> bool { 8336 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8337 // The following case may be scalarized depending on the VF. 8338 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8339 // version of the instruction. 8340 // Is it beneficial to perform intrinsic call compared to lib call? 8341 bool NeedToScalarize = false; 8342 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8343 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8344 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8345 return UseVectorIntrinsic || !NeedToScalarize; 8346 }; 8347 8348 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8349 return nullptr; 8350 8351 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8352 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8353 } 8354 8355 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8356 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8357 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8358 // Instruction should be widened, unless it is scalar after vectorization, 8359 // scalarization is profitable or it is predicated. 8360 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8361 return CM.isScalarAfterVectorization(I, VF) || 8362 CM.isProfitableToScalarize(I, VF) || 8363 CM.isScalarWithPredication(I, VF); 8364 }; 8365 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8366 Range); 8367 } 8368 8369 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8370 ArrayRef<VPValue *> Operands) const { 8371 auto IsVectorizableOpcode = [](unsigned Opcode) { 8372 switch (Opcode) { 8373 case Instruction::Add: 8374 case Instruction::And: 8375 case Instruction::AShr: 8376 case Instruction::BitCast: 8377 case Instruction::FAdd: 8378 case Instruction::FCmp: 8379 case Instruction::FDiv: 8380 case Instruction::FMul: 8381 case Instruction::FNeg: 8382 case Instruction::FPExt: 8383 case Instruction::FPToSI: 8384 case Instruction::FPToUI: 8385 case Instruction::FPTrunc: 8386 case Instruction::FRem: 8387 case Instruction::FSub: 8388 case Instruction::ICmp: 8389 case Instruction::IntToPtr: 8390 case Instruction::LShr: 8391 case Instruction::Mul: 8392 case Instruction::Or: 8393 case Instruction::PtrToInt: 8394 case Instruction::SDiv: 8395 case Instruction::Select: 8396 case Instruction::SExt: 8397 case Instruction::Shl: 8398 case Instruction::SIToFP: 8399 case Instruction::SRem: 8400 case Instruction::Sub: 8401 case Instruction::Trunc: 8402 case Instruction::UDiv: 8403 case Instruction::UIToFP: 8404 case Instruction::URem: 8405 case Instruction::Xor: 8406 case Instruction::ZExt: 8407 case Instruction::Freeze: 8408 return true; 8409 } 8410 return false; 8411 }; 8412 8413 if (!IsVectorizableOpcode(I->getOpcode())) 8414 return nullptr; 8415 8416 // Success: widen this instruction. 
  return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
}

void VPRecipeBuilder::fixHeaderPhis() {
  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
  for (VPHeaderPHIRecipe *R : PhisToFix) {
    auto *PN = cast<PHINode>(R->getUnderlyingValue());
    VPRecipeBase *IncR =
        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
    R->addOperand(IncR->getVPSingleValue());
  }
}

VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
      Range);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors, if one of the operands is variant then we still
      // want to mark the call as uniform, which will generate one instruction
      // for just the first lane of the vector. We can't scalarize the call in
      // the same way as for fixed-width vectors because we don't know how many
      // lanes there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      // 1. For the assume intrinsic, generating the instruction for the first
      //    lane is still better than not generating any at all. For example,
      //    the input may be a splat across all lanes.
      // 2. For the lifetime start/end intrinsics the pointer operand only
      //    does anything useful when the input comes from a stack object,
      //    which suggests it should always be uniform. For non-stack objects
      //    the effect is to poison the object, which still allows us to
      //    remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
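  // For the predicated case, the recipe is placed below inside a triangular
  // replicate region built by createReplicateRegion, roughly:
  //   pred.<opcode>.entry -> pred.<opcode>.if -> pred.<opcode>.continue
  // with an extra edge from .entry straight to .continue for masked-off lanes.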
8492 if (!IsPredicated) { 8493 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8494 VPBB->appendRecipe(Recipe); 8495 return VPBB; 8496 } 8497 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8498 8499 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); 8500 assert(SingleSucc && "VPBB must have a single successor when handling " 8501 "predicated replication."); 8502 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); 8503 // Record predicated instructions for above packing optimizations. 8504 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8505 VPBlockUtils::insertBlockAfter(Region, VPBB); 8506 auto *RegSucc = new VPBasicBlock(); 8507 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8508 VPBlockUtils::connectBlocks(RegSucc, SingleSucc); 8509 return RegSucc; 8510 } 8511 8512 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8513 VPRecipeBase *PredRecipe, 8514 VPlanPtr &Plan) { 8515 // Instructions marked for predication are replicated and placed under an 8516 // if-then construct to prevent side-effects. 8517 8518 // Generate recipes to compute the block mask for this region. 8519 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8520 8521 // Build the triangular if-then region. 8522 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8523 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8524 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8525 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8526 auto *PHIRecipe = Instr->getType()->isVoidTy() 8527 ? nullptr 8528 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8529 if (PHIRecipe) { 8530 Plan->removeVPValueFor(Instr); 8531 Plan->addVPValue(Instr, PHIRecipe); 8532 } 8533 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8534 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8535 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8536 8537 // Note: first set Entry as region entry and then connect successors starting 8538 // from it in order, to propagate the "parent" of each VPBasicBlock. 8539 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8540 VPBlockUtils::connectBlocks(Pred, Exit); 8541 8542 return Region; 8543 } 8544 8545 VPRecipeOrVPValueTy 8546 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8547 ArrayRef<VPValue *> Operands, 8548 VFRange &Range, VPlanPtr &Plan) { 8549 // First, check for specific widening recipes that deal with inductions, Phi 8550 // nodes, calls and memory operations. 
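  // The dispatch order below matters: header phis (inductions, reductions,
  // first-order recurrences) and optimizable induction truncates are handled
  // before the generic VF > 1 widening attempts.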
8551 VPRecipeBase *Recipe; 8552 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8553 if (Phi->getParent() != OrigLoop->getHeader()) 8554 return tryToBlend(Phi, Operands, Plan); 8555 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8556 return toVPRecipeResult(Recipe); 8557 8558 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8559 assert((Legal->isReductionVariable(Phi) || 8560 Legal->isFirstOrderRecurrence(Phi)) && 8561 "can only widen reductions and first-order recurrences here"); 8562 VPValue *StartV = Operands[0]; 8563 if (Legal->isReductionVariable(Phi)) { 8564 const RecurrenceDescriptor &RdxDesc = 8565 Legal->getReductionVars().find(Phi)->second; 8566 assert(RdxDesc.getRecurrenceStartValue() == 8567 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8568 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8569 CM.isInLoopReduction(Phi), 8570 CM.useOrderedReductions(RdxDesc)); 8571 } else { 8572 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8573 } 8574 8575 // Record the incoming value from the backedge, so we can add the incoming 8576 // value from the backedge after all recipes have been created. 8577 recordRecipeOf(cast<Instruction>( 8578 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8579 PhisToFix.push_back(PhiRecipe); 8580 return toVPRecipeResult(PhiRecipe); 8581 } 8582 8583 if (isa<TruncInst>(Instr) && 8584 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8585 Range, *Plan))) 8586 return toVPRecipeResult(Recipe); 8587 8588 // All widen recipes below deal only with VF > 1. 8589 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8590 [&](ElementCount VF) { return VF.isScalar(); }, Range)) 8591 return nullptr; 8592 8593 if (auto *CI = dyn_cast<CallInst>(Instr)) 8594 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8595 8596 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8597 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8598 8599 if (!shouldWiden(Instr, Range)) 8600 return nullptr; 8601 8602 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8603 return toVPRecipeResult(new VPWidenGEPRecipe( 8604 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8605 8606 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8607 bool InvariantCond = 8608 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8609 return toVPRecipeResult(new VPWidenSelectRecipe( 8610 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8611 } 8612 8613 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8614 } 8615 8616 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8617 ElementCount MaxVF) { 8618 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8619 8620 // Collect instructions from the original loop that will become trivially dead 8621 // in the vectorized loop. We don't need to vectorize these instructions. For 8622 // example, original induction update instructions can become dead because we 8623 // separately emit induction "steps" when generating code for the new loop. 8624 // Similarly, we create a new latch condition when setting up the structure 8625 // of the new loop, so the old one can become dead. 8626 SmallPtrSet<Instruction *, 4> DeadInstructions; 8627 collectTriviallyDeadInstructions(DeadInstructions); 8628 8629 // Add assume instructions we need to drop to DeadInstructions, to prevent 8630 // them from being added to the VPlan. 
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst;
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
// BranchOnCount VPInstruction to the latch.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
                                  bool HasNUW, bool IsVPlanNative) {
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  auto *StartV = Plan.getOrAddVPValue(StartIdx);

  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
  Header->insert(CanonicalIVPHI, Header->begin());

  auto *CanonicalIVIncrement =
      new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
                               : VPInstruction::CanonicalIVIncrement,
                        {CanonicalIVPHI}, DL);
  CanonicalIVPHI->addOperand(CanonicalIVIncrement);

  VPBasicBlock *EB = TopRegion->getExitBasicBlock();
  if (IsVPlanNative)
    EB->setCondBit(nullptr);
  EB->appendRecipe(CanonicalIVIncrement);

  auto *BranchOnCount =
      new VPInstruction(VPInstruction::BranchOnCount,
                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
  EB->appendRecipe(BranchOnCount);
}

// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
// original exit block.
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
                                VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
                                VPlan &Plan) {
  BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
  BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
  // Only handle single-exit loops with unique exit blocks for now.
  if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
    return;

  // Introduce VPUsers modeling the exit values.
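  // E.g. for an LCSSA phi such as
  //   %lcssa = phi i32 [ %v, %loop.exiting ]
  // the VPValue for %v is looked up (or created) and recorded as the live-out
  // value for %lcssa.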
8710 for (PHINode &ExitPhi : ExitBB->phis()) { 8711 Value *IncomingValue = 8712 ExitPhi.getIncomingValueForBlock(ExitingBB); 8713 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8714 Plan.addLiveOut(&ExitPhi, V); 8715 } 8716 } 8717 8718 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8719 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8720 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8721 8722 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8723 8724 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8725 8726 // --------------------------------------------------------------------------- 8727 // Pre-construction: record ingredients whose recipes we'll need to further 8728 // process after constructing the initial VPlan. 8729 // --------------------------------------------------------------------------- 8730 8731 // Mark instructions we'll need to sink later and their targets as 8732 // ingredients whose recipe we'll need to record. 8733 for (auto &Entry : SinkAfter) { 8734 RecipeBuilder.recordRecipeOf(Entry.first); 8735 RecipeBuilder.recordRecipeOf(Entry.second); 8736 } 8737 for (auto &Reduction : CM.getInLoopReductionChains()) { 8738 PHINode *Phi = Reduction.first; 8739 RecurKind Kind = 8740 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8741 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8742 8743 RecipeBuilder.recordRecipeOf(Phi); 8744 for (auto &R : ReductionOperations) { 8745 RecipeBuilder.recordRecipeOf(R); 8746 // For min/max reductions, where we have a pair of icmp/select, we also 8747 // need to record the ICmp recipe, so it can be removed later. 8748 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8749 "Only min/max recurrences allowed for inloop reductions"); 8750 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8751 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8752 } 8753 } 8754 8755 // For each interleave group which is relevant for this (possibly trimmed) 8756 // Range, add it to the set of groups to be later applied to the VPlan and add 8757 // placeholders for its members' Recipes which we'll be replacing with a 8758 // single VPInterleaveRecipe. 8759 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8760 auto applyIG = [IG, this](ElementCount VF) -> bool { 8761 return (VF.isVector() && // Query is illegal for VF == 1 8762 CM.getWideningDecision(IG->getInsertPos(), VF) == 8763 LoopVectorizationCostModel::CM_Interleave); 8764 }; 8765 if (!getDecisionAndClampRange(applyIG, Range)) 8766 continue; 8767 InterleaveGroups.insert(IG); 8768 for (unsigned i = 0; i < IG->getFactor(); i++) 8769 if (Instruction *Member = IG->getMember(i)) 8770 RecipeBuilder.recordRecipeOf(Member); 8771 }; 8772 8773 // --------------------------------------------------------------------------- 8774 // Build initial VPlan: Scan the body of the loop in a topological order to 8775 // visit each basic block after having visited its predecessor basic blocks. 8776 // --------------------------------------------------------------------------- 8777 8778 // Create initial VPlan skeleton, starting with a block for the pre-header, 8779 // followed by a region for the vector loop, followed by the middle block. The 8780 // skeleton vector loop region contains a header and latch block. 
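  // That is, the initial shape is roughly:
  //   vector.ph -> ( vector.body -> vector.latch ) -> middle.block
  // where the parenthesized part is the "vector loop" VPRegionBlock.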
8781 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8782 auto Plan = std::make_unique<VPlan>(Preheader); 8783 8784 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8785 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8786 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8787 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8788 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8789 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8790 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8791 8792 Instruction *DLInst = 8793 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8794 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8795 DLInst ? DLInst->getDebugLoc() : DebugLoc(), 8796 !CM.foldTailByMasking(), false); 8797 8798 // Scan the body of the loop in a topological order to visit each basic block 8799 // after having visited its predecessor basic blocks. 8800 LoopBlocksDFS DFS(OrigLoop); 8801 DFS.perform(LI); 8802 8803 VPBasicBlock *VPBB = HeaderVPBB; 8804 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8805 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8806 // Relevant instructions from basic block BB will be grouped into VPRecipe 8807 // ingredients and fill a new VPBasicBlock. 8808 unsigned VPBBsForBB = 0; 8809 if (VPBB != HeaderVPBB) 8810 VPBB->setName(BB->getName()); 8811 Builder.setInsertPoint(VPBB); 8812 8813 // Introduce each ingredient into VPlan. 8814 // TODO: Model and preserve debug intrinsics in VPlan. 8815 for (Instruction &I : BB->instructionsWithoutDebug()) { 8816 Instruction *Instr = &I; 8817 8818 // First filter out irrelevant instructions, to ensure no recipes are 8819 // built for them. 8820 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8821 continue; 8822 8823 SmallVector<VPValue *, 4> Operands; 8824 auto *Phi = dyn_cast<PHINode>(Instr); 8825 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8826 Operands.push_back(Plan->getOrAddVPValue( 8827 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8828 } else { 8829 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8830 Operands = {OpRange.begin(), OpRange.end()}; 8831 } 8832 8833 // Invariant stores inside loop will be deleted and a single store 8834 // with the final reduction value will be added to the exit block 8835 StoreInst *SI; 8836 if ((SI = dyn_cast<StoreInst>(&I)) && 8837 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8838 continue; 8839 8840 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8841 Instr, Operands, Range, Plan)) { 8842 // If Instr can be simplified to an existing VPValue, use it. 8843 if (RecipeOrValue.is<VPValue *>()) { 8844 auto *VPV = RecipeOrValue.get<VPValue *>(); 8845 Plan->addVPValue(Instr, VPV); 8846 // If the re-used value is a recipe, register the recipe for the 8847 // instruction, in case the recipe for Instr needs to be recorded. 8848 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8849 RecipeBuilder.setRecipe(Instr, R); 8850 continue; 8851 } 8852 // Otherwise, add the new recipe. 
8853 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8854 for (auto *Def : Recipe->definedValues()) { 8855 auto *UV = Def->getUnderlyingValue(); 8856 Plan->addVPValue(UV, Def); 8857 } 8858 8859 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8860 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8861 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8862 // of the header block. That can happen for truncates of induction 8863 // variables. Those recipes are moved to the phi section of the header 8864 // block after applying SinkAfter, which relies on the original 8865 // position of the trunc. 8866 assert(isa<TruncInst>(Instr)); 8867 InductionsToMove.push_back( 8868 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8869 } 8870 RecipeBuilder.setRecipe(Instr, Recipe); 8871 VPBB->appendRecipe(Recipe); 8872 continue; 8873 } 8874 8875 // Otherwise, if all widening options failed, Instruction is to be 8876 // replicated. This may create a successor for VPBB. 8877 VPBasicBlock *NextVPBB = 8878 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8879 if (NextVPBB != VPBB) { 8880 VPBB = NextVPBB; 8881 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8882 : ""); 8883 } 8884 } 8885 8886 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8887 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8888 } 8889 8890 HeaderVPBB->setName("vector.body"); 8891 8892 // Fold the last, empty block into its predecessor. 8893 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8894 assert(VPBB && "expected to fold last (empty) block"); 8895 // After here, VPBB should not be used. 8896 VPBB = nullptr; 8897 8898 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8899 8900 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8901 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8902 "entry block must be set to a VPRegionBlock having a non-empty entry " 8903 "VPBasicBlock"); 8904 RecipeBuilder.fixHeaderPhis(); 8905 8906 // --------------------------------------------------------------------------- 8907 // Transform initial VPlan: Apply previously taken decisions, in order, to 8908 // bring the VPlan to its final state. 8909 // --------------------------------------------------------------------------- 8910 8911 // Apply Sink-After legal constraints. 8912 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8913 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8914 if (Region && Region->isReplicator()) { 8915 assert(Region->getNumSuccessors() == 1 && 8916 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8917 assert(R->getParent()->size() == 1 && 8918 "A recipe in an original replicator region must be the only " 8919 "recipe in its block"); 8920 return Region; 8921 } 8922 return nullptr; 8923 }; 8924 for (auto &Entry : SinkAfter) { 8925 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8926 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8927 8928 auto *TargetRegion = GetReplicateRegion(Target); 8929 auto *SinkRegion = GetReplicateRegion(Sink); 8930 if (!SinkRegion) { 8931 // If the sink source is not a replicate region, sink the recipe directly. 8932 if (TargetRegion) { 8933 // The target is in a replication region, make sure to move Sink to 8934 // the block after it, not into the replication region itself. 
8935 VPBasicBlock *NextBlock = 8936 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 8937 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8938 } else 8939 Sink->moveAfter(Target); 8940 continue; 8941 } 8942 8943 // The sink source is in a replicate region. Unhook the region from the CFG. 8944 auto *SinkPred = SinkRegion->getSinglePredecessor(); 8945 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 8946 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 8947 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 8948 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 8949 8950 if (TargetRegion) { 8951 // The target recipe is also in a replicate region, move the sink region 8952 // after the target region. 8953 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 8954 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 8955 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 8956 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 8957 } else { 8958 // The sink source is in a replicate region, we need to move the whole 8959 // replicate region, which should only contain a single recipe in the 8960 // main block. 8961 auto *SplitBlock = 8962 Target->getParent()->splitAt(std::next(Target->getIterator())); 8963 8964 auto *SplitPred = SplitBlock->getSinglePredecessor(); 8965 8966 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 8967 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 8968 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 8969 } 8970 } 8971 8972 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 8973 VPlanTransforms::removeRedundantInductionCasts(*Plan); 8974 8975 // Now that sink-after is done, move induction recipes for optimized truncates 8976 // to the phi section of the header block. 8977 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 8978 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8979 8980 // Adjust the recipes for any inloop reductions. 8981 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, 8982 RecipeBuilder, Range.Start); 8983 8984 // Introduce a recipe to combine the incoming and previous values of a 8985 // first-order recurrence. 8986 for (VPRecipeBase &R : 8987 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8988 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 8989 if (!RecurPhi) 8990 continue; 8991 8992 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 8993 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 8994 auto *Region = GetReplicateRegion(PrevRecipe); 8995 if (Region) 8996 InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor()); 8997 if (Region || PrevRecipe->isPhi()) 8998 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 8999 else 9000 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 9001 9002 auto *RecurSplice = cast<VPInstruction>( 9003 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 9004 {RecurPhi, RecurPhi->getBackedgeValue()})); 9005 9006 RecurPhi->replaceAllUsesWith(RecurSplice); 9007 // Set the first operand of RecurSplice to RecurPhi again, after replacing 9008 // all users. 9009 RecurSplice->setOperand(0, RecurPhi); 9010 } 9011 9012 // Interleave memory: for each Interleave Group we marked earlier as relevant 9013 // for this VPlan, replace the Recipes widening its memory instructions with a 9014 // single VPInterleaveRecipe at its insertion point. 
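  // E.g. for a group of two interleaved loads, the two widened-load recipes
  // are replaced by a single VPInterleaveRecipe defining both results, and the
  // members' original recipes are erased below.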
9015 for (auto IG : InterleaveGroups) { 9016 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9017 RecipeBuilder.getRecipe(IG->getInsertPos())); 9018 SmallVector<VPValue *, 4> StoredValues; 9019 for (unsigned i = 0; i < IG->getFactor(); ++i) 9020 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 9021 auto *StoreR = 9022 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 9023 StoredValues.push_back(StoreR->getStoredValue()); 9024 } 9025 9026 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9027 Recipe->getMask()); 9028 VPIG->insertBefore(Recipe); 9029 unsigned J = 0; 9030 for (unsigned i = 0; i < IG->getFactor(); ++i) 9031 if (Instruction *Member = IG->getMember(i)) { 9032 if (!Member->getType()->isVoidTy()) { 9033 VPValue *OriginalV = Plan->getVPValue(Member); 9034 Plan->removeVPValueFor(Member); 9035 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9036 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9037 J++; 9038 } 9039 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9040 } 9041 } 9042 9043 // From this point onwards, VPlan-to-VPlan transformations may change the plan 9044 // in ways that accessing values using original IR values is incorrect. 9045 Plan->disableValue2VPValue(); 9046 9047 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 9048 VPlanTransforms::sinkScalarOperands(*Plan); 9049 VPlanTransforms::mergeReplicateRegions(*Plan); 9050 VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop); 9051 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); 9052 9053 std::string PlanName; 9054 raw_string_ostream RSO(PlanName); 9055 ElementCount VF = Range.Start; 9056 Plan->addVF(VF); 9057 RSO << "Initial VPlan for VF={" << VF; 9058 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9059 Plan->addVF(VF); 9060 RSO << "," << VF; 9061 } 9062 RSO << "},UF>=1"; 9063 RSO.flush(); 9064 Plan->setName(PlanName); 9065 9066 // Fold Exit block into its predecessor if possible. 9067 // TODO: Fold block earlier once all VPlan transforms properly maintain a 9068 // VPBasicBlock as exit. 9069 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); 9070 9071 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9072 return Plan; 9073 } 9074 9075 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9076 // Outer loop handling: They may require CFG and instruction level 9077 // transformations before even evaluating whether vectorization is profitable. 9078 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9079 // the vectorization pipeline. 9080 assert(!OrigLoop->isInnermost()); 9081 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9082 9083 // Create new empty VPlan 9084 auto Plan = std::make_unique<VPlan>(); 9085 9086 // Build hierarchical CFG 9087 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9088 HCFGBuilder.buildHierarchicalCFG(); 9089 9090 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9091 VF *= 2) 9092 Plan->addVF(VF); 9093 9094 if (EnableVPlanPredication) { 9095 VPlanPredicator VPP(*Plan); 9096 VPP.predicate(); 9097 9098 // Avoid running transformation to recipes until masked code generation in 9099 // VPlan-native path is in place. 
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan,
      [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
      DeadInstructions, *PSE.getSE());

  // Update plan to be compatible with the inner loop vectorizer for
  // code-generation.
  VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *Preheader = LoopRegion->getEntryBasicBlock();
  VPBasicBlock *Exit = LoopRegion->getExitBasicBlock();
  VPBlockBase *Latch = Exit->getSinglePredecessor();
  VPBlockBase *Header = Preheader->getSingleSuccessor();

  // 1. Move preheader block out of main vector loop.
  Preheader->setParent(LoopRegion->getParent());
  VPBlockUtils::disconnectBlocks(Preheader, Header);
  VPBlockUtils::connectBlocks(Preheader, LoopRegion);
  Plan->setEntry(Preheader);

  // 2. Disconnect backedge and exit block.
  VPBlockUtils::disconnectBlocks(Latch, Header);
  VPBlockUtils::disconnectBlocks(Latch, Exit);

  // 3. Update entry and exit of main vector loop region.
  LoopRegion->setEntry(Header);
  LoopRegion->setExit(Latch);

  // 4. Remove exit block.
  delete Exit;

  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
                        true, true);
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For min/max reductions the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
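      // E.g. %s = call float @llvm.fmuladd.f32(float %a, float %b, float %acc)
      // is handled below by emitting a separate FMul recipe for %a * %b, which
      // then feeds the fadd reduction.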
9170 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9171 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) && 9172 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9173 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9174 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9175 "Expected to replace a VPWidenSelectSC"); 9176 FirstOpId = 1; 9177 } else { 9178 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) || 9179 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) && 9180 "Expected to replace a VPWidenSC"); 9181 FirstOpId = 0; 9182 } 9183 unsigned VecOpId = 9184 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9185 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9186 9187 auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent()) 9188 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9189 : nullptr; 9190 9191 if (IsFMulAdd) { 9192 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9193 // need to create an fmul recipe to use as the vector operand for the 9194 // fadd reduction. 9195 VPInstruction *FMulRecipe = new VPInstruction( 9196 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9197 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9198 WidenRecipe->getParent()->insert(FMulRecipe, 9199 WidenRecipe->getIterator()); 9200 VecOp = FMulRecipe; 9201 } 9202 VPReductionRecipe *RedRecipe = 9203 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9204 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9205 Plan->removeVPValueFor(R); 9206 Plan->addVPValue(R, RedRecipe); 9207 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9208 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9209 WidenRecipe->eraseFromParent(); 9210 9211 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9212 VPRecipeBase *CompareRecipe = 9213 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9214 assert(isa<VPWidenRecipe>(CompareRecipe) && 9215 "Expected to replace a VPWidenSC"); 9216 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9217 "Expected no remaining users"); 9218 CompareRecipe->eraseFromParent(); 9219 } 9220 Chain = R; 9221 } 9222 } 9223 9224 // If tail is folded by masking, introduce selects between the phi 9225 // and the live-out instruction of each reduction, at the beginning of the 9226 // dedicated latch block. 
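  // The select keeps the previous value on masked-off lanes, i.e. roughly:
  //   %rdx.select = select <block-in mask>, <live-out value>, <reduction phi>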
9227 if (CM.foldTailByMasking()) { 9228 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9229 for (VPRecipeBase &R : 9230 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9231 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9232 if (!PhiR || PhiR->isInLoop()) 9233 continue; 9234 VPValue *Cond = 9235 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9236 VPValue *Red = PhiR->getBackedgeValue(); 9237 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9238 "reduction recipe must be defined before latch"); 9239 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9240 } 9241 } 9242 } 9243 9244 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9245 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9246 VPSlotTracker &SlotTracker) const { 9247 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9248 IG->getInsertPos()->printAsOperand(O, false); 9249 O << ", "; 9250 getAddr()->printAsOperand(O, SlotTracker); 9251 VPValue *Mask = getMask(); 9252 if (Mask) { 9253 O << ", "; 9254 Mask->printAsOperand(O, SlotTracker); 9255 } 9256 9257 unsigned OpIdx = 0; 9258 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9259 if (!IG->getMember(i)) 9260 continue; 9261 if (getNumStoreOperands() > 0) { 9262 O << "\n" << Indent << " store "; 9263 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9264 O << " to index " << i; 9265 } else { 9266 O << "\n" << Indent << " "; 9267 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9268 O << " = load from index " << i; 9269 } 9270 ++OpIdx; 9271 } 9272 } 9273 #endif 9274 9275 void VPWidenCallRecipe::execute(VPTransformState &State) { 9276 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9277 *this, State); 9278 } 9279 9280 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9281 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9282 State.ILV->setDebugLocFromInst(&I); 9283 9284 // The condition can be loop invariant but still defined inside the 9285 // loop. This means that we can't just use the original 'cond' value. 9286 // We have to take the 'vectorized' value and pick the first lane. 9287 // Instcombine will make this a no-op. 9288 auto *InvarCond = 9289 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9290 9291 for (unsigned Part = 0; Part < State.UF; ++Part) { 9292 Value *Cond = InvarCond ? 
InvarCond : State.get(getOperand(0), Part); 9293 Value *Op0 = State.get(getOperand(1), Part); 9294 Value *Op1 = State.get(getOperand(2), Part); 9295 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9296 State.set(this, Sel, Part); 9297 State.ILV->addMetadata(Sel, &I); 9298 } 9299 } 9300 9301 void VPWidenRecipe::execute(VPTransformState &State) { 9302 auto &I = *cast<Instruction>(getUnderlyingValue()); 9303 auto &Builder = State.Builder; 9304 switch (I.getOpcode()) { 9305 case Instruction::Call: 9306 case Instruction::Br: 9307 case Instruction::PHI: 9308 case Instruction::GetElementPtr: 9309 case Instruction::Select: 9310 llvm_unreachable("This instruction is handled by a different recipe."); 9311 case Instruction::UDiv: 9312 case Instruction::SDiv: 9313 case Instruction::SRem: 9314 case Instruction::URem: 9315 case Instruction::Add: 9316 case Instruction::FAdd: 9317 case Instruction::Sub: 9318 case Instruction::FSub: 9319 case Instruction::FNeg: 9320 case Instruction::Mul: 9321 case Instruction::FMul: 9322 case Instruction::FDiv: 9323 case Instruction::FRem: 9324 case Instruction::Shl: 9325 case Instruction::LShr: 9326 case Instruction::AShr: 9327 case Instruction::And: 9328 case Instruction::Or: 9329 case Instruction::Xor: { 9330 // Just widen unops and binops. 9331 State.ILV->setDebugLocFromInst(&I); 9332 9333 for (unsigned Part = 0; Part < State.UF; ++Part) { 9334 SmallVector<Value *, 2> Ops; 9335 for (VPValue *VPOp : operands()) 9336 Ops.push_back(State.get(VPOp, Part)); 9337 9338 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9339 9340 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9341 VecOp->copyIRFlags(&I); 9342 9343 // If the instruction is vectorized and was in a basic block that needed 9344 // predication, we can't propagate poison-generating flags (nuw/nsw, 9345 // exact, etc.). The control flow has been linearized and the 9346 // instruction is no longer guarded by the predicate, which could make 9347 // the flag properties to no longer hold. 9348 if (State.MayGeneratePoisonRecipes.contains(this)) 9349 VecOp->dropPoisonGeneratingFlags(); 9350 } 9351 9352 // Use this vector value for all users of the original instruction. 9353 State.set(this, V, Part); 9354 State.ILV->addMetadata(V, &I); 9355 } 9356 9357 break; 9358 } 9359 case Instruction::Freeze: { 9360 State.ILV->setDebugLocFromInst(&I); 9361 9362 for (unsigned Part = 0; Part < State.UF; ++Part) { 9363 Value *Op = State.get(getOperand(0), Part); 9364 9365 Value *Freeze = Builder.CreateFreeze(Op); 9366 State.set(this, Freeze, Part); 9367 } 9368 break; 9369 } 9370 case Instruction::ICmp: 9371 case Instruction::FCmp: { 9372 // Widen compares. Generate vector compares. 9373 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9374 auto *Cmp = cast<CmpInst>(&I); 9375 State.ILV->setDebugLocFromInst(Cmp); 9376 for (unsigned Part = 0; Part < State.UF; ++Part) { 9377 Value *A = State.get(getOperand(0), Part); 9378 Value *B = State.get(getOperand(1), Part); 9379 Value *C = nullptr; 9380 if (FCmp) { 9381 // Propagate fast math flags. 
9382 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9383 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9384 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9385 } else { 9386 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9387 } 9388 State.set(this, C, Part); 9389 State.ILV->addMetadata(C, &I); 9390 } 9391 9392 break; 9393 } 9394 9395 case Instruction::ZExt: 9396 case Instruction::SExt: 9397 case Instruction::FPToUI: 9398 case Instruction::FPToSI: 9399 case Instruction::FPExt: 9400 case Instruction::PtrToInt: 9401 case Instruction::IntToPtr: 9402 case Instruction::SIToFP: 9403 case Instruction::UIToFP: 9404 case Instruction::Trunc: 9405 case Instruction::FPTrunc: 9406 case Instruction::BitCast: { 9407 auto *CI = cast<CastInst>(&I); 9408 State.ILV->setDebugLocFromInst(CI); 9409 9410 /// Vectorize casts. 9411 Type *DestTy = (State.VF.isScalar()) 9412 ? CI->getType() 9413 : VectorType::get(CI->getType(), State.VF); 9414 9415 for (unsigned Part = 0; Part < State.UF; ++Part) { 9416 Value *A = State.get(getOperand(0), Part); 9417 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9418 State.set(this, Cast, Part); 9419 State.ILV->addMetadata(Cast, &I); 9420 } 9421 break; 9422 } 9423 default: 9424 // This instruction is not vectorized by simple widening. 9425 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9426 llvm_unreachable("Unhandled instruction!"); 9427 } // end of switch. 9428 } 9429 9430 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9431 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9432 // Construct a vector GEP by widening the operands of the scalar GEP as 9433 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9434 // results in a vector of pointers when at least one operand of the GEP 9435 // is vector-typed. Thus, to keep the representation compact, we only use 9436 // vector-typed operands for loop-varying values. 9437 9438 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9439 // If we are vectorizing, but the GEP has only loop-invariant operands, 9440 // the GEP we build (by only using vector-typed operands for 9441 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9442 // produce a vector of pointers, we need to either arbitrarily pick an 9443 // operand to broadcast, or broadcast a clone of the original GEP. 9444 // Here, we broadcast a clone of the original. 9445 // 9446 // TODO: If at some point we decide to scalarize instructions having 9447 // loop-invariant operands, this special case will no longer be 9448 // required. We would add the scalarization decision to 9449 // collectLoopScalars() and teach getVectorValue() to broadcast 9450 // the lane-zero scalar value. 9451 auto *Clone = State.Builder.Insert(GEP->clone()); 9452 for (unsigned Part = 0; Part < State.UF; ++Part) { 9453 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9454 State.set(this, EntryPart, Part); 9455 State.ILV->addMetadata(EntryPart, GEP); 9456 } 9457 } else { 9458 // If the GEP has at least one loop-varying operand, we are sure to 9459 // produce a vector of pointers. But if we are only unrolling, we want 9460 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9461 // produce with the code below will be scalar (if VF == 1) or vector 9462 // (otherwise). Note that for the unroll-only case, we still maintain 9463 // values in the vector mapping with initVector, as we do for other 9464 // instructions. 
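    // E.g. with a loop-varying index operand this produces, per unroll part,
    // something like:
    //   %vgep = getelementptr T, T* %base, <VF x i64> %wide.index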
9465 for (unsigned Part = 0; Part < State.UF; ++Part) { 9466 // The pointer operand of the new GEP. If it's loop-invariant, we 9467 // won't broadcast it. 9468 auto *Ptr = IsPtrLoopInvariant 9469 ? State.get(getOperand(0), VPIteration(0, 0)) 9470 : State.get(getOperand(0), Part); 9471 9472 // Collect all the indices for the new GEP. If any index is 9473 // loop-invariant, we won't broadcast it. 9474 SmallVector<Value *, 4> Indices; 9475 for (unsigned I = 1, E = getNumOperands(); I < E; I++) { 9476 VPValue *Operand = getOperand(I); 9477 if (IsIndexLoopInvariant[I - 1]) 9478 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 9479 else 9480 Indices.push_back(State.get(Operand, Part)); 9481 } 9482 9483 // If the GEP instruction is vectorized and was in a basic block that 9484 // needed predication, we can't propagate the poison-generating 'inbounds' 9485 // flag. The control flow has been linearized and the GEP is no longer 9486 // guarded by the predicate, which could make the 'inbounds' properties to 9487 // no longer hold. 9488 bool IsInBounds = 9489 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0; 9490 9491 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 9492 // but it should be a vector, otherwise. 9493 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr, 9494 Indices, "", IsInBounds); 9495 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && 9496 "NewGEP is not a pointer vector"); 9497 State.set(this, NewGEP, Part); 9498 State.ILV->addMetadata(NewGEP, GEP); 9499 } 9500 } 9501 } 9502 9503 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9504 assert(!State.Instance && "Int or FP induction being replicated."); 9505 9506 Value *Start = getStartValue()->getLiveInIRValue(); 9507 const InductionDescriptor &ID = getInductionDescriptor(); 9508 TruncInst *Trunc = getTruncInst(); 9509 IRBuilderBase &Builder = State.Builder; 9510 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 9511 assert(State.VF.isVector() && "must have vector VF"); 9512 9513 // The value from the original loop to which we are mapping the new induction 9514 // variable. 9515 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 9516 9517 // Fast-math-flags propagate from the original induction instruction. 9518 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9519 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 9520 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 9521 9522 // Now do the actual transformations, and start with fetching the step value. 
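  // In outline (using the FP opcodes for FP inductions), the generated IV is:
  //   vector.ph:   SteppedStart = splat(Start) + <0, 1, ..., VF-1> * Step
  //   vector.body: %vec.ind  = phi [ SteppedStart, vector.ph ], [ %vec.ind.next, ... ]
  //                %step.add = %vec.ind + splat(VF * Step)   (one per unroll part)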
9523 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9524 9525 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9526 "Expected either an induction phi-node or a truncate of it!"); 9527 9528 // Construct the initial value of the vector IV in the vector loop preheader 9529 auto CurrIP = Builder.saveIP(); 9530 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9531 Builder.SetInsertPoint(VectorPH->getTerminator()); 9532 if (isa<TruncInst>(EntryVal)) { 9533 assert(Start->getType()->isIntegerTy() && 9534 "Truncation requires an integer type"); 9535 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9536 Step = Builder.CreateTrunc(Step, TruncType); 9537 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9538 } 9539 9540 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9541 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9542 Value *SteppedStart = getStepVector( 9543 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9544 9545 // We create vector phi nodes for both integer and floating-point induction 9546 // variables. Here, we determine the kind of arithmetic we will perform. 9547 Instruction::BinaryOps AddOp; 9548 Instruction::BinaryOps MulOp; 9549 if (Step->getType()->isIntegerTy()) { 9550 AddOp = Instruction::Add; 9551 MulOp = Instruction::Mul; 9552 } else { 9553 AddOp = ID.getInductionOpcode(); 9554 MulOp = Instruction::FMul; 9555 } 9556 9557 // Multiply the vectorization factor by the step using integer or 9558 // floating-point arithmetic as appropriate. 9559 Type *StepType = Step->getType(); 9560 Value *RuntimeVF; 9561 if (Step->getType()->isFloatingPointTy()) 9562 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9563 else 9564 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9565 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9566 9567 // Create a vector splat to use in the induction update. 9568 // 9569 // FIXME: If the step is non-constant, we create the vector splat with 9570 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9571 // handle a constant vector splat. 9572 Value *SplatVF = isa<Constant>(Mul) 9573 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9574 : Builder.CreateVectorSplat(State.VF, Mul); 9575 Builder.restoreIP(CurrIP); 9576 9577 // We may need to add the step a number of times, depending on the unroll 9578 // factor. The last of those goes into the PHI. 9579 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9580 &*State.CFG.PrevBB->getFirstInsertionPt()); 9581 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9582 Instruction *LastInduction = VecInd; 9583 for (unsigned Part = 0; Part < State.UF; ++Part) { 9584 State.set(this, LastInduction, Part); 9585 9586 if (isa<TruncInst>(EntryVal)) 9587 State.ILV->addMetadata(LastInduction, EntryVal); 9588 9589 LastInduction = cast<Instruction>( 9590 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9591 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9592 } 9593 9594 LastInduction->setName("vec.ind.next"); 9595 VecInd->addIncoming(SteppedStart, VectorPH); 9596 // Add induction update using an incorrect block temporarily. The phi node 9597 // will be fixed after VPlan execution. Note that at this point the latch 9598 // block cannot be used, as it does not exist yet. 9599 // TODO: Model increment value in VPlan, by turning the recipe into a 9600 // multi-def and a subclass of VPHeaderPHIRecipe. 
9601 VecInd->addIncoming(LastInduction, VectorPH); 9602 } 9603 9604 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9605 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9606 "Not a pointer induction according to InductionDescriptor!"); 9607 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9608 "Unexpected type."); 9609 9610 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9611 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9612 9613 if (onlyScalarsGenerated(State.VF)) { 9614 // This is the normalized GEP that starts counting at zero. 9615 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9616 CanonicalIV, IndDesc.getStep()->getType()); 9617 // Determine the number of scalars we need to generate for each unroll 9618 // iteration. If the instruction is uniform, we only need to generate the 9619 // first lane. Otherwise, we generate all VF values. 9620 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9621 assert((IsUniform || !State.VF.isScalable()) && 9622 "Cannot scalarize a scalable VF"); 9623 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9624 9625 for (unsigned Part = 0; Part < State.UF; ++Part) { 9626 Value *PartStart = 9627 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9628 9629 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9630 Value *Idx = State.Builder.CreateAdd( 9631 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9632 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9633 9634 Value *Step = CreateStepValue(IndDesc.getStep(), SE, 9635 State.CFG.PrevBB->getTerminator()); 9636 Value *SclrGep = emitTransformedIndex( 9637 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9638 SclrGep->setName("next.gep"); 9639 State.set(this, SclrGep, VPIteration(Part, Lane)); 9640 } 9641 } 9642 return; 9643 } 9644 9645 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9646 "Induction step not a SCEV constant!"); 9647 Type *PhiType = IndDesc.getStep()->getType(); 9648 9649 // Build a pointer phi 9650 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9651 Type *ScStValueType = ScalarStartValue->getType(); 9652 PHINode *NewPointerPhi = 9653 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9654 9655 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9656 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9657 9658 // A pointer induction, performed by using a gep 9659 const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); 9660 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9661 9662 const SCEV *ScalarStep = IndDesc.getStep(); 9663 SCEVExpander Exp(SE, DL, "induction"); 9664 Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 9665 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9666 Value *NumUnrolledElems = 9667 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9668 Value *InductionGEP = GetElementPtrInst::Create( 9669 IndDesc.getElementType(), NewPointerPhi, 9670 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9671 InductionLoc); 9672 // Add induction update using an incorrect block temporarily. The phi node 9673 // will be fixed after VPlan execution. Note that at this point the latch 9674 // block cannot be used, as it does not exist yet. 9675 // TODO: Model increment value in VPlan, by turning the recipe into a 9676 // multi-def and a subclass of VPHeaderPHIRecipe. 
9677 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9678 9679 // Create UF many actual address geps that use the pointer 9680 // phi as base and a vectorized version of the step value 9681 // (<step*0, ..., step*N>) as offset. 9682 for (unsigned Part = 0; Part < State.UF; ++Part) { 9683 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9684 Value *StartOffsetScalar = 9685 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9686 Value *StartOffset = 9687 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9688 // Create a vector of consecutive numbers from zero to VF. 9689 StartOffset = State.Builder.CreateAdd( 9690 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9691 9692 Value *GEP = State.Builder.CreateGEP( 9693 IndDesc.getElementType(), NewPointerPhi, 9694 State.Builder.CreateMul( 9695 StartOffset, 9696 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9697 "vector.gep")); 9698 State.set(this, GEP, Part); 9699 } 9700 } 9701 9702 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9703 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9704 9705 // Fast-math-flags propagate from the original induction instruction. 9706 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9707 if (IndDesc.getInductionBinOp() && 9708 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9709 State.Builder.setFastMathFlags( 9710 IndDesc.getInductionBinOp()->getFastMathFlags()); 9711 9712 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9713 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9714 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9715 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9716 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9717 ScalarIV = 9718 Ty->isIntegerTy() 9719 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9720 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9721 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9722 getStartValue()->getLiveInIRValue(), Step, 9723 IndDesc); 9724 ScalarIV->setName("offset.idx"); 9725 } 9726 if (TruncToTy) { 9727 assert(Step->getType()->isIntegerTy() && 9728 "Truncation requires an integer step"); 9729 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9730 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9731 } 9732 return ScalarIV; 9733 }; 9734 9735 Value *ScalarIV = CreateScalarIV(Step); 9736 if (State.VF.isVector()) { 9737 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9738 return; 9739 } 9740 9741 for (unsigned Part = 0; Part < State.UF; ++Part) { 9742 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9743 Value *EntryPart; 9744 if (Step->getType()->isFloatingPointTy()) { 9745 Value *StartIdx = 9746 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9747 // Floating-point operations inherit FMF via the builder's flags. 
9748 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9749 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9750 ScalarIV, MulOp); 9751 } else { 9752 Value *StartIdx = 9753 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9754 EntryPart = State.Builder.CreateAdd( 9755 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9756 } 9757 State.set(this, EntryPart, Part); 9758 } 9759 } 9760 9761 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9762 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this, 9763 State); 9764 } 9765 9766 void VPBlendRecipe::execute(VPTransformState &State) { 9767 State.ILV->setDebugLocFromInst(Phi, &State.Builder); 9768 // We know that all PHIs in non-header blocks are converted into 9769 // selects, so we don't have to worry about the insertion order and we 9770 // can just use the builder. 9771 // At this point we generate the predication tree. There may be 9772 // duplications since this is a simple recursive scan, but future 9773 // optimizations will clean it up. 9774 9775 unsigned NumIncoming = getNumIncomingValues(); 9776 9777 // Generate a sequence of selects of the form: 9778 // SELECT(Mask3, In3, 9779 // SELECT(Mask2, In2, 9780 // SELECT(Mask1, In1, 9781 // In0))) 9782 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9783 // are essentially undef are taken from In0. 9784 InnerLoopVectorizer::VectorParts Entry(State.UF); 9785 for (unsigned In = 0; In < NumIncoming; ++In) { 9786 for (unsigned Part = 0; Part < State.UF; ++Part) { 9787 // We might have single edge PHIs (blocks) - use an identity 9788 // 'select' for the first PHI operand. 9789 Value *In0 = State.get(getIncomingValue(In), Part); 9790 if (In == 0) 9791 Entry[Part] = In0; // Initialize with the first incoming value. 9792 else { 9793 // Select between the current value and the previous incoming edge 9794 // based on the incoming mask. 9795 Value *Cond = State.get(getMask(In), Part); 9796 Entry[Part] = 9797 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9798 } 9799 } 9800 } 9801 for (unsigned Part = 0; Part < State.UF; ++Part) 9802 State.set(this, Entry[Part], Part); 9803 } 9804 9805 void VPInterleaveRecipe::execute(VPTransformState &State) { 9806 assert(!State.Instance && "Interleave group being replicated."); 9807 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9808 getStoredValues(), getMask()); 9809 } 9810 9811 void VPReductionRecipe::execute(VPTransformState &State) { 9812 assert(!State.Instance && "Reduction being replicated."); 9813 Value *PrevInChain = State.get(getChainOp(), 0); 9814 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9815 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9816 // Propagate the fast-math flags carried by the underlying instruction. 
9817 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9818 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9819 for (unsigned Part = 0; Part < State.UF; ++Part) {
9820 Value *NewVecOp = State.get(getVecOp(), Part);
9821 if (VPValue *Cond = getCondOp()) {
9822 Value *NewCond = State.get(Cond, Part);
9823 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9824 Value *Iden = RdxDesc->getRecurrenceIdentity(
9825 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9826 Value *IdenVec =
9827 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9828 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9829 NewVecOp = Select;
9830 }
9831 Value *NewRed;
9832 Value *NextInChain;
9833 if (IsOrdered) {
9834 if (State.VF.isVector())
9835 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9836 PrevInChain);
9837 else
9838 NewRed = State.Builder.CreateBinOp(
9839 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9840 NewVecOp);
9841 PrevInChain = NewRed;
9842 } else {
9843 PrevInChain = State.get(getChainOp(), Part);
9844 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9845 }
9846 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9847 NextInChain =
9848 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9849 NewRed, PrevInChain);
9850 } else if (IsOrdered)
9851 NextInChain = NewRed;
9852 else
9853 NextInChain = State.Builder.CreateBinOp(
9854 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9855 PrevInChain);
9856 State.set(this, NextInChain, Part);
9857 }
9858 }
9859
9860 void VPReplicateRecipe::execute(VPTransformState &State) {
9861 if (State.Instance) { // Generate a single instance.
9862 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9863 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9864 IsPredicated, State);
9865 // Insert the scalar instance, packing it into a vector.
9866 if (AlsoPack && State.VF.isVector()) {
9867 // If we're constructing lane 0, initialize to start from poison.
9868 if (State.Instance->Lane.isFirstLane()) {
9869 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9870 Value *Poison = PoisonValue::get(
9871 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9872 State.set(this, Poison, State.Instance->Part);
9873 }
9874 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9875 }
9876 return;
9877 }
9878
9879 // Generate scalar instances for all VF lanes of all UF parts, unless the
9880 // instruction is uniform, in which case generate only the first lane for each
9881 // of the UF parts.
9882 unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9883 assert((!State.VF.isScalable() || IsUniform) && 9884 "Can't scalarize a scalable vector"); 9885 for (unsigned Part = 0; Part < State.UF; ++Part) 9886 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9887 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 9888 VPIteration(Part, Lane), IsPredicated, 9889 State); 9890 } 9891 9892 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9893 assert(State.Instance && "Branch on Mask works only on single instance."); 9894 9895 unsigned Part = State.Instance->Part; 9896 unsigned Lane = State.Instance->Lane.getKnownLane(); 9897 9898 Value *ConditionBit = nullptr; 9899 VPValue *BlockInMask = getMask(); 9900 if (BlockInMask) { 9901 ConditionBit = State.get(BlockInMask, Part); 9902 if (ConditionBit->getType()->isVectorTy()) 9903 ConditionBit = State.Builder.CreateExtractElement( 9904 ConditionBit, State.Builder.getInt32(Lane)); 9905 } else // Block in mask is all-one. 9906 ConditionBit = State.Builder.getTrue(); 9907 9908 // Replace the temporary unreachable terminator with a new conditional branch, 9909 // whose two destinations will be set later when they are created. 9910 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9911 assert(isa<UnreachableInst>(CurrentTerminator) && 9912 "Expected to replace unreachable terminator with conditional branch."); 9913 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9914 CondBr->setSuccessor(0, nullptr); 9915 ReplaceInstWithInst(CurrentTerminator, CondBr); 9916 } 9917 9918 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9919 assert(State.Instance && "Predicated instruction PHI works per instance."); 9920 Instruction *ScalarPredInst = 9921 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9922 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9923 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9924 assert(PredicatingBB && "Predicated block has no single predecessor."); 9925 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9926 "operand must be VPReplicateRecipe"); 9927 9928 // By current pack/unpack logic we need to generate only a single phi node: if 9929 // a vector value for the predicated instruction exists at this point it means 9930 // the instruction has vector users only, and a phi for the vector value is 9931 // needed. In this case the recipe of the predicated instruction is marked to 9932 // also do that packing, thereby "hoisting" the insert-element sequence. 9933 // Otherwise, a phi node for the scalar value is needed. 9934 unsigned Part = State.Instance->Part; 9935 if (State.hasVectorValue(getOperand(0), Part)) { 9936 Value *VectorValue = State.get(getOperand(0), Part); 9937 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9938 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9939 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9940 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9941 if (State.hasVectorValue(this, Part)) 9942 State.reset(this, VPhi, Part); 9943 else 9944 State.set(this, VPhi, Part); 9945 // NOTE: Currently we need to update the value of the operand, so the next 9946 // predicated iteration inserts its generated value in the correct vector. 
9947 State.reset(getOperand(0), VPhi, Part); 9948 } else { 9949 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9950 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9951 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9952 PredicatingBB); 9953 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9954 if (State.hasScalarValue(this, *State.Instance)) 9955 State.reset(this, Phi, *State.Instance); 9956 else 9957 State.set(this, Phi, *State.Instance); 9958 // NOTE: Currently we need to update the value of the operand, so the next 9959 // predicated iteration inserts its generated value in the correct vector. 9960 State.reset(getOperand(0), Phi, *State.Instance); 9961 } 9962 } 9963 9964 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9965 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9966 9967 // Attempt to issue a wide load. 9968 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9969 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9970 9971 assert((LI || SI) && "Invalid Load/Store instruction"); 9972 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9973 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9974 9975 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9976 9977 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9978 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9979 bool CreateGatherScatter = !Consecutive; 9980 9981 auto &Builder = State.Builder; 9982 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9983 bool isMaskRequired = getMask(); 9984 if (isMaskRequired) 9985 for (unsigned Part = 0; Part < State.UF; ++Part) 9986 BlockInMaskParts[Part] = State.get(getMask(), Part); 9987 9988 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9989 // Calculate the pointer for the specific unroll-part. 9990 GetElementPtrInst *PartPtr = nullptr; 9991 9992 bool InBounds = false; 9993 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9994 InBounds = gep->isInBounds(); 9995 if (Reverse) { 9996 // If the address is consecutive but reversed, then the 9997 // wide store needs to start at the last vector element. 9998 // RunTimeVF = VScale * VF.getKnownMinValue() 9999 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 10000 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 10001 // NumElt = -Part * RunTimeVF 10002 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 10003 // LastLane = 1 - RunTimeVF 10004 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 10005 PartPtr = 10006 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 10007 PartPtr->setIsInBounds(InBounds); 10008 PartPtr = cast<GetElementPtrInst>( 10009 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 10010 PartPtr->setIsInBounds(InBounds); 10011 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
10012 BlockInMaskParts[Part] = 10013 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 10014 } else { 10015 Value *Increment = 10016 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 10017 PartPtr = cast<GetElementPtrInst>( 10018 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 10019 PartPtr->setIsInBounds(InBounds); 10020 } 10021 10022 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 10023 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 10024 }; 10025 10026 // Handle Stores: 10027 if (SI) { 10028 State.ILV->setDebugLocFromInst(SI); 10029 10030 for (unsigned Part = 0; Part < State.UF; ++Part) { 10031 Instruction *NewSI = nullptr; 10032 Value *StoredVal = State.get(StoredValue, Part); 10033 if (CreateGatherScatter) { 10034 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10035 Value *VectorGep = State.get(getAddr(), Part); 10036 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 10037 MaskPart); 10038 } else { 10039 if (Reverse) { 10040 // If we store to reverse consecutive memory locations, then we need 10041 // to reverse the order of elements in the stored value. 10042 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 10043 // We don't want to update the value in the map as it might be used in 10044 // another expression. So don't call resetVectorValue(StoredVal). 10045 } 10046 auto *VecPtr = 10047 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10048 if (isMaskRequired) 10049 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 10050 BlockInMaskParts[Part]); 10051 else 10052 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 10053 } 10054 State.ILV->addMetadata(NewSI, SI); 10055 } 10056 return; 10057 } 10058 10059 // Handle loads. 10060 assert(LI && "Must have a load instruction"); 10061 State.ILV->setDebugLocFromInst(LI); 10062 for (unsigned Part = 0; Part < State.UF; ++Part) { 10063 Value *NewLI; 10064 if (CreateGatherScatter) { 10065 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 10066 Value *VectorGep = State.get(getAddr(), Part); 10067 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 10068 nullptr, "wide.masked.gather"); 10069 State.ILV->addMetadata(NewLI, LI); 10070 } else { 10071 auto *VecPtr = 10072 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 10073 if (isMaskRequired) 10074 NewLI = Builder.CreateMaskedLoad( 10075 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 10076 PoisonValue::get(DataTy), "wide.masked.load"); 10077 else 10078 NewLI = 10079 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 10080 10081 // Add metadata to the load, but setVectorValue to the reverse shuffle. 10082 State.ILV->addMetadata(NewLI, LI); 10083 if (Reverse) 10084 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 10085 } 10086 10087 State.set(getVPSingleValue(), NewLI, Part); 10088 } 10089 } 10090 10091 // Determine how to lower the scalar epilogue, which depends on 1) optimising 10092 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 10093 // predication, and 4) a TTI hook that analyses whether the loop is suitable 10094 // for predication. 
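// For example (illustrative): a function compiled with -Os, or considered cold
// under PGSO (unless vectorization is explicitly forced), is assigned
// CM_ScalarEpilogueNotAllowedOptSize by step 1) below even if predication was
// requested via the prefer-predicate-over-epilogue option or a loop hint,
// because the code-size check takes precedence over steps 2)-4).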
10095 static ScalarEpilogueLowering getScalarEpilogueLowering( 10096 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 10097 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 10098 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 10099 LoopVectorizationLegality &LVL) { 10100 // 1) OptSize takes precedence over all other options, i.e. if this is set, 10101 // don't look at hints or options, and don't request a scalar epilogue. 10102 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 10103 // LoopAccessInfo (due to code dependency and not being able to reliably get 10104 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10105 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10106 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10107 // back to the old way and vectorize with versioning when forced. See D81345.) 10108 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10109 PGSOQueryType::IRPass) && 10110 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10111 return CM_ScalarEpilogueNotAllowedOptSize; 10112 10113 // 2) If set, obey the directives 10114 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10115 switch (PreferPredicateOverEpilogue) { 10116 case PreferPredicateTy::ScalarEpilogue: 10117 return CM_ScalarEpilogueAllowed; 10118 case PreferPredicateTy::PredicateElseScalarEpilogue: 10119 return CM_ScalarEpilogueNotNeededUsePredicate; 10120 case PreferPredicateTy::PredicateOrDontVectorize: 10121 return CM_ScalarEpilogueNotAllowedUsePredicate; 10122 }; 10123 } 10124 10125 // 3) If set, obey the hints 10126 switch (Hints.getPredicate()) { 10127 case LoopVectorizeHints::FK_Enabled: 10128 return CM_ScalarEpilogueNotNeededUsePredicate; 10129 case LoopVectorizeHints::FK_Disabled: 10130 return CM_ScalarEpilogueAllowed; 10131 }; 10132 10133 // 4) if the TTI hook indicates this is profitable, request predication. 10134 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10135 LVL.getLAI())) 10136 return CM_ScalarEpilogueNotNeededUsePredicate; 10137 10138 return CM_ScalarEpilogueAllowed; 10139 } 10140 10141 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10142 // If Values have been set for this Def return the one relevant for \p Part. 10143 if (hasVectorValue(Def, Part)) 10144 return Data.PerPartOutput[Def][Part]; 10145 10146 if (!hasScalarValue(Def, {Part, 0})) { 10147 Value *IRV = Def->getLiveInIRValue(); 10148 Value *B = ILV->getBroadcastInstrs(IRV); 10149 set(Def, B, Part); 10150 return B; 10151 } 10152 10153 Value *ScalarValue = get(Def, {Part, 0}); 10154 // If we aren't vectorizing, we can just copy the scalar map values over 10155 // to the vector map. 10156 if (VF.isScalar()) { 10157 set(Def, ScalarValue, Part); 10158 return ScalarValue; 10159 } 10160 10161 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10162 bool IsUniform = RepR && RepR->isUniform(); 10163 10164 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10165 // Check if there is a scalar value for the selected lane. 10166 if (!hasScalarValue(Def, {Part, LastLane})) { 10167 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
10168 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) || 10169 isa<VPScalarIVStepsRecipe>(Def->getDef())) && 10170 "unexpected recipe found to be invariant"); 10171 IsUniform = true; 10172 LastLane = 0; 10173 } 10174 10175 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 10176 // Set the insert point after the last scalarized instruction or after the 10177 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence 10178 // will directly follow the scalar definitions. 10179 auto OldIP = Builder.saveIP(); 10180 auto NewIP = 10181 isa<PHINode>(LastInst) 10182 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) 10183 : std::next(BasicBlock::iterator(LastInst)); 10184 Builder.SetInsertPoint(&*NewIP); 10185 10186 // However, if we are vectorizing, we need to construct the vector values. 10187 // If the value is known to be uniform after vectorization, we can just 10188 // broadcast the scalar value corresponding to lane zero for each unroll 10189 // iteration. Otherwise, we construct the vector values using 10190 // insertelement instructions. Since the resulting vectors are stored in 10191 // State, we will only generate the insertelements once. 10192 Value *VectorValue = nullptr; 10193 if (IsUniform) { 10194 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 10195 set(Def, VectorValue, Part); 10196 } else { 10197 // Initialize packing with insertelements to start from undef. 10198 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 10199 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 10200 set(Def, Undef, Part); 10201 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 10202 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 10203 VectorValue = get(Def, Part); 10204 } 10205 Builder.restoreIP(OldIP); 10206 return VectorValue; 10207 } 10208 10209 // Process the loop in the VPlan-native vectorization path. This path builds 10210 // VPlan upfront in the vectorization pipeline, which allows to apply 10211 // VPlan-to-VPlan transformations from the very beginning without modifying the 10212 // input LLVM IR. 10213 static bool processLoopInVPlanNativePath( 10214 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 10215 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 10216 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 10217 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 10218 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 10219 LoopVectorizationRequirements &Requirements) { 10220 10221 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 10222 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 10223 return false; 10224 } 10225 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 10226 Function *F = L->getHeader()->getParent(); 10227 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 10228 10229 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10230 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 10231 10232 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 10233 &Hints, IAI); 10234 // Use the planner for outer loop vectorization. 10235 // TODO: CM is not used at this point inside the planner. Turn CM into an 10236 // optional argument if we don't need it in the future. 10237 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 10238 Requirements, ORE); 10239 10240 // Get user vectorization factor. 
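  // Illustrative note: the user VF typically comes from loop metadata, e.g.
  //   #pragma clang loop vectorize_width(4)
  // is lowered by the frontend to a !{!"llvm.loop.vectorize.width", i32 4}
  // entry on the loop's !llvm.loop metadata, which LoopVectorizeHints reads
  // back here.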
10241 ElementCount UserVF = Hints.getWidth(); 10242 10243 CM.collectElementTypesForWidening(); 10244 10245 // Plan how to best vectorize, return the best VF and its cost. 10246 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 10247 10248 // If we are stress testing VPlan builds, do not attempt to generate vector 10249 // code. Masked vector code generation support will follow soon. 10250 // Also, do not attempt to vectorize if no vector code will be produced. 10251 if (VPlanBuildStressTest || EnableVPlanPredication || 10252 VectorizationFactor::Disabled() == VF) 10253 return false; 10254 10255 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10256 10257 { 10258 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10259 F->getParent()->getDataLayout()); 10260 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 10261 &CM, BFI, PSI, Checks); 10262 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 10263 << L->getHeader()->getParent()->getName() << "\"\n"); 10264 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT); 10265 } 10266 10267 // Mark the loop as already vectorized to avoid vectorizing again. 10268 Hints.setAlreadyVectorized(); 10269 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10270 return true; 10271 } 10272 10273 // Emit a remark if there are stores to floats that required a floating point 10274 // extension. If the vectorized loop was generated with floating point there 10275 // will be a performance penalty from the conversion overhead and the change in 10276 // the vector width. 10277 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 10278 SmallVector<Instruction *, 4> Worklist; 10279 for (BasicBlock *BB : L->getBlocks()) { 10280 for (Instruction &Inst : *BB) { 10281 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 10282 if (S->getValueOperand()->getType()->isFloatTy()) 10283 Worklist.push_back(S); 10284 } 10285 } 10286 } 10287 10288 // Traverse the floating point stores upwards searching, for floating point 10289 // conversions. 10290 SmallPtrSet<const Instruction *, 4> Visited; 10291 SmallPtrSet<const Instruction *, 4> EmittedRemark; 10292 while (!Worklist.empty()) { 10293 auto *I = Worklist.pop_back_val(); 10294 if (!L->contains(I)) 10295 continue; 10296 if (!Visited.insert(I).second) 10297 continue; 10298 10299 // Emit a remark if the floating point store required a floating 10300 // point conversion. 10301 // TODO: More work could be done to identify the root cause such as a 10302 // constant or a function return type and point the user to it. 10303 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 10304 ORE->emit([&]() { 10305 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 10306 I->getDebugLoc(), L->getHeader()) 10307 << "floating point conversion changes vector width. " 10308 << "Mixed floating point precision requires an up/down " 10309 << "cast that will negatively impact performance."; 10310 }); 10311 10312 for (Use &Op : I->operands()) 10313 if (auto *OpI = dyn_cast<Instruction>(Op)) 10314 Worklist.push_back(OpI); 10315 } 10316 } 10317 10318 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 10319 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 10320 !EnableLoopInterleaving), 10321 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 10322 !EnableLoopVectorization) {} 10323 10324 bool LoopVectorizePass::processLoop(Loop *L) { 10325 assert((EnableVPlanNativePath || L->isInnermost()) && 10326 "VPlan-native path is not enabled. 
Only process inner loops."); 10327 10328 #ifndef NDEBUG 10329 const std::string DebugLocStr = getDebugLocString(L); 10330 #endif /* NDEBUG */ 10331 10332 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10333 << L->getHeader()->getParent()->getName() << "' from " 10334 << DebugLocStr << "\n"); 10335 10336 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10337 10338 LLVM_DEBUG( 10339 dbgs() << "LV: Loop hints:" 10340 << " force=" 10341 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10342 ? "disabled" 10343 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10344 ? "enabled" 10345 : "?")) 10346 << " width=" << Hints.getWidth() 10347 << " interleave=" << Hints.getInterleave() << "\n"); 10348 10349 // Function containing loop 10350 Function *F = L->getHeader()->getParent(); 10351 10352 // Looking at the diagnostic output is the only way to determine if a loop 10353 // was vectorized (other than looking at the IR or machine code), so it 10354 // is important to generate an optimization remark for each loop. Most of 10355 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10356 // generated as OptimizationRemark and OptimizationRemarkMissed are 10357 // less verbose reporting vectorized loops and unvectorized loops that may 10358 // benefit from vectorization, respectively. 10359 10360 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10361 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10362 return false; 10363 } 10364 10365 PredicatedScalarEvolution PSE(*SE, *L); 10366 10367 // Check if it is legal to vectorize the loop. 10368 LoopVectorizationRequirements Requirements; 10369 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10370 &Requirements, &Hints, DB, AC, BFI, PSI); 10371 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10372 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10373 Hints.emitRemarkWithHints(); 10374 return false; 10375 } 10376 10377 // Check the function attributes and profiles to find out if this function 10378 // should be optimized for size. 10379 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10380 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10381 10382 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10383 // here. They may require CFG and instruction level transformations before 10384 // even evaluating whether vectorization is profitable. Since we cannot modify 10385 // the incoming IR, we need to build VPlan upfront in the vectorization 10386 // pipeline. 10387 if (!L->isInnermost()) 10388 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10389 ORE, BFI, PSI, Hints, Requirements); 10390 10391 assert(L->isInnermost() && "Inner loop expected."); 10392 10393 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10394 // count by optimizing for size, to minimize overheads. 10395 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10396 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10397 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10398 << "This loop is worth vectorizing only if no scalar " 10399 << "iteration overheads are incurred."); 10400 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10401 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10402 else { 10403 LLVM_DEBUG(dbgs() << "\n"); 10404 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10405 } 10406 } 10407 10408 // Check the function attributes to see if implicit floats are allowed. 10409 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10410 // an integer loop and the vector instructions selected are purely integer 10411 // vector instructions? 10412 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10413 reportVectorizationFailure( 10414 "Can't vectorize when the NoImplicitFloat attribute is used", 10415 "loop not vectorized due to NoImplicitFloat attribute", 10416 "NoImplicitFloat", ORE, L); 10417 Hints.emitRemarkWithHints(); 10418 return false; 10419 } 10420 10421 // Check if the target supports potentially unsafe FP vectorization. 10422 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10423 // for the target we're vectorizing for, to make sure none of the 10424 // additional fp-math flags can help. 10425 if (Hints.isPotentiallyUnsafe() && 10426 TTI->isFPVectorizationPotentiallyUnsafe()) { 10427 reportVectorizationFailure( 10428 "Potentially unsafe FP op prevents vectorization", 10429 "loop not vectorized due to unsafe FP support.", 10430 "UnsafeFP", ORE, L); 10431 Hints.emitRemarkWithHints(); 10432 return false; 10433 } 10434 10435 bool AllowOrderedReductions; 10436 // If the flag is set, use that instead and override the TTI behaviour. 10437 if (ForceOrderedReductions.getNumOccurrences() > 0) 10438 AllowOrderedReductions = ForceOrderedReductions; 10439 else 10440 AllowOrderedReductions = TTI->enableOrderedReductions(); 10441 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10442 ORE->emit([&]() { 10443 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10444 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10445 ExactFPMathInst->getDebugLoc(), 10446 ExactFPMathInst->getParent()) 10447 << "loop not vectorized: cannot prove it is safe to reorder " 10448 "floating-point operations"; 10449 }); 10450 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10451 "reorder floating-point operations\n"); 10452 Hints.emitRemarkWithHints(); 10453 return false; 10454 } 10455 10456 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10457 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10458 10459 // If an override option has been passed in for interleaved accesses, use it. 10460 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10461 UseInterleaved = EnableInterleavedMemAccesses; 10462 10463 // Analyze interleaved memory accesses. 10464 if (UseInterleaved) { 10465 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10466 } 10467 10468 // Use the cost model. 10469 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10470 F, &Hints, IAI); 10471 CM.collectValuesToIgnore(); 10472 CM.collectElementTypesForWidening(); 10473 10474 // Use the planner for vectorization. 10475 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10476 Requirements, ORE); 10477 10478 // Get user vectorization factor and interleave count. 
10479 ElementCount UserVF = Hints.getWidth();
10480 unsigned UserIC = Hints.getInterleave();
10481
10482 // Plan how to best vectorize, return the best VF and its cost.
10483 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10484
10485 VectorizationFactor VF = VectorizationFactor::Disabled();
10486 unsigned IC = 1;
10487
10488 if (MaybeVF) {
10489 if (LVP.requiresTooManyRuntimeChecks()) {
10490 ORE->emit([&]() {
10491 return OptimizationRemarkAnalysisAliasing(
10492 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10493 L->getHeader())
10494 << "loop not vectorized: cannot prove it is safe to reorder "
10495 "memory operations";
10496 });
10497 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10498 Hints.emitRemarkWithHints();
10499 return false;
10500 }
10501 VF = *MaybeVF;
10502 // Select the interleave count.
10503 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10504 }
10505
10506 // Identify the diagnostic messages that should be produced.
10507 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10508 bool VectorizeLoop = true, InterleaveLoop = true;
10509 if (VF.Width.isScalar()) {
10510 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10511 VecDiagMsg = std::make_pair(
10512 "VectorizationNotBeneficial",
10513 "the cost-model indicates that vectorization is not beneficial");
10514 VectorizeLoop = false;
10515 }
10516
10517 if (!MaybeVF && UserIC > 1) {
10518 // Tell the user interleaving was avoided up-front, despite being explicitly
10519 // requested.
10520 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10521 "interleaving should be avoided up front\n");
10522 IntDiagMsg = std::make_pair(
10523 "InterleavingAvoided",
10524 "Ignoring UserIC, because interleaving was avoided up front");
10525 InterleaveLoop = false;
10526 } else if (IC == 1 && UserIC <= 1) {
10527 // Tell the user interleaving is not beneficial.
10528 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10529 IntDiagMsg = std::make_pair(
10530 "InterleavingNotBeneficial",
10531 "the cost-model indicates that interleaving is not beneficial");
10532 InterleaveLoop = false;
10533 if (UserIC == 1) {
10534 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10535 IntDiagMsg.second +=
10536 " and is explicitly disabled or interleave count is set to 1";
10537 }
10538 } else if (IC > 1 && UserIC == 1) {
10539 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10540 LLVM_DEBUG(
10541 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10542 IntDiagMsg = std::make_pair(
10543 "InterleavingBeneficialButDisabled",
10544 "the cost-model indicates that interleaving is beneficial "
10545 "but is explicitly disabled or interleave count is set to 1");
10546 InterleaveLoop = false;
10547 }
10548
10549 // Override IC if user provided an interleave count.
10550 IC = UserIC > 0 ? UserIC : IC;
10551
10552 // Emit diagnostic messages, if any.
10553 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10554 if (!VectorizeLoop && !InterleaveLoop) {
10555 // Do not vectorize or interleave the loop.
10556 ORE->emit([&]() { 10557 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10558 L->getStartLoc(), L->getHeader()) 10559 << VecDiagMsg.second; 10560 }); 10561 ORE->emit([&]() { 10562 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10563 L->getStartLoc(), L->getHeader()) 10564 << IntDiagMsg.second; 10565 }); 10566 return false; 10567 } else if (!VectorizeLoop && InterleaveLoop) { 10568 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10569 ORE->emit([&]() { 10570 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10571 L->getStartLoc(), L->getHeader()) 10572 << VecDiagMsg.second; 10573 }); 10574 } else if (VectorizeLoop && !InterleaveLoop) { 10575 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10576 << ") in " << DebugLocStr << '\n'); 10577 ORE->emit([&]() { 10578 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10579 L->getStartLoc(), L->getHeader()) 10580 << IntDiagMsg.second; 10581 }); 10582 } else if (VectorizeLoop && InterleaveLoop) { 10583 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10584 << ") in " << DebugLocStr << '\n'); 10585 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10586 } 10587 10588 bool DisableRuntimeUnroll = false; 10589 MDNode *OrigLoopID = L->getLoopID(); 10590 { 10591 // Optimistically generate runtime checks. Drop them if they turn out to not 10592 // be profitable. Limit the scope of Checks, so the cleanup happens 10593 // immediately after vector codegeneration is done. 10594 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10595 F->getParent()->getDataLayout()); 10596 if (!VF.Width.isScalar() || IC > 1) 10597 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC); 10598 10599 using namespace ore; 10600 if (!VectorizeLoop) { 10601 assert(IC > 1 && "interleave count should not be 1 or 0"); 10602 // If we decided that it is not legal to vectorize the loop, then 10603 // interleave it. 10604 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10605 &CM, BFI, PSI, Checks); 10606 10607 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10608 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT); 10609 10610 ORE->emit([&]() { 10611 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10612 L->getHeader()) 10613 << "interleaved loop (interleaved count: " 10614 << NV("InterleaveCount", IC) << ")"; 10615 }); 10616 } else { 10617 // If we decided that it is *legal* to vectorize the loop, then do it. 10618 10619 // Consider vectorizing the epilogue too if it's profitable. 10620 VectorizationFactor EpilogueVF = 10621 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10622 if (EpilogueVF.Width.isVector()) { 10623 10624 // The first pass vectorizes the main loop and creates a scalar epilogue 10625 // to be vectorized by executing the plan (potentially with a different 10626 // factor) again shortly afterwards. 10627 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10628 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10629 EPI, &LVL, &CM, BFI, PSI, Checks); 10630 10631 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10632 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, 10633 DT); 10634 ++LoopsVectorized; 10635 10636 // Second pass vectorizes the epilogue and adjusts the control flow 10637 // edges from the first pass. 
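        // Rough illustrative sketch of the final layout (block names
        // approximate): the main vector loop runs first, then
        // vec.epilog.iter.check decides whether the vectorized epilogue
        // (vec.epilog.vector.body, produced by this second pass) handles the
        // remaining iterations, and any final leftover iterations fall through
        // to the scalar remainder loop. The assignments below copy the
        // epilogue VF/UF into the main-loop fields of EPI before the second
        // executePlan invocation.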
10638 EPI.MainLoopVF = EPI.EpilogueVF; 10639 EPI.MainLoopUF = EPI.EpilogueUF; 10640 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10641 ORE, EPI, &LVL, &CM, BFI, PSI, 10642 Checks); 10643 10644 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10645 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10646 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10647 Header->setName("vec.epilog.vector.body"); 10648 10649 // Ensure that the start values for any VPReductionPHIRecipes are 10650 // updated before vectorising the epilogue loop. 10651 for (VPRecipeBase &R : Header->phis()) { 10652 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10653 if (auto *Resume = MainILV.getReductionResumeValue( 10654 ReductionPhi->getRecurrenceDescriptor())) { 10655 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); 10656 ReductionPhi->setOperand(0, StartVal); 10657 } 10658 } 10659 } 10660 10661 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10662 DT); 10663 ++LoopsEpilogueVectorized; 10664 10665 if (!MainILV.areSafetyChecksAdded()) 10666 DisableRuntimeUnroll = true; 10667 } else { 10668 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10669 &LVL, &CM, BFI, PSI, Checks); 10670 10671 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10672 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10673 ++LoopsVectorized; 10674 10675 // Add metadata to disable runtime unrolling a scalar loop when there 10676 // are no runtime checks about strides and memory. A scalar loop that is 10677 // rarely used is not worth unrolling. 10678 if (!LB.areSafetyChecksAdded()) 10679 DisableRuntimeUnroll = true; 10680 } 10681 // Report the vectorization decision. 10682 ORE->emit([&]() { 10683 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10684 L->getHeader()) 10685 << "vectorized loop (vectorization width: " 10686 << NV("VectorizationFactor", VF.Width) 10687 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10688 }); 10689 } 10690 10691 if (ORE->allowExtraAnalysis(LV_NAME)) 10692 checkMixedPrecision(L, ORE); 10693 } 10694 10695 Optional<MDNode *> RemainderLoopID = 10696 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10697 LLVMLoopVectorizeFollowupEpilogue}); 10698 if (RemainderLoopID.hasValue()) { 10699 L->setLoopID(RemainderLoopID.getValue()); 10700 } else { 10701 if (DisableRuntimeUnroll) 10702 AddRuntimeUnrollDisableMetaData(L); 10703 10704 // Mark the loop as already vectorized to avoid vectorizing again. 10705 Hints.setAlreadyVectorized(); 10706 } 10707 10708 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10709 return true; 10710 } 10711 10712 LoopVectorizeResult LoopVectorizePass::runImpl( 10713 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10714 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10715 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10716 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10717 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10718 SE = &SE_; 10719 LI = &LI_; 10720 TTI = &TTI_; 10721 DT = &DT_; 10722 BFI = &BFI_; 10723 TLI = TLI_; 10724 AA = &AA_; 10725 AC = &AC_; 10726 GetLAA = &GetLAA_; 10727 DB = &DB_; 10728 ORE = &ORE_; 10729 PSI = PSI_; 10730 10731 // Don't attempt if 10732 // 1. the target claims to have no vector registers, and 10733 // 2. interleaving won't help ILP. 
10734 //
10735 // The second condition is necessary because, even if the target has no
10736 // vector registers, loop vectorization may still enable scalar
10737 // interleaving.
10738 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10739 TTI->getMaxInterleaveFactor(1) < 2)
10740 return LoopVectorizeResult(false, false);
10741
10742 bool Changed = false, CFGChanged = false;
10743
10744 // The vectorizer requires loops to be in simplified form.
10745 // Since simplification may add new inner loops, it has to run before the
10746 // legality and profitability checks. This means running the loop vectorizer
10747 // will simplify all loops, regardless of whether anything ends up being
10748 // vectorized.
10749 for (auto &L : *LI)
10750 Changed |= CFGChanged |=
10751 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10752
10753 // Build up a worklist of inner-loops to vectorize. This is necessary as
10754 // the act of vectorizing or partially unrolling a loop creates new loops
10755 // and can invalidate iterators across the loops.
10756 SmallVector<Loop *, 8> Worklist;
10757
10758 for (Loop *L : *LI)
10759 collectSupportedLoops(*L, LI, ORE, Worklist);
10760
10761 LoopsAnalyzed += Worklist.size();
10762
10763 // Now walk the identified inner loops.
10764 while (!Worklist.empty()) {
10765 Loop *L = Worklist.pop_back_val();
10766
10767 // For the inner loops we actually process, form LCSSA to simplify the
10768 // transform.
10769 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10770
10771 Changed |= CFGChanged |= processLoop(L);
10772 }
10773
10774 // Process each loop nest in the function.
10775 return LoopVectorizeResult(Changed, CFGChanged);
10776 }
10777
10778 PreservedAnalyses LoopVectorizePass::run(Function &F,
10779 FunctionAnalysisManager &AM) {
10780 auto &LI = AM.getResult<LoopAnalysis>(F);
10781 // There are no loops in the function. Return before computing other expensive
10782 // analyses.
10783 if (LI.empty())
10784 return PreservedAnalyses::all();
10785 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10786 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10787 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10788 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10789 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10790 auto &AA = AM.getResult<AAManager>(F);
10791 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10792 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10793 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10794
10795 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10796 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10797 [&](Loop &L) -> const LoopAccessInfo & {
10798 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10799 TLI, TTI, nullptr, nullptr, nullptr};
10800 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10801 };
10802 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10803 ProfileSummaryInfo *PSI =
10804 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10805 LoopVectorizeResult Result =
10806 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10807 if (!Result.MadeAnyChange)
10808 return PreservedAnalyses::all();
10809 PreservedAnalyses PA;
10810
10811 // We currently do not preserve loopinfo/dominator analyses with outer loop
10812 // vectorization. Until this is addressed, mark these analyses as preserved
10813 // only for non-VPlan-native path.
10814 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10815 if (!EnableVPlanNativePath) {
10816 PA.preserve<LoopAnalysis>();
10817 PA.preserve<DominatorTreeAnalysis>();
10818 }
10819
10820 if (Result.MadeCFGChange) {
10821 // Making CFG changes likely means a loop got vectorized. Indicate that
10822 // extra simplification passes should be run.
10823 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10824 // be run if runtime checks have been added.
10825 AM.getResult<ShouldRunExtraVectorPasses>(F);
10826 PA.preserve<ShouldRunExtraVectorPasses>();
10827 } else {
10828 PA.preserveSet<CFGAnalyses>();
10829 }
10830 return PA;
10831 }
10832
10833 void LoopVectorizePass::printPipeline(
10834 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10835 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10836 OS, MapClassName2PassName);
10837
10838 OS << "<";
10839 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10840 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10841 OS << ">";
10842 }
10843
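// Illustrative only: assuming the pass is registered as "loop-vectorize", the
// printPipeline implementation above produces text along the lines of
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>
// with each flag dropping its "no-" prefix when the corresponding option is
// enabled.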