//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one (see the example at the end of
// this header comment).
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
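//
// As a simple illustration of the transformation (this example is not taken
// from the papers above), a scalar loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is, for a vectorization factor of 4, rewritten into a wide loop that adds
// four elements per iteration and increments 'i' by 4, followed by a scalar
// epilogue that handles the remaining 'n % 4' iterations.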
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));
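
// For example (an illustrative invocation, not part of the original sources),
// running
//   opt -passes=loop-vectorize -force-target-instruction-cost=1 -S in.ll
// pins every instruction cost to 1 so that cost-model tests do not depend on
// the costs reported by the actual target.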

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between
  /// \p MinLane and \p MaxLane, times each part between \p MinPart and
  /// \p MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe
  /// instead of \p Instr's operands.
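  /// For example, with unroll factor UF and vectorization factor VF, a fully
  /// replicated instruction expands into up to UF * VF scalar instances, one
  /// per part and lane.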
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize the interleaved access group \p Group with the base
  /// address given in \p Addr, optionally masking the vector operations if
  /// \p BlockInMask is non-null. Use \p State to translate given VPValues to
  /// IR values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Set the debug location in the class member's Builder using the debug
  /// location in \p V.
  void setDebugLocFromInst(const Value *V);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(BasicBlock *InsertBlock);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after vectorization
  ///    the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
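  /// The two strategies are implemented by EpilogueVectorizerMainLoop and
  /// EpilogueVectorizerEpilogueLoop below.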
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(const Value *V) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        Builder.SetCurrentDebugLocation(*NewDIL);
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      Builder.SetCurrentDebugLocation(DIL);
  } else
    Builder.SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

/// Return the runtime value for VF.
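/// For a fixed-width VF this is a plain integer constant; for a scalable VF it
/// is the known minimum value multiplied by the target's vscale.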
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
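/// Fixed-width counts are ordered before scalable ones, and within each kind
/// the counts are ordered by their known minimum value.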
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for the user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on the cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
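  /// The decision and its cost must have been computed beforehand by
  /// setCostBasedWideningDecision; the assertion below enforces this.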
1370 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1371 assert(VF.isVector() && "Expected VF >=2"); 1372 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1373 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1374 "The cost is not calculated"); 1375 return WideningDecisions[InstOnVF].second; 1376 } 1377 1378 /// Return True if instruction \p I is an optimizable truncate whose operand 1379 /// is an induction variable. Such a truncate will be removed by adding a new 1380 /// induction variable with the destination type. 1381 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1382 // If the instruction is not a truncate, return false. 1383 auto *Trunc = dyn_cast<TruncInst>(I); 1384 if (!Trunc) 1385 return false; 1386 1387 // Get the source and destination types of the truncate. 1388 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1389 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1390 1391 // If the truncate is free for the given types, return false. Replacing a 1392 // free truncate with an induction variable would add an induction variable 1393 // update instruction to each iteration of the loop. We exclude from this 1394 // check the primary induction variable since it will need an update 1395 // instruction regardless. 1396 Value *Op = Trunc->getOperand(0); 1397 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1398 return false; 1399 1400 // If the truncated value is not an induction variable, return false. 1401 return Legal->isInductionPhi(Op); 1402 } 1403 1404 /// Collects the instructions to scalarize for each predicated instruction in 1405 /// the loop. 1406 void collectInstsToScalarize(ElementCount VF); 1407 1408 /// Collect Uniform and Scalar values for the given \p VF. 1409 /// The sets depend on CM decision for Load/Store instructions 1410 /// that may be vectorized as interleave, gather-scatter or scalarized. 1411 void collectUniformsAndScalars(ElementCount VF) { 1412 // Do the analysis once. 1413 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1414 return; 1415 setCostBasedWideningDecision(VF); 1416 collectLoopUniforms(VF); 1417 collectLoopScalars(VF); 1418 } 1419 1420 /// Returns true if the target machine supports masked store operation 1421 /// for the given \p DataType and kind of access to \p Ptr. 1422 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1423 return Legal->isConsecutivePtr(DataType, Ptr) && 1424 TTI.isLegalMaskedStore(DataType, Alignment); 1425 } 1426 1427 /// Returns true if the target machine supports masked load operation 1428 /// for the given \p DataType and kind of access to \p Ptr. 1429 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1430 return Legal->isConsecutivePtr(DataType, Ptr) && 1431 TTI.isLegalMaskedLoad(DataType, Alignment); 1432 } 1433 1434 /// Returns true if the target machine can represent \p V as a masked gather 1435 /// or scatter operation. 
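/// For example (illustrative sketch only), an indexed load such as
///   x = A[B[i]];
/// has a non-consecutive pointer and can typically only be widened as a
/// masked gather, e.g.
///   %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs,
///                                                         i32 4, <4 x i1> %m,
///                                                         <4 x i32> poison)
/// provided TTI reports gathers as legal for the element type and alignment.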
1436 bool isLegalGatherOrScatter(Value *V, 1437 ElementCount VF = ElementCount::getFixed(1)) { 1438 bool LI = isa<LoadInst>(V); 1439 bool SI = isa<StoreInst>(V); 1440 if (!LI && !SI) 1441 return false; 1442 auto *Ty = getLoadStoreType(V); 1443 Align Align = getLoadStoreAlignment(V); 1444 if (VF.isVector()) 1445 Ty = VectorType::get(Ty, VF); 1446 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1447 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1448 } 1449 1450 /// Returns true if the target machine supports all of the reduction 1451 /// variables found for the given VF. 1452 bool canVectorizeReductions(ElementCount VF) const { 1453 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1454 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1455 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1456 })); 1457 } 1458 1459 /// Returns true if \p I is an instruction that will be scalarized with 1460 /// predication when vectorizing \p I with vectorization factor \p VF. Such 1461 /// instructions include conditional stores and instructions that may divide 1462 /// by zero. 1463 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1464 1465 // Returns true if \p I is an instruction that will be predicated either 1466 // through scalar predication or masked load/store or masked gather/scatter. 1467 // \p VF is the vectorization factor that will be used to vectorize \p I. 1468 // Superset of instructions that return true for isScalarWithPredication. 1469 bool isPredicatedInst(Instruction *I, ElementCount VF, 1470 bool IsKnownUniform = false) { 1471 // When we know the load is uniform and the original scalar loop was not 1472 // predicated we don't need to mark it as a predicated instruction. Any 1473 // vectorised blocks created when tail-folding are something artificial we 1474 // have introduced and we know there is always at least one active lane. 1475 // That's why we call Legal->blockNeedsPredication here because it doesn't 1476 // query tail-folding. 1477 if (IsKnownUniform && isa<LoadInst>(I) && 1478 !Legal->blockNeedsPredication(I->getParent())) 1479 return false; 1480 if (!blockNeedsPredicationForAnyReason(I->getParent())) 1481 return false; 1482 // Loads and stores that need some form of masked operation are predicated 1483 // instructions. 1484 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1485 return Legal->isMaskRequired(I); 1486 return isScalarWithPredication(I, VF); 1487 } 1488 1489 /// Returns true if \p I is a memory instruction with consecutive memory 1490 /// access that can be widened. 1491 bool 1492 memoryInstructionCanBeWidened(Instruction *I, 1493 ElementCount VF = ElementCount::getFixed(1)); 1494 1495 /// Returns true if \p I is a memory instruction in an interleaved-group 1496 /// of memory accesses that can be vectorized with wide vector loads/stores 1497 /// and shuffles. 1498 bool 1499 interleavedAccessCanBeWidened(Instruction *I, 1500 ElementCount VF = ElementCount::getFixed(1)); 1501 1502 /// Check if \p Instr belongs to any interleaved access group. 1503 bool isAccessInterleaved(Instruction *Instr) { 1504 return InterleaveInfo.isInterleaved(Instr); 1505 } 1506 1507 /// Get the interleaved access group that \p Instr belongs to. 1508 const InterleaveGroup<Instruction> * 1509 getInterleavedAccessGroup(Instruction *Instr) { 1510 return InterleaveInfo.getInterleaveGroup(Instr); 1511 } 1512 1513 /// Returns true if we're required to use a scalar epilogue for at least 1514 /// the final iteration of the original loop. 
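/// This is the case, for example, when the loop may exit from a block other
/// than the latch, or (illustrative sketch) when an interleave group leaves a
/// gap at the end of the accessed region, e.g.
///   for (i = 0; i < n; i++)
///     sum += A[2 * i];
/// where the widened group load would also read A[2*i + 1] and could run past
/// the end of A on the last vector iteration, so the final iteration(s) must
/// stay in a scalar epilogue.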
1515 bool requiresScalarEpilogue(ElementCount VF) const {
1516 if (!isScalarEpilogueAllowed())
1517 return false;
1518 // If we might exit from anywhere but the latch, we must run the exiting
1519 // iteration in scalar form.
1520 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1521 return true;
1522 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1523 }
1524 
1525 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1526 /// disabled due to optsize or a loop hint annotation.
1527 bool isScalarEpilogueAllowed() const {
1528 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1529 }
1530 
1531 /// Returns true if all loop blocks should be masked to fold the loop tail.
1532 bool foldTailByMasking() const { return FoldTailByMasking; }
1533 
1534 /// Returns true if the instructions in this block require predication
1535 /// for any reason, e.g. because tail folding now requires a predicate
1536 /// or because the block in the original loop was predicated.
1537 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1538 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1539 }
1540 
1541 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1542 /// nodes to the chain of instructions representing the reductions. Uses a
1543 /// MapVector to ensure deterministic iteration order.
1544 using ReductionChainMap =
1545 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1546 
1547 /// Return the chains of instructions representing the in-loop reductions.
1548 const ReductionChainMap &getInLoopReductionChains() const {
1549 return InLoopReductionChains;
1550 }
1551 
1552 /// Returns true if the Phi is part of an inloop reduction.
1553 bool isInLoopReduction(PHINode *Phi) const {
1554 return InLoopReductionChains.count(Phi);
1555 }
1556 
1557 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1558 /// with factor VF. Return the cost of the instruction, including
1559 /// scalarization overhead if it's needed.
1560 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1561 
1562 /// Estimate cost of a call instruction CI if it were vectorized with factor
1563 /// VF. Return the cost of the instruction, including scalarization overhead
1564 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1565 /// scalarized -
1566 /// i.e. either a vector version isn't available, or it is too expensive.
1567 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1568 bool &NeedToScalarize) const;
1569 
1570 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1571 /// that of B.
1572 bool isMoreProfitable(const VectorizationFactor &A,
1573 const VectorizationFactor &B) const;
1574 
1575 /// Invalidates decisions already taken by the cost model.
1576 void invalidateCostModelingDecisions() {
1577 WideningDecisions.clear();
1578 Uniforms.clear();
1579 Scalars.clear();
1580 }
1581 
1582 private:
1583 unsigned NumPredStores = 0;
1584 
1585 /// Convenience function that returns the value of vscale_range if
1586 /// vscale_range.min == vscale_range.max, and otherwise the value
1587 /// returned by the corresponding TTI method.
1588 Optional<unsigned> getVScaleForTuning() const;
1589 
1590 /// \return An upper bound for the vectorization factors for both
1591 /// fixed and scalable vectorization, where the minimum-known number of
1592 /// elements is a power-of-2 larger than zero.
If scalable vectorization is 1593 /// disabled or unsupported, then the scalable part will be equal to 1594 /// ElementCount::getScalable(0). 1595 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, 1596 ElementCount UserVF, 1597 bool FoldTailByMasking); 1598 1599 /// \return the maximized element count based on the targets vector 1600 /// registers and the loop trip-count, but limited to a maximum safe VF. 1601 /// This is a helper function of computeFeasibleMaxVF. 1602 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1603 unsigned SmallestType, 1604 unsigned WidestType, 1605 ElementCount MaxSafeVF, 1606 bool FoldTailByMasking); 1607 1608 /// \return the maximum legal scalable VF, based on the safe max number 1609 /// of elements. 1610 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1611 1612 /// The vectorization cost is a combination of the cost itself and a boolean 1613 /// indicating whether any of the contributing operations will actually 1614 /// operate on vector values after type legalization in the backend. If this 1615 /// latter value is false, then all operations will be scalarized (i.e. no 1616 /// vectorization has actually taken place). 1617 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1618 1619 /// Returns the expected execution cost. The unit of the cost does 1620 /// not matter because we use the 'cost' units to compare different 1621 /// vector widths. The cost that is returned is *not* normalized by 1622 /// the factor width. If \p Invalid is not nullptr, this function 1623 /// will add a pair(Instruction*, ElementCount) to \p Invalid for 1624 /// each instruction that has an Invalid cost for the given VF. 1625 using InstructionVFPair = std::pair<Instruction *, ElementCount>; 1626 VectorizationCostTy 1627 expectedCost(ElementCount VF, 1628 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); 1629 1630 /// Returns the execution time cost of an instruction for a given vector 1631 /// width. Vector width of one means scalar. 1632 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1633 1634 /// The cost-computation logic from getInstructionCost which provides 1635 /// the vector type as an output parameter. 1636 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1637 Type *&VectorTy); 1638 1639 /// Return the cost of instructions in an inloop reduction pattern, if I is 1640 /// part of that pattern. 1641 Optional<InstructionCost> 1642 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, 1643 TTI::TargetCostKind CostKind); 1644 1645 /// Calculate vectorization cost of memory instruction \p I. 1646 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1647 1648 /// The cost computation for scalarized memory instruction. 1649 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1650 1651 /// The cost computation for interleaving group of memory instructions. 1652 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1653 1654 /// The cost computation for Gather/Scatter instruction. 1655 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1656 1657 /// The cost computation for widening instruction \p I with consecutive 1658 /// memory access. 1659 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1660 1661 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1662 /// Load: scalar load + broadcast. 
1663 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1664 /// element)
1665 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1666 
1667 /// Estimate the overhead of scalarizing an instruction. This is a
1668 /// convenience wrapper for the type-based getScalarizationOverhead API.
1669 InstructionCost getScalarizationOverhead(Instruction *I,
1670 ElementCount VF) const;
1671 
1672 /// Returns whether the instruction is a load or store and will be emitted
1673 /// as a vector operation.
1674 bool isConsecutiveLoadOrStore(Instruction *I);
1675 
1676 /// Returns true if an artificially high cost for emulated masked memrefs
1677 /// should be used.
1678 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1679 
1680 /// Map of scalar integer values to the smallest bitwidth they can be legally
1681 /// represented as. The vector equivalents of these values should be truncated
1682 /// to this type.
1683 MapVector<Instruction *, uint64_t> MinBWs;
1684 
1685 /// A type representing the costs for instructions if they were to be
1686 /// scalarized rather than vectorized. The entries are Instruction-Cost
1687 /// pairs.
1688 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1689 
1690 /// A set containing all BasicBlocks that are known to be present after
1691 /// vectorization as predicated blocks.
1692 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1693 
1694 /// Records whether it is allowed to have the original scalar loop execute at
1695 /// least once. This may be needed as a fallback loop in case runtime
1696 /// aliasing/dependence checks fail, or to handle the tail/remainder
1697 /// iterations when the trip count is unknown or is not divisible by the VF,
1698 /// or as a peel-loop to handle gaps in interleave-groups.
1699 /// Under optsize and when the trip count is very small we don't allow any
1700 /// iterations to execute in the scalar loop.
1701 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1702 
1703 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1704 bool FoldTailByMasking = false;
1705 
1706 /// A map holding scalar costs for different vectorization factors. The
1707 /// presence of a cost for an instruction in the mapping indicates that the
1708 /// instruction will be scalarized when vectorizing with the associated
1709 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1710 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1711 
1712 /// Holds the instructions known to be uniform after vectorization.
1713 /// The data is collected per VF.
1714 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1715 
1716 /// Holds the instructions known to be scalar after vectorization.
1717 /// The data is collected per VF.
1718 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1719 
1720 /// Holds the instructions (address computations) that are forced to be
1721 /// scalarized.
1722 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1723 
1724 /// PHINodes of the reductions that should be expanded in-loop along with
1725 /// their associated chains of reduction operations, in program order from top
1726 /// (PHI) to bottom.
1727 ReductionChainMap InLoopReductionChains;
1728 
1729 /// A Map of inloop reduction operations and their immediate chain operand.
1730 /// FIXME: This can be removed once reductions can be costed correctly in
1731 /// vplan.
This was added to allow quick lookup to the inloop operations, 1732 /// without having to loop through InLoopReductionChains. 1733 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1734 1735 /// Returns the expected difference in cost from scalarizing the expression 1736 /// feeding a predicated instruction \p PredInst. The instructions to 1737 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1738 /// non-negative return value implies the expression will be scalarized. 1739 /// Currently, only single-use chains are considered for scalarization. 1740 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1741 ElementCount VF); 1742 1743 /// Collect the instructions that are uniform after vectorization. An 1744 /// instruction is uniform if we represent it with a single scalar value in 1745 /// the vectorized loop corresponding to each vector iteration. Examples of 1746 /// uniform instructions include pointer operands of consecutive or 1747 /// interleaved memory accesses. Note that although uniformity implies an 1748 /// instruction will be scalar, the reverse is not true. In general, a 1749 /// scalarized instruction will be represented by VF scalar values in the 1750 /// vectorized loop, each corresponding to an iteration of the original 1751 /// scalar loop. 1752 void collectLoopUniforms(ElementCount VF); 1753 1754 /// Collect the instructions that are scalar after vectorization. An 1755 /// instruction is scalar if it is known to be uniform or will be scalarized 1756 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1757 /// to the list if they are used by a load/store instruction that is marked as 1758 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1759 /// VF values in the vectorized loop, each corresponding to an iteration of 1760 /// the original scalar loop. 1761 void collectLoopScalars(ElementCount VF); 1762 1763 /// Keeps cost model vectorization decision and cost for instructions. 1764 /// Right now it is used for memory instructions only. 1765 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1766 std::pair<InstWidening, InstructionCost>>; 1767 1768 DecisionList WideningDecisions; 1769 1770 /// Returns true if \p V is expected to be vectorized and it needs to be 1771 /// extracted. 1772 bool needsExtract(Value *V, ElementCount VF) const { 1773 Instruction *I = dyn_cast<Instruction>(V); 1774 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1775 TheLoop->isLoopInvariant(I)) 1776 return false; 1777 1778 // Assume we can vectorize V (and hence we need extraction) if the 1779 // scalars are not computed yet. This can happen, because it is called 1780 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1781 // the scalars are collected. That should be a safe assumption in most 1782 // cases, because we check if the operands have vectorizable types 1783 // beforehand in LoopVectorizationLegality. 1784 return Scalars.find(VF) == Scalars.end() || 1785 !isScalarAfterVectorization(I, VF); 1786 }; 1787 1788 /// Returns a range containing only operands needing to be extracted. 
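/// That is, operands that are defined inside the loop and expected to be
/// vectorized (see needsExtract above), so a scalarized user must read them
/// via an extractelement of the corresponding vector value.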
1789 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1790 ElementCount VF) const { 1791 return SmallVector<Value *, 4>(make_filter_range( 1792 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1793 } 1794 1795 /// Determines if we have the infrastructure to vectorize loop \p L and its 1796 /// epilogue, assuming the main loop is vectorized by \p VF. 1797 bool isCandidateForEpilogueVectorization(const Loop &L, 1798 const ElementCount VF) const; 1799 1800 /// Returns true if epilogue vectorization is considered profitable, and 1801 /// false otherwise. 1802 /// \p VF is the vectorization factor chosen for the original loop. 1803 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1804 1805 public: 1806 /// The loop that we evaluate. 1807 Loop *TheLoop; 1808 1809 /// Predicated scalar evolution analysis. 1810 PredicatedScalarEvolution &PSE; 1811 1812 /// Loop Info analysis. 1813 LoopInfo *LI; 1814 1815 /// Vectorization legality. 1816 LoopVectorizationLegality *Legal; 1817 1818 /// Vector target information. 1819 const TargetTransformInfo &TTI; 1820 1821 /// Target Library Info. 1822 const TargetLibraryInfo *TLI; 1823 1824 /// Demanded bits analysis. 1825 DemandedBits *DB; 1826 1827 /// Assumption cache. 1828 AssumptionCache *AC; 1829 1830 /// Interface to emit optimization remarks. 1831 OptimizationRemarkEmitter *ORE; 1832 1833 const Function *TheFunction; 1834 1835 /// Loop Vectorize Hint. 1836 const LoopVectorizeHints *Hints; 1837 1838 /// The interleave access information contains groups of interleaved accesses 1839 /// with the same stride and close to each other. 1840 InterleavedAccessInfo &InterleaveInfo; 1841 1842 /// Values to ignore in the cost model. 1843 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1844 1845 /// Values to ignore in the cost model when VF > 1. 1846 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1847 1848 /// All element types found in the loop. 1849 SmallPtrSet<Type *, 16> ElementTypesInLoop; 1850 1851 /// Profitable vector factors. 1852 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1853 }; 1854 } // end namespace llvm 1855 1856 /// Helper struct to manage generating runtime checks for vectorization. 1857 /// 1858 /// The runtime checks are created up-front in temporary blocks to allow better 1859 /// estimating the cost and un-linked from the existing IR. After deciding to 1860 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1861 /// temporary blocks are completely removed. 1862 class GeneratedRTChecks { 1863 /// Basic block which contains the generated SCEV checks, if any. 1864 BasicBlock *SCEVCheckBlock = nullptr; 1865 1866 /// The value representing the result of the generated SCEV checks. If it is 1867 /// nullptr, either no SCEV checks have been generated or they have been used. 1868 Value *SCEVCheckCond = nullptr; 1869 1870 /// Basic block which contains the generated memory runtime checks, if any. 1871 BasicBlock *MemCheckBlock = nullptr; 1872 1873 /// The value representing the result of the generated memory runtime checks. 1874 /// If it is nullptr, either no memory runtime checks have been generated or 1875 /// they have been used. 
1876 Value *MemRuntimeCheckCond = nullptr; 1877 1878 DominatorTree *DT; 1879 LoopInfo *LI; 1880 1881 SCEVExpander SCEVExp; 1882 SCEVExpander MemCheckExp; 1883 1884 public: 1885 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1886 const DataLayout &DL) 1887 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1888 MemCheckExp(SE, DL, "scev.check") {} 1889 1890 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1891 /// accurately estimate the cost of the runtime checks. The blocks are 1892 /// un-linked from the IR and is added back during vector code generation. If 1893 /// there is no vector code generation, the check blocks are removed 1894 /// completely. 1895 void Create(Loop *L, const LoopAccessInfo &LAI, 1896 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1897 1898 BasicBlock *LoopHeader = L->getHeader(); 1899 BasicBlock *Preheader = L->getLoopPreheader(); 1900 1901 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1902 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1903 // may be used by SCEVExpander. The blocks will be un-linked from their 1904 // predecessors and removed from LI & DT at the end of the function. 1905 if (!UnionPred.isAlwaysTrue()) { 1906 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1907 nullptr, "vector.scevcheck"); 1908 1909 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1910 &UnionPred, SCEVCheckBlock->getTerminator()); 1911 } 1912 1913 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1914 if (RtPtrChecking.Need) { 1915 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1916 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1917 "vector.memcheck"); 1918 1919 auto DiffChecks = RtPtrChecking.getDiffChecks(); 1920 if (DiffChecks) { 1921 MemRuntimeCheckCond = addDiffRuntimeChecks( 1922 MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp, 1923 [VF](IRBuilderBase &B, unsigned Bits) { 1924 return getRuntimeVF(B, B.getIntNTy(Bits), VF); 1925 }, 1926 IC); 1927 } else { 1928 MemRuntimeCheckCond = 1929 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1930 RtPtrChecking.getChecks(), MemCheckExp); 1931 } 1932 assert(MemRuntimeCheckCond && 1933 "no RT checks generated although RtPtrChecking " 1934 "claimed checks are required"); 1935 } 1936 1937 if (!MemCheckBlock && !SCEVCheckBlock) 1938 return; 1939 1940 // Unhook the temporary block with the checks, update various places 1941 // accordingly. 1942 if (SCEVCheckBlock) 1943 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1944 if (MemCheckBlock) 1945 MemCheckBlock->replaceAllUsesWith(Preheader); 1946 1947 if (SCEVCheckBlock) { 1948 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1949 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1950 Preheader->getTerminator()->eraseFromParent(); 1951 } 1952 if (MemCheckBlock) { 1953 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1954 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1955 Preheader->getTerminator()->eraseFromParent(); 1956 } 1957 1958 DT->changeImmediateDominator(LoopHeader, Preheader); 1959 if (MemCheckBlock) { 1960 DT->eraseNode(MemCheckBlock); 1961 LI->removeBlock(MemCheckBlock); 1962 } 1963 if (SCEVCheckBlock) { 1964 DT->eraseNode(SCEVCheckBlock); 1965 LI->removeBlock(SCEVCheckBlock); 1966 } 1967 } 1968 1969 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1970 /// unused. 
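/// A check counts as unused when its condition value was never consumed by
/// emitSCEVChecks/emitMemRuntimeChecks (those helpers mark a check as used by
/// clearing SCEVCheckCond/MemRuntimeCheckCond), e.g. when the decision was
/// made not to vectorize after the checks had been created for costing.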
1971 ~GeneratedRTChecks() { 1972 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 1973 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 1974 if (!SCEVCheckCond) 1975 SCEVCleaner.markResultUsed(); 1976 1977 if (!MemRuntimeCheckCond) 1978 MemCheckCleaner.markResultUsed(); 1979 1980 if (MemRuntimeCheckCond) { 1981 auto &SE = *MemCheckExp.getSE(); 1982 // Memory runtime check generation creates compares that use expanded 1983 // values. Remove them before running the SCEVExpanderCleaners. 1984 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1985 if (MemCheckExp.isInsertedInstruction(&I)) 1986 continue; 1987 SE.forgetValue(&I); 1988 I.eraseFromParent(); 1989 } 1990 } 1991 MemCheckCleaner.cleanup(); 1992 SCEVCleaner.cleanup(); 1993 1994 if (SCEVCheckCond) 1995 SCEVCheckBlock->eraseFromParent(); 1996 if (MemRuntimeCheckCond) 1997 MemCheckBlock->eraseFromParent(); 1998 } 1999 2000 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2001 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2002 /// depending on the generated condition. 2003 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2004 BasicBlock *LoopVectorPreHeader, 2005 BasicBlock *LoopExitBlock) { 2006 if (!SCEVCheckCond) 2007 return nullptr; 2008 2009 Value *Cond = SCEVCheckCond; 2010 // Mark the check as used, to prevent it from being removed during cleanup. 2011 SCEVCheckCond = nullptr; 2012 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2013 if (C->isZero()) 2014 return nullptr; 2015 2016 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2017 2018 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2019 // Create new preheader for vector loop. 2020 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2021 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2022 2023 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2024 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2025 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2026 SCEVCheckBlock); 2027 2028 DT->addNewBlock(SCEVCheckBlock, Pred); 2029 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2030 2031 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), 2032 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond)); 2033 return SCEVCheckBlock; 2034 } 2035 2036 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2037 /// the branches to branch to the vector preheader or \p Bypass, depending on 2038 /// the generated condition. 2039 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2040 BasicBlock *LoopVectorPreHeader) { 2041 // Check if we generated code that checks in runtime if arrays overlap. 2042 if (!MemRuntimeCheckCond) 2043 return nullptr; 2044 2045 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2046 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2047 MemCheckBlock); 2048 2049 DT->addNewBlock(MemCheckBlock, Pred); 2050 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2051 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2052 2053 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2054 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2055 2056 ReplaceInstWithInst( 2057 MemCheckBlock->getTerminator(), 2058 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2059 MemCheckBlock->getTerminator()->setDebugLoc( 2060 Pred->getTerminator()->getDebugLoc()); 2061 2062 // Mark the check as used, to prevent it from being removed during cleanup. 
2063 MemRuntimeCheckCond = nullptr; 2064 return MemCheckBlock; 2065 } 2066 }; 2067 2068 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2069 // vectorization. The loop needs to be annotated with #pragma omp simd 2070 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2071 // vector length information is not provided, vectorization is not considered 2072 // explicit. Interleave hints are not allowed either. These limitations will be 2073 // relaxed in the future. 2074 // Please, note that we are currently forced to abuse the pragma 'clang 2075 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2076 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2077 // provides *explicit vectorization hints* (LV can bypass legal checks and 2078 // assume that vectorization is legal). However, both hints are implemented 2079 // using the same metadata (llvm.loop.vectorize, processed by 2080 // LoopVectorizeHints). This will be fixed in the future when the native IR 2081 // representation for pragma 'omp simd' is introduced. 2082 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2083 OptimizationRemarkEmitter *ORE) { 2084 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2085 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2086 2087 // Only outer loops with an explicit vectorization hint are supported. 2088 // Unannotated outer loops are ignored. 2089 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2090 return false; 2091 2092 Function *Fn = OuterLp->getHeader()->getParent(); 2093 if (!Hints.allowVectorization(Fn, OuterLp, 2094 true /*VectorizeOnlyWhenForced*/)) { 2095 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2096 return false; 2097 } 2098 2099 if (Hints.getInterleave() > 1) { 2100 // TODO: Interleave support is future work. 2101 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2102 "outer loops.\n"); 2103 Hints.emitRemarkWithHints(); 2104 return false; 2105 } 2106 2107 return true; 2108 } 2109 2110 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2111 OptimizationRemarkEmitter *ORE, 2112 SmallVectorImpl<Loop *> &V) { 2113 // Collect inner loops and outer loops without irreducible control flow. For 2114 // now, only collect outer loops that have explicit vectorization hints. If we 2115 // are stress testing the VPlan H-CFG construction, we collect the outermost 2116 // loop of every loop nest. 2117 if (L.isInnermost() || VPlanBuildStressTest || 2118 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2119 LoopBlocksRPO RPOT(&L); 2120 RPOT.perform(LI); 2121 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2122 V.push_back(&L); 2123 // TODO: Collect inner loops inside marked outer loops in case 2124 // vectorization fails for the outer loop. Do not invoke 2125 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2126 // already known to be reducible. We can use an inherited attribute for 2127 // that. 2128 return; 2129 } 2130 } 2131 for (Loop *InnerL : L) 2132 collectSupportedLoops(*InnerL, LI, ORE, V); 2133 } 2134 2135 namespace { 2136 2137 /// The LoopVectorize Pass. 
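/// Legacy pass-manager wrapper: it gathers the required analyses and forwards
/// to the new-pass-manager implementation held in the \c Impl member below.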
2138 struct LoopVectorize : public FunctionPass { 2139 /// Pass identification, replacement for typeid 2140 static char ID; 2141 2142 LoopVectorizePass Impl; 2143 2144 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2145 bool VectorizeOnlyWhenForced = false) 2146 : FunctionPass(ID), 2147 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2148 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2149 } 2150 2151 bool runOnFunction(Function &F) override { 2152 if (skipFunction(F)) 2153 return false; 2154 2155 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2156 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2157 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2158 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2159 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2160 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2161 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2162 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2163 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2164 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2165 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2166 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2167 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2168 2169 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2170 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2171 2172 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2173 GetLAA, *ORE, PSI).MadeAnyChange; 2174 } 2175 2176 void getAnalysisUsage(AnalysisUsage &AU) const override { 2177 AU.addRequired<AssumptionCacheTracker>(); 2178 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2179 AU.addRequired<DominatorTreeWrapperPass>(); 2180 AU.addRequired<LoopInfoWrapperPass>(); 2181 AU.addRequired<ScalarEvolutionWrapperPass>(); 2182 AU.addRequired<TargetTransformInfoWrapperPass>(); 2183 AU.addRequired<AAResultsWrapperPass>(); 2184 AU.addRequired<LoopAccessLegacyAnalysis>(); 2185 AU.addRequired<DemandedBitsWrapperPass>(); 2186 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2187 AU.addRequired<InjectTLIMappingsLegacy>(); 2188 2189 // We currently do not preserve loopinfo/dominator analyses with outer loop 2190 // vectorization. Until this is addressed, mark these analyses as preserved 2191 // only for non-VPlan-native path. 2192 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2193 if (!EnableVPlanNativePath) { 2194 AU.addPreserved<LoopInfoWrapperPass>(); 2195 AU.addPreserved<DominatorTreeWrapperPass>(); 2196 } 2197 2198 AU.addPreserved<BasicAAWrapperPass>(); 2199 AU.addPreserved<GlobalsAAWrapperPass>(); 2200 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2201 } 2202 }; 2203 2204 } // end anonymous namespace 2205 2206 //===----------------------------------------------------------------------===// 2207 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2208 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2209 //===----------------------------------------------------------------------===// 2210 2211 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2212 // We need to place the broadcast of invariant variables outside the loop, 2213 // but only if it's proven safe to do so. Else, broadcast will be inside 2214 // vector loop body. 
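// For a fixed VF the splat below typically expands to an insertelement into
// lane 0 followed by a zero-mask shufflevector, e.g. (illustrative):
//   %b.ins = insertelement <4 x i32> poison, i32 %x, i64 0
//   %b     = shufflevector <4 x i32> %b.ins, <4 x i32> poison,
//            <4 x i32> zeroinitializer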
2215 Instruction *Instr = dyn_cast<Instruction>(V); 2216 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2217 (!Instr || 2218 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2219 // Place the code for broadcasting invariant variables in the new preheader. 2220 IRBuilder<>::InsertPointGuard Guard(Builder); 2221 if (SafeToHoist) 2222 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2223 2224 // Broadcast the scalar into all locations in the vector. 2225 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2226 2227 return Shuf; 2228 } 2229 2230 /// This function adds 2231 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 2232 /// to each vector element of Val. The sequence starts at StartIndex. 2233 /// \p Opcode is relevant for FP induction variable. 2234 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step, 2235 Instruction::BinaryOps BinOp, ElementCount VF, 2236 IRBuilderBase &Builder) { 2237 assert(VF.isVector() && "only vector VFs are supported"); 2238 2239 // Create and check the types. 2240 auto *ValVTy = cast<VectorType>(Val->getType()); 2241 ElementCount VLen = ValVTy->getElementCount(); 2242 2243 Type *STy = Val->getType()->getScalarType(); 2244 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2245 "Induction Step must be an integer or FP"); 2246 assert(Step->getType() == STy && "Step has wrong type"); 2247 2248 SmallVector<Constant *, 8> Indices; 2249 2250 // Create a vector of consecutive numbers from zero to VF. 2251 VectorType *InitVecValVTy = ValVTy; 2252 if (STy->isFloatingPointTy()) { 2253 Type *InitVecValSTy = 2254 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2255 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2256 } 2257 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2258 2259 // Splat the StartIdx 2260 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx); 2261 2262 if (STy->isIntegerTy()) { 2263 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2264 Step = Builder.CreateVectorSplat(VLen, Step); 2265 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2266 // FIXME: The newly created binary instructions should contain nsw/nuw 2267 // flags, which can be found from the original scalar operations. 2268 Step = Builder.CreateMul(InitVec, Step); 2269 return Builder.CreateAdd(Val, Step, "induction"); 2270 } 2271 2272 // Floating point induction. 2273 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2274 "Binary Opcode should be specified for FP induction"); 2275 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2276 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat); 2277 2278 Step = Builder.CreateVectorSplat(VLen, Step); 2279 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2280 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2281 } 2282 2283 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 2284 /// variable on which to base the steps, \p Step is the size of the step. 2285 static void buildScalarSteps(Value *ScalarIV, Value *Step, 2286 const InductionDescriptor &ID, VPValue *Def, 2287 VPTransformState &State) { 2288 IRBuilderBase &Builder = State.Builder; 2289 // We shouldn't have to build scalar steps if we aren't vectorizing. 2290 assert(State.VF.isVector() && "VF should be greater than one"); 2291 // Get the value type and ensure it and the step have the same integer type. 
2292 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2293 assert(ScalarIVTy == Step->getType() && 2294 "Val and Step should have the same type"); 2295 2296 // We build scalar steps for both integer and floating-point induction 2297 // variables. Here, we determine the kind of arithmetic we will perform. 2298 Instruction::BinaryOps AddOp; 2299 Instruction::BinaryOps MulOp; 2300 if (ScalarIVTy->isIntegerTy()) { 2301 AddOp = Instruction::Add; 2302 MulOp = Instruction::Mul; 2303 } else { 2304 AddOp = ID.getInductionOpcode(); 2305 MulOp = Instruction::FMul; 2306 } 2307 2308 // Determine the number of scalars we need to generate for each unroll 2309 // iteration. 2310 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def); 2311 unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue(); 2312 // Compute the scalar steps and save the results in State. 2313 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2314 ScalarIVTy->getScalarSizeInBits()); 2315 Type *VecIVTy = nullptr; 2316 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2317 if (!FirstLaneOnly && State.VF.isScalable()) { 2318 VecIVTy = VectorType::get(ScalarIVTy, State.VF); 2319 UnitStepVec = 2320 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); 2321 SplatStep = Builder.CreateVectorSplat(State.VF, Step); 2322 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); 2323 } 2324 2325 for (unsigned Part = 0; Part < State.UF; ++Part) { 2326 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); 2327 2328 if (!FirstLaneOnly && State.VF.isScalable()) { 2329 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); 2330 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2331 if (ScalarIVTy->isFloatingPointTy()) 2332 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2333 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2334 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2335 State.set(Def, Add, Part); 2336 // It's useful to record the lane values too for the known minimum number 2337 // of elements so we do those below. This improves the code quality when 2338 // trying to extract the first element, for example. 2339 } 2340 2341 if (ScalarIVTy->isFloatingPointTy()) 2342 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2343 2344 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2345 Value *StartIdx = Builder.CreateBinOp( 2346 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2347 // The step returned by `createStepForVF` is a runtime-evaluated value 2348 // when VF is scalable. Otherwise, it should be folded into a Constant. 2349 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && 2350 "Expected StartIdx to be folded to a constant when VF is not " 2351 "scalable"); 2352 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2353 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2354 State.set(Def, Add, VPIteration(Part, Lane)); 2355 } 2356 } 2357 } 2358 2359 // Generate code for the induction step. 
Note that induction steps are 2360 // required to be loop-invariant 2361 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE, 2362 Instruction *InsertBefore, 2363 Loop *OrigLoop = nullptr) { 2364 const DataLayout &DL = SE.getDataLayout(); 2365 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) && 2366 "Induction step should be loop invariant"); 2367 if (auto *E = dyn_cast<SCEVUnknown>(Step)) 2368 return E->getValue(); 2369 2370 SCEVExpander Exp(SE, DL, "induction"); 2371 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore); 2372 } 2373 2374 /// Compute the transformed value of Index at offset StartValue using step 2375 /// StepValue. 2376 /// For integer induction, returns StartValue + Index * StepValue. 2377 /// For pointer induction, returns StartValue[Index * StepValue]. 2378 /// FIXME: The newly created binary instructions should contain nsw/nuw 2379 /// flags, which can be found from the original scalar operations. 2380 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, 2381 Value *StartValue, Value *Step, 2382 const InductionDescriptor &ID) { 2383 assert(Index->getType()->getScalarType() == Step->getType() && 2384 "Index scalar type does not match StepValue type"); 2385 2386 // Note: the IR at this point is broken. We cannot use SE to create any new 2387 // SCEV and then expand it, hoping that SCEV's simplification will give us 2388 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2389 // lead to various SCEV crashes. So all we can do is to use builder and rely 2390 // on InstCombine for future simplifications. Here we handle some trivial 2391 // cases only. 2392 auto CreateAdd = [&B](Value *X, Value *Y) { 2393 assert(X->getType() == Y->getType() && "Types don't match!"); 2394 if (auto *CX = dyn_cast<ConstantInt>(X)) 2395 if (CX->isZero()) 2396 return Y; 2397 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2398 if (CY->isZero()) 2399 return X; 2400 return B.CreateAdd(X, Y); 2401 }; 2402 2403 // We allow X to be a vector type, in which case Y will potentially be 2404 // splatted into a vector with the same element count. 
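// E.g. (illustrative) multiplying a <vscale x 2 x i64> index by an i64 step
// first splats the step to <vscale x 2 x i64> before emitting the multiply.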
2405 auto CreateMul = [&B](Value *X, Value *Y) { 2406 assert(X->getType()->getScalarType() == Y->getType() && 2407 "Types don't match!"); 2408 if (auto *CX = dyn_cast<ConstantInt>(X)) 2409 if (CX->isOne()) 2410 return Y; 2411 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2412 if (CY->isOne()) 2413 return X; 2414 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2415 if (XVTy && !isa<VectorType>(Y->getType())) 2416 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2417 return B.CreateMul(X, Y); 2418 }; 2419 2420 switch (ID.getKind()) { 2421 case InductionDescriptor::IK_IntInduction: { 2422 assert(!isa<VectorType>(Index->getType()) && 2423 "Vector indices not supported for integer inductions yet"); 2424 assert(Index->getType() == StartValue->getType() && 2425 "Index type does not match StartValue type"); 2426 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2427 return B.CreateSub(StartValue, Index); 2428 auto *Offset = CreateMul(Index, Step); 2429 return CreateAdd(StartValue, Offset); 2430 } 2431 case InductionDescriptor::IK_PtrInduction: { 2432 assert(isa<Constant>(Step) && 2433 "Expected constant step for pointer induction"); 2434 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step)); 2435 } 2436 case InductionDescriptor::IK_FpInduction: { 2437 assert(!isa<VectorType>(Index->getType()) && 2438 "Vector indices not supported for FP inductions yet"); 2439 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2440 auto InductionBinOp = ID.getInductionBinOp(); 2441 assert(InductionBinOp && 2442 (InductionBinOp->getOpcode() == Instruction::FAdd || 2443 InductionBinOp->getOpcode() == Instruction::FSub) && 2444 "Original bin op should be defined for FP induction"); 2445 2446 Value *MulExp = B.CreateFMul(Step, Index); 2447 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2448 "induction"); 2449 } 2450 case InductionDescriptor::IK_NoInduction: 2451 return nullptr; 2452 } 2453 llvm_unreachable("invalid enum"); 2454 } 2455 2456 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2457 const VPIteration &Instance, 2458 VPTransformState &State) { 2459 Value *ScalarInst = State.get(Def, Instance); 2460 Value *VectorValue = State.get(Def, Instance.Part); 2461 VectorValue = Builder.CreateInsertElement( 2462 VectorValue, ScalarInst, 2463 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2464 State.set(Def, VectorValue, Instance.Part); 2465 } 2466 2467 // Return whether we allow using masked interleave-groups (for dealing with 2468 // strided loads/stores that reside in predicated blocks, or for dealing 2469 // with gaps). 2470 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2471 // If an override option has been passed in for interleaved accesses, use it. 2472 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2473 return EnableMaskedInterleavedMemAccesses; 2474 2475 return TTI.enableMaskedInterleavedAccessVectorization(); 2476 } 2477 2478 // Try to vectorize the interleave group that \p Instr belongs to. 2479 // 2480 // E.g. Translate following interleaved load group (factor = 3): 2481 // for (i = 0; i < N; i+=3) { 2482 // R = Pic[i]; // Member of index 0 2483 // G = Pic[i+1]; // Member of index 1 2484 // B = Pic[i+2]; // Member of index 2 2485 // ... 
// do something to R, G, B 2486 // } 2487 // To: 2488 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2489 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2490 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2491 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2492 // 2493 // Or translate following interleaved store group (factor = 3): 2494 // for (i = 0; i < N; i+=3) { 2495 // ... do something to R, G, B 2496 // Pic[i] = R; // Member of index 0 2497 // Pic[i+1] = G; // Member of index 1 2498 // Pic[i+2] = B; // Member of index 2 2499 // } 2500 // To: 2501 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2502 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2503 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2504 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2505 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2506 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2507 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2508 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2509 VPValue *BlockInMask) { 2510 Instruction *Instr = Group->getInsertPos(); 2511 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2512 2513 // Prepare for the vector type of the interleaved load/store. 2514 Type *ScalarTy = getLoadStoreType(Instr); 2515 unsigned InterleaveFactor = Group->getFactor(); 2516 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2517 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2518 2519 // Prepare for the new pointers. 2520 SmallVector<Value *, 2> AddrParts; 2521 unsigned Index = Group->getIndex(Instr); 2522 2523 // TODO: extend the masked interleaved-group support to reversed access. 2524 assert((!BlockInMask || !Group->isReverse()) && 2525 "Reversed masked interleave-group not supported."); 2526 2527 // If the group is reverse, adjust the index to refer to the last vector lane 2528 // instead of the first. We adjust the index from the first vector lane, 2529 // rather than directly getting the pointer for lane VF - 1, because the 2530 // pointer operand of the interleaved access is supposed to be uniform. For 2531 // uniform instructions, we're only required to generate a value for the 2532 // first vector lane in each unroll iteration. 2533 if (Group->isReverse()) 2534 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2535 2536 for (unsigned Part = 0; Part < UF; Part++) { 2537 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2538 setDebugLocFromInst(AddrPart); 2539 2540 // Notice current instruction could be any index. Need to adjust the address 2541 // to the member of index 0. 2542 // 2543 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2544 // b = A[i]; // Member of index 0 2545 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2546 // 2547 // E.g. A[i+1] = a; // Member of index 1 2548 // A[i] = b; // Member of index 0 2549 // A[i+2] = c; // Member of index 2 (Current instruction) 2550 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2551 2552 bool InBounds = false; 2553 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2554 InBounds = gep->isInBounds(); 2555 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2556 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2557 2558 // Cast to the vector pointer type. 
2559 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2560 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2561 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2562 } 2563 2564 setDebugLocFromInst(Instr); 2565 Value *PoisonVec = PoisonValue::get(VecTy); 2566 2567 Value *MaskForGaps = nullptr; 2568 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2569 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2570 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2571 } 2572 2573 // Vectorize the interleaved load group. 2574 if (isa<LoadInst>(Instr)) { 2575 // For each unroll part, create a wide load for the group. 2576 SmallVector<Value *, 2> NewLoads; 2577 for (unsigned Part = 0; Part < UF; Part++) { 2578 Instruction *NewLoad; 2579 if (BlockInMask || MaskForGaps) { 2580 assert(useMaskedInterleavedAccesses(*TTI) && 2581 "masked interleaved groups are not allowed."); 2582 Value *GroupMask = MaskForGaps; 2583 if (BlockInMask) { 2584 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2585 Value *ShuffledMask = Builder.CreateShuffleVector( 2586 BlockInMaskPart, 2587 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2588 "interleaved.mask"); 2589 GroupMask = MaskForGaps 2590 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2591 MaskForGaps) 2592 : ShuffledMask; 2593 } 2594 NewLoad = 2595 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2596 GroupMask, PoisonVec, "wide.masked.vec"); 2597 } 2598 else 2599 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2600 Group->getAlign(), "wide.vec"); 2601 Group->addMetadata(NewLoad); 2602 NewLoads.push_back(NewLoad); 2603 } 2604 2605 // For each member in the group, shuffle out the appropriate data from the 2606 // wide loads. 2607 unsigned J = 0; 2608 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2609 Instruction *Member = Group->getMember(I); 2610 2611 // Skip the gaps in the group. 2612 if (!Member) 2613 continue; 2614 2615 auto StrideMask = 2616 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2617 for (unsigned Part = 0; Part < UF; Part++) { 2618 Value *StridedVec = Builder.CreateShuffleVector( 2619 NewLoads[Part], StrideMask, "strided.vec"); 2620 2621 // If this member has different type, cast the result type. 2622 if (Member->getType() != ScalarTy) { 2623 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2624 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2625 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2626 } 2627 2628 if (Group->isReverse()) 2629 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2630 2631 State.set(VPDefs[J], StridedVec, Part); 2632 } 2633 ++J; 2634 } 2635 return; 2636 } 2637 2638 // The sub vector type for current instruction. 2639 auto *SubVT = VectorType::get(ScalarTy, VF); 2640 2641 // Vectorize the interleaved store group. 2642 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2643 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2644 "masked interleaved groups are not allowed."); 2645 assert((!MaskForGaps || !VF.isScalable()) && 2646 "masking gaps for scalable vectors is not yet supported."); 2647 for (unsigned Part = 0; Part < UF; Part++) { 2648 // Collect the stored vector from each member. 
2649 SmallVector<Value *, 4> StoredVecs; 2650 for (unsigned i = 0; i < InterleaveFactor; i++) { 2651 assert((Group->getMember(i) || MaskForGaps) && 2652 "Fail to get a member from an interleaved store group"); 2653 Instruction *Member = Group->getMember(i); 2654 2655 // Skip the gaps in the group. 2656 if (!Member) { 2657 Value *Undef = PoisonValue::get(SubVT); 2658 StoredVecs.push_back(Undef); 2659 continue; 2660 } 2661 2662 Value *StoredVec = State.get(StoredValues[i], Part); 2663 2664 if (Group->isReverse()) 2665 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2666 2667 // If this member has different type, cast it to a unified type. 2668 2669 if (StoredVec->getType() != SubVT) 2670 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2671 2672 StoredVecs.push_back(StoredVec); 2673 } 2674 2675 // Concatenate all vectors into a wide vector. 2676 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2677 2678 // Interleave the elements in the wide vector. 2679 Value *IVec = Builder.CreateShuffleVector( 2680 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2681 "interleaved.vec"); 2682 2683 Instruction *NewStoreInstr; 2684 if (BlockInMask || MaskForGaps) { 2685 Value *GroupMask = MaskForGaps; 2686 if (BlockInMask) { 2687 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2688 Value *ShuffledMask = Builder.CreateShuffleVector( 2689 BlockInMaskPart, 2690 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2691 "interleaved.mask"); 2692 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, 2693 ShuffledMask, MaskForGaps) 2694 : ShuffledMask; 2695 } 2696 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2697 Group->getAlign(), GroupMask); 2698 } else 2699 NewStoreInstr = 2700 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2701 2702 Group->addMetadata(NewStoreInstr); 2703 } 2704 } 2705 2706 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2707 VPReplicateRecipe *RepRecipe, 2708 const VPIteration &Instance, 2709 bool IfPredicateInstr, 2710 VPTransformState &State) { 2711 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2712 2713 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2714 // the first lane and part. 2715 if (isa<NoAliasScopeDeclInst>(Instr)) 2716 if (!Instance.isFirstIteration()) 2717 return; 2718 2719 // Does this instruction return a value ? 2720 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2721 2722 Instruction *Cloned = Instr->clone(); 2723 if (!IsVoidRetTy) 2724 Cloned->setName(Instr->getName() + ".cloned"); 2725 2726 // If the scalarized instruction contributes to the address computation of a 2727 // widen masked load/store which was in a basic block that needed predication 2728 // and is not predicated after vectorization, we can't propagate 2729 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized 2730 // instruction could feed a poison value to the base address of the widen 2731 // load/store. 2732 if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) 2733 Cloned->dropPoisonGeneratingFlags(); 2734 2735 if (Instr->getDebugLoc()) 2736 setDebugLocFromInst(Instr); 2737 2738 // Replace the operands of the cloned instructions with their scalar 2739 // equivalents in the new loop. 
2740 for (auto &I : enumerate(RepRecipe->operands())) { 2741 auto InputInstance = Instance; 2742 VPValue *Operand = I.value(); 2743 VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand); 2744 if (OperandR && OperandR->isUniform()) 2745 InputInstance.Lane = VPLane::getFirstLane(); 2746 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2747 } 2748 State.addNewMetadata(Cloned, Instr); 2749 2750 // Place the cloned scalar in the new loop. 2751 State.Builder.Insert(Cloned); 2752 2753 State.set(RepRecipe, Cloned, Instance); 2754 2755 // If we just cloned a new assumption, add it the assumption cache. 2756 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2757 AC->registerAssumption(II); 2758 2759 // End if-block. 2760 if (IfPredicateInstr) 2761 PredicatedInstructions.push_back(Cloned); 2762 } 2763 2764 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) { 2765 if (TripCount) 2766 return TripCount; 2767 2768 assert(InsertBlock); 2769 IRBuilder<> Builder(InsertBlock->getTerminator()); 2770 // Find the loop boundaries. 2771 ScalarEvolution *SE = PSE.getSE(); 2772 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2773 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2774 "Invalid loop count"); 2775 2776 Type *IdxTy = Legal->getWidestInductionType(); 2777 assert(IdxTy && "No type for induction"); 2778 2779 // The exit count might have the type of i64 while the phi is i32. This can 2780 // happen if we have an induction variable that is sign extended before the 2781 // compare. The only way that we get a backedge taken count is that the 2782 // induction variable was signed and as such will not overflow. In such a case 2783 // truncation is legal. 2784 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2785 IdxTy->getPrimitiveSizeInBits()) 2786 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2787 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2788 2789 // Get the total trip count from the count by adding 1. 2790 const SCEV *ExitCount = SE->getAddExpr( 2791 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2792 2793 const DataLayout &DL = InsertBlock->getModule()->getDataLayout(); 2794 2795 // Expand the trip count and place the new instructions in the preheader. 2796 // Notice that the pre-header does not change, only the loop body. 2797 SCEVExpander Exp(*SE, DL, "induction"); 2798 2799 // Count holds the overall loop count (N). 2800 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2801 InsertBlock->getTerminator()); 2802 2803 if (TripCount->getType()->isPointerTy()) 2804 TripCount = 2805 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2806 InsertBlock->getTerminator()); 2807 2808 return TripCount; 2809 } 2810 2811 Value * 2812 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2813 if (VectorTripCount) 2814 return VectorTripCount; 2815 2816 Value *TC = getOrCreateTripCount(InsertBlock); 2817 IRBuilder<> Builder(InsertBlock->getTerminator()); 2818 2819 Type *Ty = TC->getType(); 2820 // This is where we can make the step a runtime constant. 2821 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2822 2823 // If the tail is to be folded by masking, round the number of iterations N 2824 // up to a multiple of Step instead of rounding down. This is done by first 2825 // adding Step-1 and then rounding down. 
Note that it's ok if this addition 2826 // overflows: the vector induction variable will eventually wrap to zero given 2827 // that it starts at zero and its Step is a power of two; the loop will then 2828 // exit, with the last early-exit vector comparison also producing all-true. 2829 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2830 // is accounted for in emitIterationCountCheck that adds an overflow check. 2831 if (Cost->foldTailByMasking()) { 2832 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2833 "VF*UF must be a power of 2 when folding tail by masking"); 2834 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2835 TC = Builder.CreateAdd( 2836 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2837 } 2838 2839 // Now we need to generate the expression for the part of the loop that the 2840 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2841 // iterations are not required for correctness, or N - Step, otherwise. Step 2842 // is equal to the vectorization factor (number of SIMD elements) times the 2843 // unroll factor (number of SIMD instructions). 2844 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2845 2846 // There are cases where we *must* run at least one iteration in the remainder 2847 // loop. See the cost model for when this can happen. If the step evenly 2848 // divides the trip count, we set the remainder to be equal to the step. If 2849 // the step does not evenly divide the trip count, no adjustment is necessary 2850 // since there will already be scalar iterations. Note that the minimum 2851 // iterations check ensures that N >= Step. 2852 if (Cost->requiresScalarEpilogue(VF)) { 2853 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2854 R = Builder.CreateSelect(IsZero, Step, R); 2855 } 2856 2857 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2858 2859 return VectorTripCount; 2860 } 2861 2862 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2863 const DataLayout &DL) { 2864 // Verify that V is a vector type with same number of elements as DstVTy. 2865 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 2866 unsigned VF = DstFVTy->getNumElements(); 2867 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 2868 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2869 Type *SrcElemTy = SrcVecTy->getElementType(); 2870 Type *DstElemTy = DstFVTy->getElementType(); 2871 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2872 "Vector elements must have same size"); 2873 2874 // Do a direct cast if element types are castable. 2875 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2876 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2877 } 2878 // V cannot be directly casted to desired vector type. 2879 // May happen when V is a floating point vector but DstVTy is a vector of 2880 // pointers or vice-versa. Handle this using a two-step bitcast using an 2881 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
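  // E.g., on a target with 64-bit pointers, casting <4 x double> to a vector
  // of pointers is done as <4 x double> -> <4 x i64> (bitcast) -> pointer
  // vector (inttoptr), since a direct bitcast between floating-point and
  // pointer vectors is not legal IR.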
2882 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2883 "Only one type should be a pointer type"); 2884 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2885 "Only one type should be a floating point type"); 2886 Type *IntTy = 2887 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2888 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 2889 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2890 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2891 } 2892 2893 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2894 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 2895 // Reuse existing vector loop preheader for TC checks. 2896 // Note that new preheader block is generated for vector loop. 2897 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2898 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2899 2900 // Generate code to check if the loop's trip count is less than VF * UF, or 2901 // equal to it in case a scalar epilogue is required; this implies that the 2902 // vector trip count is zero. This check also covers the case where adding one 2903 // to the backedge-taken count overflowed leading to an incorrect trip count 2904 // of zero. In this case we will also jump to the scalar loop. 2905 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE 2906 : ICmpInst::ICMP_ULT; 2907 2908 // If tail is to be folded, vector loop takes care of all iterations. 2909 Type *CountTy = Count->getType(); 2910 Value *CheckMinIters = Builder.getFalse(); 2911 Value *Step = createStepForVF(Builder, CountTy, VF, UF); 2912 if (!Cost->foldTailByMasking()) 2913 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 2914 else if (VF.isScalable()) { 2915 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2916 // an overflow to zero when updating induction variables and so an 2917 // additional overflow check is required before entering the vector loop. 2918 2919 // Get the maximum unsigned value for the type. 2920 Value *MaxUIntTripCount = 2921 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2922 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2923 2924 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2925 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step); 2926 } 2927 // Create new preheader for vector loop. 2928 LoopVectorPreHeader = 2929 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2930 "vector.ph"); 2931 2932 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2933 DT->getNode(Bypass)->getIDom()) && 2934 "TC check is expected to dominate Bypass"); 2935 2936 // Update dominator for Bypass & LoopExit (if needed). 2937 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2938 if (!Cost->requiresScalarEpilogue(VF)) 2939 // If there is an epilogue which must run, there's no edge from the 2940 // middle block to exit blocks and thus no need to update the immediate 2941 // dominator of the exit blocks. 
2942 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2943 2944 ReplaceInstWithInst( 2945 TCCheckBlock->getTerminator(), 2946 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2947 LoopBypassBlocks.push_back(TCCheckBlock); 2948 } 2949 2950 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2951 2952 BasicBlock *const SCEVCheckBlock = 2953 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 2954 if (!SCEVCheckBlock) 2955 return nullptr; 2956 2957 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2958 (OptForSizeBasedOnProfile && 2959 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2960 "Cannot SCEV check stride or overflow when optimizing for size"); 2961 2962 2963 // Update dominator only if this is first RT check. 2964 if (LoopBypassBlocks.empty()) { 2965 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2966 if (!Cost->requiresScalarEpilogue(VF)) 2967 // If there is an epilogue which must run, there's no edge from the 2968 // middle block to exit blocks and thus no need to update the immediate 2969 // dominator of the exit blocks. 2970 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2971 } 2972 2973 LoopBypassBlocks.push_back(SCEVCheckBlock); 2974 AddedSafetyChecks = true; 2975 return SCEVCheckBlock; 2976 } 2977 2978 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2979 // VPlan-native path does not do any analysis for runtime checks currently. 2980 if (EnableVPlanNativePath) 2981 return nullptr; 2982 2983 BasicBlock *const MemCheckBlock = 2984 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2985 2986 // Check if we generated code that checks in runtime if arrays overlap. We put 2987 // the checks into a separate block to make the more common case of few 2988 // elements faster. 2989 if (!MemCheckBlock) 2990 return nullptr; 2991 2992 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 2993 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2994 "Cannot emit memory checks when optimizing for size, unless forced " 2995 "to vectorize."); 2996 ORE->emit([&]() { 2997 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2998 OrigLoop->getStartLoc(), 2999 OrigLoop->getHeader()) 3000 << "Code-size may be reduced by not forcing " 3001 "vectorization, or by source-code modifications " 3002 "eliminating the need for runtime checks " 3003 "(e.g., adding 'restrict')."; 3004 }); 3005 } 3006 3007 LoopBypassBlocks.push_back(MemCheckBlock); 3008 3009 AddedSafetyChecks = true; 3010 3011 return MemCheckBlock; 3012 } 3013 3014 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3015 LoopScalarBody = OrigLoop->getHeader(); 3016 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3017 assert(LoopVectorPreHeader && "Invalid loop structure"); 3018 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3019 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) && 3020 "multiple exit loop without required epilogue?"); 3021 3022 LoopMiddleBlock = 3023 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3024 LI, nullptr, Twine(Prefix) + "middle.block"); 3025 LoopScalarPreHeader = 3026 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3027 nullptr, Twine(Prefix) + "scalar.ph"); 3028 3029 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3030 3031 // Set up the middle block terminator. 
Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
    BranchInst::Create(LoopScalarPreHeader) :
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                       Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // Update dominator for loop exit. During skeleton creation, only the vector
  // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
}

void InnerLoopVectorizer::createInductionResumeValues(
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");

  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
  assert(VectorTripCount && "Expected valid arguments");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  Instruction *OldInduction = Legal->getPrimaryInduction();
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = VectorTripCount;
    } else {
      IRBuilder<> B(LoopVectorPreHeader->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
      Value *Step =
          CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
      EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
      EndValue->setName("ind.end");

      // Compute the end value for the additional bypass (if applicable).
3098 if (AdditionalBypass.first) { 3099 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3100 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3101 StepType, true); 3102 Value *Step = 3103 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint()); 3104 VTC = 3105 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); 3106 EndValueFromAdditionalBypass = 3107 emitTransformedIndex(B, VTC, II.getStartValue(), Step, II); 3108 EndValueFromAdditionalBypass->setName("ind.end"); 3109 } 3110 } 3111 3112 // Create phi nodes to merge from the backedge-taken check block. 3113 PHINode *BCResumeVal = 3114 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3115 LoopScalarPreHeader->getTerminator()); 3116 // Copy original phi DL over to the new one. 3117 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3118 3119 // The new PHI merges the original incoming value, in case of a bypass, 3120 // or the value at the end of the vectorized loop. 3121 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3122 3123 // Fix the scalar body counter (PHI node). 3124 // The old induction's phi node in the scalar body needs the truncated 3125 // value. 3126 for (BasicBlock *BB : LoopBypassBlocks) 3127 BCResumeVal->addIncoming(II.getStartValue(), BB); 3128 3129 if (AdditionalBypass.first) 3130 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3131 EndValueFromAdditionalBypass); 3132 3133 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3134 } 3135 } 3136 3137 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) { 3138 // The trip counts should be cached by now. 3139 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 3140 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3141 3142 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3143 3144 // Add a check in the middle block to see if we have completed 3145 // all of the iterations in the first vector loop. Three cases: 3146 // 1) If we require a scalar epilogue, there is no conditional branch as 3147 // we unconditionally branch to the scalar preheader. Do nothing. 3148 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3149 // Thus if tail is to be folded, we know we don't need to run the 3150 // remainder and we can use the previous value for the condition (true). 3151 // 3) Otherwise, construct a runtime check. 3152 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { 3153 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3154 Count, VectorTripCount, "cmp.n", 3155 LoopMiddleBlock->getTerminator()); 3156 3157 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3158 // of the corresponding compare because they may have ended up with 3159 // different line numbers and we want to avoid awkward line stepping while 3160 // debugging. Eg. if the compare has got a line number inside the loop. 3161 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3162 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3163 } 3164 3165 #ifdef EXPENSIVE_CHECKS 3166 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3167 #endif 3168 3169 return LoopVectorPreHeader; 3170 } 3171 3172 std::pair<BasicBlock *, Value *> 3173 InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3174 /* 3175 In this function we generate a new loop. 
The new loop will contain 3176 the vectorized instructions while the old loop will continue to run the 3177 scalar remainder. 3178 3179 [ ] <-- loop iteration number check. 3180 / | 3181 / v 3182 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3183 | / | 3184 | / v 3185 || [ ] <-- vector pre header. 3186 |/ | 3187 | v 3188 | [ ] \ 3189 | [ ]_| <-- vector loop (created during VPlan execution). 3190 | | 3191 | v 3192 \ -[ ] <--- middle-block. 3193 \/ | 3194 /\ v 3195 | ->[ ] <--- new preheader. 3196 | | 3197 (opt) v <-- edge from middle to exit iff epilogue is not required. 3198 | [ ] \ 3199 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3200 \ | 3201 \ v 3202 >[ ] <-- exit block(s). 3203 ... 3204 */ 3205 3206 // Get the metadata of the original loop before it gets modified. 3207 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3208 3209 // Workaround! Compute the trip count of the original loop and cache it 3210 // before we start modifying the CFG. This code has a systemic problem 3211 // wherein it tries to run analysis over partially constructed IR; this is 3212 // wrong, and not simply for SCEV. The trip count of the original loop 3213 // simply happens to be prone to hitting this in practice. In theory, we 3214 // can hit the same issue for any SCEV, or ValueTracking query done during 3215 // mutation. See PR49900. 3216 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 3217 3218 // Create an empty vector loop, and prepare basic blocks for the runtime 3219 // checks. 3220 createVectorLoopSkeleton(""); 3221 3222 // Now, compare the new count to zero. If it is zero skip the vector loop and 3223 // jump to the scalar loop. This check also covers the case where the 3224 // backedge-taken count is uint##_max: adding one to it will overflow leading 3225 // to an incorrect trip count of zero. In this (rare) case we will also jump 3226 // to the scalar loop. 3227 emitIterationCountCheck(LoopScalarPreHeader); 3228 3229 // Generate the code to check any assumptions that we've made for SCEV 3230 // expressions. 3231 emitSCEVChecks(LoopScalarPreHeader); 3232 3233 // Generate the code that checks in runtime if arrays overlap. We put the 3234 // checks into a separate block to make the more common case of few elements 3235 // faster. 3236 emitMemRuntimeChecks(LoopScalarPreHeader); 3237 3238 // Emit phis for the new starting index of the scalar loop. 3239 createInductionResumeValues(); 3240 3241 return {completeLoopSkeleton(OrigLoopID), nullptr}; 3242 } 3243 3244 // Fix up external users of the induction variable. At this point, we are 3245 // in LCSSA form, with all external PHIs that use the IV having one input value, 3246 // coming from the remainder loop. We need those PHIs to also have a correct 3247 // value for the IV when arriving directly from the middle block. 3248 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3249 const InductionDescriptor &II, 3250 Value *VectorTripCount, Value *EndValue, 3251 BasicBlock *MiddleBlock, 3252 BasicBlock *VectorHeader, VPlan &Plan) { 3253 // There are two kinds of external IV usages - those that use the value 3254 // computed in the last iteration (the PHI) and those that use the penultimate 3255 // value (the value that feeds into the phi from the loop latch). 3256 // We allow both, but they, obviously, have different values. 
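  // For example, for the canonical IV of "for (i = 0; i < n; ++i)", an LCSSA
  // phi of the latch value %i.next must see n when the exit is reached
  // directly from the middle block (the vector loop covered all iterations),
  // while an LCSSA phi of %i itself must see n - 1, i.e. EndValue minus one
  // step.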
3257 3258 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3259 3260 DenseMap<Value *, Value *> MissingVals; 3261 3262 // An external user of the last iteration's value should see the value that 3263 // the remainder loop uses to initialize its own IV. 3264 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3265 for (User *U : PostInc->users()) { 3266 Instruction *UI = cast<Instruction>(U); 3267 if (!OrigLoop->contains(UI)) { 3268 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3269 MissingVals[UI] = EndValue; 3270 } 3271 } 3272 3273 // An external user of the penultimate value need to see EndValue - Step. 3274 // The simplest way to get this is to recompute it from the constituent SCEVs, 3275 // that is Start + (Step * (CRD - 1)). 3276 for (User *U : OrigPhi->users()) { 3277 auto *UI = cast<Instruction>(U); 3278 if (!OrigLoop->contains(UI)) { 3279 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3280 3281 IRBuilder<> B(MiddleBlock->getTerminator()); 3282 3283 // Fast-math-flags propagate from the original induction instruction. 3284 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3285 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3286 3287 Value *CountMinusOne = B.CreateSub( 3288 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3289 Value *CMO = 3290 !II.getStep()->getType()->isIntegerTy() 3291 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3292 II.getStep()->getType()) 3293 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3294 CMO->setName("cast.cmo"); 3295 3296 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(), 3297 VectorHeader->getTerminator()); 3298 Value *Escape = 3299 emitTransformedIndex(B, CMO, II.getStartValue(), Step, II); 3300 Escape->setName("ind.escape"); 3301 MissingVals[UI] = Escape; 3302 } 3303 } 3304 3305 for (auto &I : MissingVals) { 3306 PHINode *PHI = cast<PHINode>(I.first); 3307 // One corner case we have to handle is two IVs "chasing" each-other, 3308 // that is %IV2 = phi [...], [ %IV1, %latch ] 3309 // In this case, if IV1 has an external use, we need to avoid adding both 3310 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3311 // don't already have an incoming value for the middle block. 
3312 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3313 PHI->addIncoming(I.second, MiddleBlock); 3314 Plan.removeLiveOut(PHI); 3315 } 3316 } 3317 } 3318 3319 namespace { 3320 3321 struct CSEDenseMapInfo { 3322 static bool canHandle(const Instruction *I) { 3323 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3324 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3325 } 3326 3327 static inline Instruction *getEmptyKey() { 3328 return DenseMapInfo<Instruction *>::getEmptyKey(); 3329 } 3330 3331 static inline Instruction *getTombstoneKey() { 3332 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3333 } 3334 3335 static unsigned getHashValue(const Instruction *I) { 3336 assert(canHandle(I) && "Unknown instruction!"); 3337 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3338 I->value_op_end())); 3339 } 3340 3341 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3342 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3343 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3344 return LHS == RHS; 3345 return LHS->isIdenticalTo(RHS); 3346 } 3347 }; 3348 3349 } // end anonymous namespace 3350 3351 ///Perform cse of induction variable instructions. 3352 static void cse(BasicBlock *BB) { 3353 // Perform simple cse. 3354 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3355 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3356 if (!CSEDenseMapInfo::canHandle(&In)) 3357 continue; 3358 3359 // Check if we can replace this instruction with any of the 3360 // visited instructions. 3361 if (Instruction *V = CSEMap.lookup(&In)) { 3362 In.replaceAllUsesWith(V); 3363 In.eraseFromParent(); 3364 continue; 3365 } 3366 3367 CSEMap[&In] = &In; 3368 } 3369 } 3370 3371 InstructionCost 3372 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3373 bool &NeedToScalarize) const { 3374 Function *F = CI->getCalledFunction(); 3375 Type *ScalarRetTy = CI->getType(); 3376 SmallVector<Type *, 4> Tys, ScalarTys; 3377 for (auto &ArgOp : CI->args()) 3378 ScalarTys.push_back(ArgOp->getType()); 3379 3380 // Estimate cost of scalarized vector call. The source operands are assumed 3381 // to be vectors, so we need to extract individual elements from there, 3382 // execute VF scalar calls, and then gather the result into the vector return 3383 // value. 3384 InstructionCost ScalarCallCost = 3385 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3386 if (VF.isScalar()) 3387 return ScalarCallCost; 3388 3389 // Compute corresponding vector type for return value and arguments. 3390 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3391 for (Type *ScalarTy : ScalarTys) 3392 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3393 3394 // Compute costs of unpacking argument values for the scalar calls and 3395 // packing the return values to a vector. 3396 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3397 3398 InstructionCost Cost = 3399 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3400 3401 // If we can't emit a vector call for this function, then the currently found 3402 // cost is the cost we need to return. 3403 NeedToScalarize = true; 3404 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3405 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3406 3407 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3408 return Cost; 3409 3410 // If the corresponding vector cost is cheaper, return its cost. 
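  // Illustrative numbers only: with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 8, the scalarized estimate above is
  // 4 * 10 + 8 = 48, so a vectorized library call costing 20 would be
  // preferred below.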
3411 InstructionCost VectorCallCost = 3412 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3413 if (VectorCallCost < Cost) { 3414 NeedToScalarize = false; 3415 Cost = VectorCallCost; 3416 } 3417 return Cost; 3418 } 3419 3420 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3421 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3422 return Elt; 3423 return VectorType::get(Elt, VF); 3424 } 3425 3426 InstructionCost 3427 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3428 ElementCount VF) const { 3429 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3430 assert(ID && "Expected intrinsic call!"); 3431 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3432 FastMathFlags FMF; 3433 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3434 FMF = FPMO->getFastMathFlags(); 3435 3436 SmallVector<const Value *> Arguments(CI->args()); 3437 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3438 SmallVector<Type *> ParamTys; 3439 std::transform(FTy->param_begin(), FTy->param_end(), 3440 std::back_inserter(ParamTys), 3441 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3442 3443 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3444 dyn_cast<IntrinsicInst>(CI)); 3445 return TTI.getIntrinsicInstrCost(CostAttrs, 3446 TargetTransformInfo::TCK_RecipThroughput); 3447 } 3448 3449 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3450 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3451 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3452 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3453 } 3454 3455 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3456 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3457 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3458 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3459 } 3460 3461 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3462 // For every instruction `I` in MinBWs, truncate the operands, create a 3463 // truncated version of `I` and reextend its result. InstCombine runs 3464 // later and will remove any ext/trunc pairs. 3465 SmallPtrSet<Value *, 4> Erased; 3466 for (const auto &KV : Cost->getMinimalBitwidths()) { 3467 // If the value wasn't vectorized, we must maintain the original scalar 3468 // type. The absence of the value from State indicates that it 3469 // wasn't vectorized. 3470 // FIXME: Should not rely on getVPValue at this point. 3471 VPValue *Def = State.Plan->getVPValue(KV.first, true); 3472 if (!State.hasAnyVectorValue(Def)) 3473 continue; 3474 for (unsigned Part = 0; Part < UF; ++Part) { 3475 Value *I = State.get(Def, Part); 3476 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3477 continue; 3478 Type *OriginalTy = I->getType(); 3479 Type *ScalarTruncatedTy = 3480 IntegerType::get(OriginalTy->getContext(), KV.second); 3481 auto *TruncatedTy = VectorType::get( 3482 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount()); 3483 if (TruncatedTy == OriginalTy) 3484 continue; 3485 3486 IRBuilder<> B(cast<Instruction>(I)); 3487 auto ShrinkOperand = [&](Value *V) -> Value * { 3488 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3489 if (ZI->getSrcTy() == TruncatedTy) 3490 return ZI->getOperand(0); 3491 return B.CreateZExtOrTrunc(V, TruncatedTy); 3492 }; 3493 3494 // The actual instruction modification depends on the instruction type, 3495 // unfortunately. 
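      // For example, if MinBWs records that an i32 add only needs 8 bits, then
      // for
      //   %a = add <4 x i32> %x, %y
      // we conceptually emit
      //   %x.tr = trunc <4 x i32> %x to <4 x i8>
      //   %y.tr = trunc <4 x i32> %y to <4 x i8>
      //   %a.tr = add <4 x i8> %x.tr, %y.tr
      //   %res  = zext <4 x i8> %a.tr to <4 x i32>
      // and let InstCombine remove the redundant ext/trunc pairs.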
3496 Value *NewI = nullptr; 3497 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3498 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3499 ShrinkOperand(BO->getOperand(1))); 3500 3501 // Any wrapping introduced by shrinking this operation shouldn't be 3502 // considered undefined behavior. So, we can't unconditionally copy 3503 // arithmetic wrapping flags to NewI. 3504 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3505 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3506 NewI = 3507 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3508 ShrinkOperand(CI->getOperand(1))); 3509 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3510 NewI = B.CreateSelect(SI->getCondition(), 3511 ShrinkOperand(SI->getTrueValue()), 3512 ShrinkOperand(SI->getFalseValue())); 3513 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3514 switch (CI->getOpcode()) { 3515 default: 3516 llvm_unreachable("Unhandled cast!"); 3517 case Instruction::Trunc: 3518 NewI = ShrinkOperand(CI->getOperand(0)); 3519 break; 3520 case Instruction::SExt: 3521 NewI = B.CreateSExtOrTrunc( 3522 CI->getOperand(0), 3523 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3524 break; 3525 case Instruction::ZExt: 3526 NewI = B.CreateZExtOrTrunc( 3527 CI->getOperand(0), 3528 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3529 break; 3530 } 3531 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3532 auto Elements0 = 3533 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount(); 3534 auto *O0 = B.CreateZExtOrTrunc( 3535 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3536 auto Elements1 = 3537 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount(); 3538 auto *O1 = B.CreateZExtOrTrunc( 3539 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3540 3541 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3542 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3543 // Don't do anything with the operands, just extend the result. 3544 continue; 3545 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3546 auto Elements = 3547 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount(); 3548 auto *O0 = B.CreateZExtOrTrunc( 3549 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3550 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3551 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3552 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3553 auto Elements = 3554 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount(); 3555 auto *O0 = B.CreateZExtOrTrunc( 3556 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3557 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3558 } else { 3559 // If we don't know what to do, be conservative and don't do anything. 3560 continue; 3561 } 3562 3563 // Lastly, extend the result. 3564 NewI->takeName(cast<Instruction>(I)); 3565 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3566 I->replaceAllUsesWith(Res); 3567 cast<Instruction>(I)->eraseFromParent(); 3568 Erased.insert(I); 3569 State.reset(Def, Res, Part); 3570 } 3571 } 3572 3573 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3574 for (const auto &KV : Cost->getMinimalBitwidths()) { 3575 // If the value wasn't vectorized, we must maintain the original scalar 3576 // type. The absence of the value from State indicates that it 3577 // wasn't vectorized. 3578 // FIXME: Should not rely on getVPValue at this point. 
    VPValue *Def = State.Plan->getVPValue(KV.first, true);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                            VPlan &Plan) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (EnableVPlanNativePath)
    fixNonInductionPHIs(Plan, State);

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
  Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
  if (Cost->requiresScalarEpilogue(VF)) {
    // No edge from the middle block to the unique exit block has been
    // inserted and there is nothing to fix from the vector loop; phis should
    // have incoming values from the scalar loop only.
    Plan.clearLiveOuts();
  } else {
    // If we inserted an edge from the middle block to the unique exit block,
    // update uses outside the loop (phis) to account for the newly inserted
    // edge.

    // Fix-up external users of the induction variables.
    for (auto &Entry : Legal->getInductionVars())
      fixupIVUsers(Entry.first, Entry.second,
                   getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
                   IVEndValues[Entry.first], LoopMiddleBlock,
                   VectorLoop->getHeader(), Plan);
  }

  // Fix LCSSA phis not already fixed earlier. Extracts may need to be
  // generated in the exit block, so update the builder.
  State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
  for (auto &KV : Plan.getLiveOuts())
    KV.second->fixPhi(Plan, State);

  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(VectorLoop->getHeader());

  // Set/update profile weights for the vector and remainder loops as the
  // original loop iterations are now distributed among them. Note that the
  // original loop, represented by LoopScalarBody, becomes the remainder loop
  // after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting a slightly roughened result but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of vector code caused by legality checks is ignored, assigning all
  // the weight to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3659 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3660 LI->getLoopFor(LoopScalarBody), 3661 VF.getKnownMinValue() * UF); 3662 } 3663 3664 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 3665 // In order to support recurrences we need to be able to vectorize Phi nodes. 3666 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3667 // stage #2: We now need to fix the recurrences by adding incoming edges to 3668 // the currently empty PHI nodes. At this point every instruction in the 3669 // original loop is widened to a vector form so we can use them to construct 3670 // the incoming edges. 3671 VPBasicBlock *Header = 3672 State.Plan->getVectorLoopRegion()->getEntryBasicBlock(); 3673 for (VPRecipeBase &R : Header->phis()) { 3674 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 3675 fixReduction(ReductionPhi, State); 3676 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3677 fixFirstOrderRecurrence(FOR, State); 3678 } 3679 } 3680 3681 void InnerLoopVectorizer::fixFirstOrderRecurrence( 3682 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3683 // This is the second phase of vectorizing first-order recurrences. An 3684 // overview of the transformation is described below. Suppose we have the 3685 // following loop. 3686 // 3687 // for (int i = 0; i < n; ++i) 3688 // b[i] = a[i] - a[i - 1]; 3689 // 3690 // There is a first-order recurrence on "a". For this loop, the shorthand 3691 // scalar IR looks like: 3692 // 3693 // scalar.ph: 3694 // s_init = a[-1] 3695 // br scalar.body 3696 // 3697 // scalar.body: 3698 // i = phi [0, scalar.ph], [i+1, scalar.body] 3699 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3700 // s2 = a[i] 3701 // b[i] = s2 - s1 3702 // br cond, scalar.body, ... 3703 // 3704 // In this example, s1 is a recurrence because it's value depends on the 3705 // previous iteration. In the first phase of vectorization, we created a 3706 // vector phi v1 for s1. We now complete the vectorization and produce the 3707 // shorthand vector IR shown below (for VF = 4, UF = 1). 3708 // 3709 // vector.ph: 3710 // v_init = vector(..., ..., ..., a[-1]) 3711 // br vector.body 3712 // 3713 // vector.body 3714 // i = phi [0, vector.ph], [i+4, vector.body] 3715 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3716 // v2 = a[i, i+1, i+2, i+3]; 3717 // v3 = vector(v1(3), v2(0, 1, 2)) 3718 // b[i, i+1, i+2, i+3] = v2 - v3 3719 // br cond, vector.body, middle.block 3720 // 3721 // middle.block: 3722 // x = v2(3) 3723 // br scalar.ph 3724 // 3725 // scalar.ph: 3726 // s_init = phi [x, middle.block], [a[-1], otherwise] 3727 // br scalar.body 3728 // 3729 // After execution completes the vector loop, we extract the next value of 3730 // the recurrence (x) to use as the initial value in the scalar loop. 3731 3732 // Extract the last vector element in the middle block. This will be the 3733 // initial value for the recurrence when jumping to the scalar loop. 
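  // Continuing the VF = 4, UF = 1 example above: the resume value for the
  // scalar loop is lane RuntimeVF - 1 = 3 of v2, and a use of the recurrence
  // phi outside the loop needs the second-to-last lane RuntimeVF - 2 = 2,
  // matching the two extracts created below.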
  VPValue *PreviousDef = PhiR->getBackedgeValue();
  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;
  auto *IdxTy = Builder.getInt32Ty();
  if (VF.isVector()) {
    auto *One = ConstantInt::get(IdxTy, 1);
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
                                                    "vector.recur.extract");
  }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector()) {
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Idx, "vector.recur.extract.for.phi");
  } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the part of `Incoming` just prior to
    // the last unrolled part. This is analogous to the vectorized case above:
    // extracting the second-to-last element when VF > 1.
    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from the middle block to
  // the exit, and thus no phis need to be updated.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
        State.Plan->removeLiveOut(&LCSSAPhi);
      }
}

void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3798 assert(Legal->isReductionVariable(OrigPhi) && 3799 "Unable to find the reduction variable"); 3800 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3801 3802 RecurKind RK = RdxDesc.getRecurrenceKind(); 3803 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3804 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3805 setDebugLocFromInst(ReductionStartValue); 3806 3807 VPValue *LoopExitInstDef = PhiR->getBackedgeValue(); 3808 // This is the vector-clone of the value that leaves the loop. 3809 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 3810 3811 // Wrap flags are in general invalid after vectorization, clear them. 3812 clearReductionWrapFlags(PhiR, State); 3813 3814 // Before each round, move the insertion point right between 3815 // the PHIs and the values we are going to write. 3816 // This allows us to write both PHINodes and the extractelement 3817 // instructions. 3818 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3819 3820 setDebugLocFromInst(LoopExitInst); 3821 3822 Type *PhiTy = OrigPhi->getType(); 3823 3824 VPBasicBlock *LatchVPBB = 3825 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock(); 3826 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB]; 3827 // If tail is folded by masking, the vector value to leave the loop should be 3828 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3829 // instead of the former. For an inloop reduction the reduction will already 3830 // be predicated, and does not need to be handled here. 3831 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { 3832 for (unsigned Part = 0; Part < UF; ++Part) { 3833 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 3834 SelectInst *Sel = nullptr; 3835 for (User *U : VecLoopExitInst->users()) { 3836 if (isa<SelectInst>(U)) { 3837 assert(!Sel && "Reduction exit feeding two selects"); 3838 Sel = cast<SelectInst>(U); 3839 } else 3840 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3841 } 3842 assert(Sel && "Reduction exit feeds no select"); 3843 State.reset(LoopExitInstDef, Sel, Part); 3844 3845 if (isa<FPMathOperator>(Sel)) 3846 Sel->setFastMathFlags(RdxDesc.getFastMathFlags()); 3847 3848 // If the target can create a predicated operator for the reduction at no 3849 // extra cost in the loop (for example a predicated vadd), it can be 3850 // cheaper for the select to remain in the loop than be sunk out of it, 3851 // and so use the select value for the phi instead of the old 3852 // LoopExitValue. 3853 if (PreferPredicatedReductionSelect || 3854 TTI->preferPredicatedReductionSelect( 3855 RdxDesc.getOpcode(), PhiTy, 3856 TargetTransformInfo::ReductionFlags())) { 3857 auto *VecRdxPhi = 3858 cast<PHINode>(State.get(PhiR, Part)); 3859 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel); 3860 } 3861 } 3862 } 3863 3864 // If the vector reduction can be performed in a smaller type, we truncate 3865 // then extend the loop exit value to enable InstCombine to evaluate the 3866 // entire expression in the smaller type. 
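  //
  // For example, an i32 add reduction whose intermediate values are known to
  // fit in i8 has each unrolled part truncated to <VF x i8> and re-extended
  // here, so that InstCombine can later evaluate the whole reduction
  // expression in the narrower type.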
3867 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 3868 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 3869 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3870 Builder.SetInsertPoint(VectorLoopLatch->getTerminator()); 3871 VectorParts RdxParts(UF); 3872 for (unsigned Part = 0; Part < UF; ++Part) { 3873 RdxParts[Part] = State.get(LoopExitInstDef, Part); 3874 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3875 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3876 : Builder.CreateZExt(Trunc, VecTy); 3877 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users())) 3878 if (U != Trunc) { 3879 U->replaceUsesOfWith(RdxParts[Part], Extnd); 3880 RdxParts[Part] = Extnd; 3881 } 3882 } 3883 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3884 for (unsigned Part = 0; Part < UF; ++Part) { 3885 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3886 State.reset(LoopExitInstDef, RdxParts[Part], Part); 3887 } 3888 } 3889 3890 // Reduce all of the unrolled parts into a single vector. 3891 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 3892 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 3893 3894 // The middle block terminator has already been assigned a DebugLoc here (the 3895 // OrigLoop's single latch terminator). We want the whole middle block to 3896 // appear to execute on this line because: (a) it is all compiler generated, 3897 // (b) these instructions are always executed after evaluating the latch 3898 // conditional branch, and (c) other passes may add new predecessors which 3899 // terminate on this line. This is the easiest way to ensure we don't 3900 // accidentally cause an extra step back into the loop while debugging. 3901 setDebugLocFromInst(LoopMiddleBlock->getTerminator()); 3902 if (PhiR->isOrdered()) 3903 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 3904 else { 3905 // Floating-point operations should have some FMF to enable the reduction. 3906 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 3907 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 3908 for (unsigned Part = 1; Part < UF; ++Part) { 3909 Value *RdxPart = State.get(LoopExitInstDef, Part); 3910 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 3911 ReducedPartRdx = Builder.CreateBinOp( 3912 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 3913 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) 3914 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK, 3915 ReducedPartRdx, RdxPart); 3916 else 3917 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 3918 } 3919 } 3920 3921 // Create the reduction after the loop. Note that inloop reductions create the 3922 // target reduction in the loop using a Reduction recipe. 3923 if (VF.isVector() && !PhiR->isInLoop()) { 3924 ReducedPartRdx = 3925 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi); 3926 // If the reduction can be performed in a smaller type, we need to extend 3927 // the reduction to the wider type before we branch to the original loop. 3928 if (PhiTy != RdxDesc.getRecurrenceType()) 3929 ReducedPartRdx = RdxDesc.isSigned() 3930 ? 
Builder.CreateSExt(ReducedPartRdx, PhiTy)
                         : Builder.CreateZExt(ReducedPartRdx, PhiTy);
  }

  PHINode *ResumePhi =
      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());

  // If we are fixing reductions in the epilogue loop then we should already
  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
  // we carry over the incoming values correctly.
  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
    if (Incoming == LoopMiddleBlock)
      BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
    else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
                              Incoming);
    else
      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
  }

  // Set the resume value for this reduction.
  ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});

  // If there were stores of the reduction value to a uniform memory address
  // inside the loop, create the final store here.
  if (StoreInst *SI = RdxDesc.IntermediateStore) {
    StoreInst *NewSI =
        Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
    propagateMetadata(NewSI, SI);

    // If the reduction value is used in other places,
    // then let the code below create PHIs for that.
  }

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks. See the comment on the analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
        LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
        State.Plan->removeLiveOut(&LCSSAPhi);
      }

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 3989 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3990 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3991 } 3992 3993 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR, 3994 VPTransformState &State) { 3995 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 3996 RecurKind RK = RdxDesc.getRecurrenceKind(); 3997 if (RK != RecurKind::Add && RK != RecurKind::Mul) 3998 return; 3999 4000 SmallVector<VPValue *, 8> Worklist; 4001 SmallPtrSet<VPValue *, 8> Visited; 4002 Worklist.push_back(PhiR); 4003 Visited.insert(PhiR); 4004 4005 while (!Worklist.empty()) { 4006 VPValue *Cur = Worklist.pop_back_val(); 4007 for (unsigned Part = 0; Part < UF; ++Part) { 4008 Value *V = State.get(Cur, Part); 4009 if (!isa<OverflowingBinaryOperator>(V)) 4010 break; 4011 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4012 } 4013 4014 for (VPUser *U : Cur->users()) { 4015 auto *UserRecipe = dyn_cast<VPRecipeBase>(U); 4016 if (!UserRecipe) 4017 continue; 4018 for (VPValue *V : UserRecipe->definedValues()) 4019 if (Visited.insert(V).second) 4020 Worklist.push_back(V); 4021 } 4022 } 4023 } 4024 4025 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4026 // The basic block and loop containing the predicated instruction. 4027 auto *PredBB = PredInst->getParent(); 4028 auto *VectorLoop = LI->getLoopFor(PredBB); 4029 4030 // Initialize a worklist with the operands of the predicated instruction. 4031 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4032 4033 // Holds instructions that we need to analyze again. An instruction may be 4034 // reanalyzed if we don't yet know if we can sink it or not. 4035 SmallVector<Instruction *, 8> InstsToReanalyze; 4036 4037 // Returns true if a given use occurs in the predicated block. Phi nodes use 4038 // their operands in their corresponding predecessor blocks. 4039 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4040 auto *I = cast<Instruction>(U.getUser()); 4041 BasicBlock *BB = I->getParent(); 4042 if (auto *Phi = dyn_cast<PHINode>(I)) 4043 BB = Phi->getIncomingBlock( 4044 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4045 return BB == PredBB; 4046 }; 4047 4048 // Iteratively sink the scalarized operands of the predicated instruction 4049 // into the block we created for it. When an instruction is sunk, it's 4050 // operands are then added to the worklist. The algorithm ends after one pass 4051 // through the worklist doesn't sink a single instruction. 4052 bool Changed; 4053 do { 4054 // Add the instructions that need to be reanalyzed to the worklist, and 4055 // reset the changed indicator. 4056 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4057 InstsToReanalyze.clear(); 4058 Changed = false; 4059 4060 while (!Worklist.empty()) { 4061 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4062 4063 // We can't sink an instruction if it is a phi node, is not in the loop, 4064 // or may have side effects. 4065 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4066 I->mayHaveSideEffects()) 4067 continue; 4068 4069 // If the instruction is already in PredBB, check if we can sink its 4070 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4071 // sinking the scalar instruction I, hence it appears in PredBB; but it 4072 // may have failed to sink I's operands (recursively), which we try 4073 // (again) here. 
4074 if (I->getParent() == PredBB) { 4075 Worklist.insert(I->op_begin(), I->op_end()); 4076 continue; 4077 } 4078 4079 // It's legal to sink the instruction if all its uses occur in the 4080 // predicated block. Otherwise, there's nothing to do yet, and we may 4081 // need to reanalyze the instruction. 4082 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4083 InstsToReanalyze.push_back(I); 4084 continue; 4085 } 4086 4087 // Move the instruction to the beginning of the predicated block, and add 4088 // it's operands to the worklist. 4089 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4090 Worklist.insert(I->op_begin(), I->op_end()); 4091 4092 // The sinking may have enabled other instructions to be sunk, so we will 4093 // need to iterate. 4094 Changed = true; 4095 } 4096 } while (Changed); 4097 } 4098 4099 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 4100 VPTransformState &State) { 4101 auto Iter = depth_first( 4102 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry())); 4103 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 4104 for (VPRecipeBase &P : VPBB->phis()) { 4105 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 4106 if (!VPPhi) 4107 continue; 4108 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4109 // Make sure the builder has a valid insert point. 4110 Builder.SetInsertPoint(NewPhi); 4111 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4112 VPValue *Inc = VPPhi->getIncomingValue(i); 4113 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4114 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4115 } 4116 } 4117 } 4118 } 4119 4120 bool InnerLoopVectorizer::useOrderedReductions( 4121 const RecurrenceDescriptor &RdxDesc) { 4122 return Cost->useOrderedReductions(RdxDesc); 4123 } 4124 4125 /// A helper function for checking whether an integer division-related 4126 /// instruction may divide by zero (in which case it must be predicated if 4127 /// executed conditionally in the scalar code). 4128 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4129 /// Non-zero divisors that are non compile-time constants will not be 4130 /// converted into multiplication, so we will still end up scalarizing 4131 /// the division, but can do so w/o predication. 4132 static bool mayDivideByZero(Instruction &I) { 4133 assert((I.getOpcode() == Instruction::UDiv || 4134 I.getOpcode() == Instruction::SDiv || 4135 I.getOpcode() == Instruction::URem || 4136 I.getOpcode() == Instruction::SRem) && 4137 "Unexpected instruction"); 4138 Value *Divisor = I.getOperand(1); 4139 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4140 return !CInt || CInt->isZero(); 4141 } 4142 4143 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4144 VPUser &ArgOperands, 4145 VPTransformState &State) { 4146 assert(!isa<DbgInfoIntrinsic>(I) && 4147 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4148 setDebugLocFromInst(&I); 4149 4150 Module *M = I.getParent()->getParent()->getParent(); 4151 auto *CI = cast<CallInst>(&I); 4152 4153 SmallVector<Type *, 4> Tys; 4154 for (Value *ArgOperand : CI->args()) 4155 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4156 4157 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4158 4159 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4160 // version of the instruction. 4161 // Is it beneficial to perform intrinsic call compared to lib call? 
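  // Illustrative example (assumes a TLI/vector-library mapping is available):
  // a scalar call to sinf at VF=4 may be widened either to the llvm.sin.v4f32
  // intrinsic or to a vector library routine such as _ZGVbN4v_sinf; the cost
  // comparison below picks whichever variant the target rates cheaper.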
4162 bool NeedToScalarize = false; 4163 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4164 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4165 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4166 assert((UseVectorIntrinsic || !NeedToScalarize) && 4167 "Instruction should be scalarized elsewhere."); 4168 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 4169 "Either the intrinsic cost or vector call cost must be valid"); 4170 4171 for (unsigned Part = 0; Part < UF; ++Part) { 4172 SmallVector<Type *, 2> TysForDecl = {CI->getType()}; 4173 SmallVector<Value *, 4> Args; 4174 for (auto &I : enumerate(ArgOperands.operands())) { 4175 // Some intrinsics have a scalar argument - don't replace it with a 4176 // vector. 4177 Value *Arg; 4178 if (!UseVectorIntrinsic || 4179 !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) 4180 Arg = State.get(I.value(), Part); 4181 else 4182 Arg = State.get(I.value(), VPIteration(0, 0)); 4183 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) 4184 TysForDecl.push_back(Arg->getType()); 4185 Args.push_back(Arg); 4186 } 4187 4188 Function *VectorF; 4189 if (UseVectorIntrinsic) { 4190 // Use vector version of the intrinsic. 4191 if (VF.isVector()) 4192 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4193 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4194 assert(VectorF && "Can't retrieve vector intrinsic."); 4195 } else { 4196 // Use vector version of the function call. 4197 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4198 #ifndef NDEBUG 4199 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4200 "Can't create vector function."); 4201 #endif 4202 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4203 } 4204 SmallVector<OperandBundleDef, 1> OpBundles; 4205 CI->getOperandBundlesAsDefs(OpBundles); 4206 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4207 4208 if (isa<FPMathOperator>(V)) 4209 V->copyFastMathFlags(CI); 4210 4211 State.set(Def, V, Part); 4212 State.addMetadata(V, &I); 4213 } 4214 } 4215 4216 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4217 // We should not collect Scalars more than once per VF. Right now, this 4218 // function is called from collectUniformsAndScalars(), which already does 4219 // this check. Collecting Scalars for VF=1 does not make any sense. 4220 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4221 "This function should not be visited twice for the same VF"); 4222 4223 // This avoids any chances of creating a REPLICATE recipe during planning 4224 // since that would result in generation of scalarized code during execution, 4225 // which is not supported for scalable vectors. 4226 if (VF.isScalable()) { 4227 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4228 return; 4229 } 4230 4231 SmallSetVector<Instruction *, 8> Worklist; 4232 4233 // These sets are used to seed the analysis with pointers used by memory 4234 // accesses that will remain scalar. 4235 SmallSetVector<Instruction *, 8> ScalarPtrs; 4236 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4237 auto *Latch = TheLoop->getLoopLatch(); 4238 4239 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4240 // The pointer operands of loads and stores will be scalar as long as the 4241 // memory access is not a gather or scatter operation. The value operand of a 4242 // store will remain scalar if the store is scalarized. 
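  // Illustrative example (not taken from the source): for a consecutive
  // access
  //   store i32 %v, i32* %gep
  // the pointer %gep is a scalar use, since the widened store only needs the
  // lane-0 address, whereas a scatter consumes a whole vector of addresses
  // and therefore does not keep %gep scalar.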
4243 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4244 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4245 assert(WideningDecision != CM_Unknown && 4246 "Widening decision should be ready at this moment"); 4247 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4248 if (Ptr == Store->getValueOperand()) 4249 return WideningDecision == CM_Scalarize; 4250 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4251 "Ptr is neither a value or pointer operand"); 4252 return WideningDecision != CM_GatherScatter; 4253 }; 4254 4255 // A helper that returns true if the given value is a bitcast or 4256 // getelementptr instruction contained in the loop. 4257 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4258 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4259 isa<GetElementPtrInst>(V)) && 4260 !TheLoop->isLoopInvariant(V); 4261 }; 4262 4263 // A helper that evaluates a memory access's use of a pointer. If the use will 4264 // be a scalar use and the pointer is only used by memory accesses, we place 4265 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4266 // PossibleNonScalarPtrs. 4267 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4268 // We only care about bitcast and getelementptr instructions contained in 4269 // the loop. 4270 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4271 return; 4272 4273 // If the pointer has already been identified as scalar (e.g., if it was 4274 // also identified as uniform), there's nothing to do. 4275 auto *I = cast<Instruction>(Ptr); 4276 if (Worklist.count(I)) 4277 return; 4278 4279 // If the use of the pointer will be a scalar use, and all users of the 4280 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4281 // place the pointer in PossibleNonScalarPtrs. 4282 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4283 return isa<LoadInst>(U) || isa<StoreInst>(U); 4284 })) 4285 ScalarPtrs.insert(I); 4286 else 4287 PossibleNonScalarPtrs.insert(I); 4288 }; 4289 4290 // We seed the scalars analysis with three classes of instructions: (1) 4291 // instructions marked uniform-after-vectorization and (2) bitcast, 4292 // getelementptr and (pointer) phi instructions used by memory accesses 4293 // requiring a scalar use. 4294 // 4295 // (1) Add to the worklist all instructions that have been identified as 4296 // uniform-after-vectorization. 4297 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4298 4299 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4300 // memory accesses requiring a scalar use. The pointer operands of loads and 4301 // stores will be scalar as long as the memory accesses is not a gather or 4302 // scatter operation. The value operand of a store will remain scalar if the 4303 // store is scalarized. 4304 for (auto *BB : TheLoop->blocks()) 4305 for (auto &I : *BB) { 4306 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4307 evaluatePtrUse(Load, Load->getPointerOperand()); 4308 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4309 evaluatePtrUse(Store, Store->getPointerOperand()); 4310 evaluatePtrUse(Store, Store->getValueOperand()); 4311 } 4312 } 4313 for (auto *I : ScalarPtrs) 4314 if (!PossibleNonScalarPtrs.count(I)) { 4315 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4316 Worklist.insert(I); 4317 } 4318 4319 // Insert the forced scalars. 
4320 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 4321 // induction variable when the PHI user is scalarized. 4322 auto ForcedScalar = ForcedScalars.find(VF); 4323 if (ForcedScalar != ForcedScalars.end()) 4324 for (auto *I : ForcedScalar->second) 4325 Worklist.insert(I); 4326 4327 // Expand the worklist by looking through any bitcasts and getelementptr 4328 // instructions we've already identified as scalar. This is similar to the 4329 // expansion step in collectLoopUniforms(); however, here we're only 4330 // expanding to include additional bitcasts and getelementptr instructions. 4331 unsigned Idx = 0; 4332 while (Idx != Worklist.size()) { 4333 Instruction *Dst = Worklist[Idx++]; 4334 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4335 continue; 4336 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4337 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4338 auto *J = cast<Instruction>(U); 4339 return !TheLoop->contains(J) || Worklist.count(J) || 4340 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4341 isScalarUse(J, Src)); 4342 })) { 4343 Worklist.insert(Src); 4344 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4345 } 4346 } 4347 4348 // An induction variable will remain scalar if all users of the induction 4349 // variable and induction variable update remain scalar. 4350 for (auto &Induction : Legal->getInductionVars()) { 4351 auto *Ind = Induction.first; 4352 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4353 4354 // If tail-folding is applied, the primary induction variable will be used 4355 // to feed a vector compare. 4356 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 4357 continue; 4358 4359 // Returns true if \p Indvar is a pointer induction that is used directly by 4360 // load/store instruction \p I. 4361 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, 4362 Instruction *I) { 4363 return Induction.second.getKind() == 4364 InductionDescriptor::IK_PtrInduction && 4365 (isa<LoadInst>(I) || isa<StoreInst>(I)) && 4366 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); 4367 }; 4368 4369 // Determine if all users of the induction variable are scalar after 4370 // vectorization. 4371 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4372 auto *I = cast<Instruction>(U); 4373 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4374 IsDirectLoadStoreFromPtrIndvar(Ind, I); 4375 }); 4376 if (!ScalarInd) 4377 continue; 4378 4379 // Determine if all users of the induction variable update instruction are 4380 // scalar after vectorization. 4381 auto ScalarIndUpdate = 4382 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4383 auto *I = cast<Instruction>(U); 4384 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4385 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); 4386 }); 4387 if (!ScalarIndUpdate) 4388 continue; 4389 4390 // The induction variable and its update instruction will remain scalar. 
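    // Illustrative example (hypothetical): if %iv is only used by its own
    // update %iv.next and by a GEP that already sits on the worklist as a
    // scalar address computation, both %iv and %iv.next are kept scalar; a
    // single in-loop user that needs per-lane values keeps them off the list
    // and leaves them to be widened instead.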
4391 Worklist.insert(Ind); 4392 Worklist.insert(IndUpdate); 4393 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4394 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4395 << "\n"); 4396 } 4397 4398 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4399 } 4400 4401 bool LoopVectorizationCostModel::isScalarWithPredication( 4402 Instruction *I, ElementCount VF) const { 4403 if (!blockNeedsPredicationForAnyReason(I->getParent())) 4404 return false; 4405 switch(I->getOpcode()) { 4406 default: 4407 break; 4408 case Instruction::Load: 4409 case Instruction::Store: { 4410 if (!Legal->isMaskRequired(I)) 4411 return false; 4412 auto *Ptr = getLoadStorePointerOperand(I); 4413 auto *Ty = getLoadStoreType(I); 4414 Type *VTy = Ty; 4415 if (VF.isVector()) 4416 VTy = VectorType::get(Ty, VF); 4417 const Align Alignment = getLoadStoreAlignment(I); 4418 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4419 TTI.isLegalMaskedGather(VTy, Alignment)) 4420 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4421 TTI.isLegalMaskedScatter(VTy, Alignment)); 4422 } 4423 case Instruction::UDiv: 4424 case Instruction::SDiv: 4425 case Instruction::SRem: 4426 case Instruction::URem: 4427 return mayDivideByZero(*I); 4428 } 4429 return false; 4430 } 4431 4432 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 4433 Instruction *I, ElementCount VF) { 4434 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4435 assert(getWideningDecision(I, VF) == CM_Unknown && 4436 "Decision should not be set yet."); 4437 auto *Group = getInterleavedAccessGroup(I); 4438 assert(Group && "Must have a group."); 4439 4440 // If the instruction's allocated size doesn't equal it's type size, it 4441 // requires padding and will be scalarized. 4442 auto &DL = I->getModule()->getDataLayout(); 4443 auto *ScalarTy = getLoadStoreType(I); 4444 if (hasIrregularType(ScalarTy, DL)) 4445 return false; 4446 4447 // If the group involves a non-integral pointer, we may not be able to 4448 // losslessly cast all values to a common type. 4449 unsigned InterleaveFactor = Group->getFactor(); 4450 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); 4451 for (unsigned i = 0; i < InterleaveFactor; i++) { 4452 Instruction *Member = Group->getMember(i); 4453 if (!Member) 4454 continue; 4455 auto *MemberTy = getLoadStoreType(Member); 4456 bool MemberNI = DL.isNonIntegralPointerType(MemberTy); 4457 // Don't coerce non-integral pointers to integers or vice versa. 4458 if (MemberNI != ScalarNI) { 4459 // TODO: Consider adding special nullptr value case here 4460 return false; 4461 } else if (MemberNI && ScalarNI && 4462 ScalarTy->getPointerAddressSpace() != 4463 MemberTy->getPointerAddressSpace()) { 4464 return false; 4465 } 4466 } 4467 4468 // Check if masking is required. 4469 // A Group may need masking for one of two reasons: it resides in a block that 4470 // needs predication, or it was decided to use masking to deal with gaps 4471 // (either a gap at the end of a load-access that may result in a speculative 4472 // load, or any gaps in a store-access). 
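  // Illustrative example (hypothetical loop): a factor-3 load group touching
  // only A[3*i] and A[3*i+1] leaves a gap at member 2, so the last wide load
  // could speculatively read past the end of A and needs tail masking unless
  // a scalar epilogue is allowed; a store group with the same gap always
  // needs masking so the lanes of the missing member are never written.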
4473 bool PredicatedAccessRequiresMasking = 4474 blockNeedsPredicationForAnyReason(I->getParent()) && 4475 Legal->isMaskRequired(I); 4476 bool LoadAccessWithGapsRequiresEpilogMasking = 4477 isa<LoadInst>(I) && Group->requiresScalarEpilogue() && 4478 !isScalarEpilogueAllowed(); 4479 bool StoreAccessWithGapsRequiresMasking = 4480 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); 4481 if (!PredicatedAccessRequiresMasking && 4482 !LoadAccessWithGapsRequiresEpilogMasking && 4483 !StoreAccessWithGapsRequiresMasking) 4484 return true; 4485 4486 // If masked interleaving is required, we expect that the user/target had 4487 // enabled it, because otherwise it either wouldn't have been created or 4488 // it should have been invalidated by the CostModel. 4489 assert(useMaskedInterleavedAccesses(TTI) && 4490 "Masked interleave-groups for predicated accesses are not enabled."); 4491 4492 if (Group->isReverse()) 4493 return false; 4494 4495 auto *Ty = getLoadStoreType(I); 4496 const Align Alignment = getLoadStoreAlignment(I); 4497 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4498 : TTI.isLegalMaskedStore(Ty, Alignment); 4499 } 4500 4501 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 4502 Instruction *I, ElementCount VF) { 4503 // Get and ensure we have a valid memory instruction. 4504 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction"); 4505 4506 auto *Ptr = getLoadStorePointerOperand(I); 4507 auto *ScalarTy = getLoadStoreType(I); 4508 4509 // In order to be widened, the pointer should be consecutive, first of all. 4510 if (!Legal->isConsecutivePtr(ScalarTy, Ptr)) 4511 return false; 4512 4513 // If the instruction is a store located in a predicated block, it will be 4514 // scalarized. 4515 if (isScalarWithPredication(I, VF)) 4516 return false; 4517 4518 // If the instruction's allocated size doesn't equal it's type size, it 4519 // requires padding and will be scalarized. 4520 auto &DL = I->getModule()->getDataLayout(); 4521 if (hasIrregularType(ScalarTy, DL)) 4522 return false; 4523 4524 return true; 4525 } 4526 4527 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 4528 // We should not collect Uniforms more than once per VF. Right now, 4529 // this function is called from collectUniformsAndScalars(), which 4530 // already does this check. Collecting Uniforms for VF=1 does not make any 4531 // sense. 4532 4533 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 4534 "This function should not be visited twice for the same VF"); 4535 4536 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4537 // not analyze again. Uniforms.count(VF) will return 1. 4538 Uniforms[VF].clear(); 4539 4540 // We now know that the loop is vectorizable! 4541 // Collect instructions inside the loop that will remain uniform after 4542 // vectorization. 4543 4544 // Global values, params and instructions outside of current loop are out of 4545 // scope. 4546 auto isOutOfScope = [&](Value *V) -> bool { 4547 Instruction *I = dyn_cast<Instruction>(V); 4548 return (!I || !TheLoop->contains(I)); 4549 }; 4550 4551 // Worklist containing uniform instructions demanding lane 0. 4552 SetVector<Instruction *> Worklist; 4553 BasicBlock *Latch = TheLoop->getLoopLatch(); 4554 4555 // Add uniform instructions demanding lane 0 to the worklist. 
Instructions 4556 // that are scalar with predication must not be considered uniform after 4557 // vectorization, because that would create an erroneous replicating region 4558 // where only a single instance out of VF should be formed. 4559 // TODO: optimize such seldom cases if found important, see PR40816. 4560 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4561 if (isOutOfScope(I)) { 4562 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 4563 << *I << "\n"); 4564 return; 4565 } 4566 if (isScalarWithPredication(I, VF)) { 4567 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4568 << *I << "\n"); 4569 return; 4570 } 4571 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4572 Worklist.insert(I); 4573 }; 4574 4575 // Start with the conditional branch. If the branch condition is an 4576 // instruction contained in the loop that is only used by the branch, it is 4577 // uniform. 4578 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4579 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4580 addToWorklistIfAllowed(Cmp); 4581 4582 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 4583 InstWidening WideningDecision = getWideningDecision(I, VF); 4584 assert(WideningDecision != CM_Unknown && 4585 "Widening decision should be ready at this moment"); 4586 4587 // A uniform memory op is itself uniform. We exclude uniform stores 4588 // here as they demand the last lane, not the first one. 4589 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 4590 assert(WideningDecision == CM_Scalarize); 4591 return true; 4592 } 4593 4594 return (WideningDecision == CM_Widen || 4595 WideningDecision == CM_Widen_Reverse || 4596 WideningDecision == CM_Interleave); 4597 }; 4598 4599 4600 // Returns true if Ptr is the pointer operand of a memory access instruction 4601 // I, and I is known to not require scalarization. 4602 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4603 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4604 }; 4605 4606 // Holds a list of values which are known to have at least one uniform use. 4607 // Note that there may be other uses which aren't uniform. A "uniform use" 4608 // here is something which only demands lane 0 of the unrolled iterations; 4609 // it does not imply that all lanes produce the same value (e.g. this is not 4610 // the usual meaning of uniform) 4611 SetVector<Value *> HasUniformUse; 4612 4613 // Scan the loop for instructions which are either a) known to have only 4614 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 4615 for (auto *BB : TheLoop->blocks()) 4616 for (auto &I : *BB) { 4617 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { 4618 switch (II->getIntrinsicID()) { 4619 case Intrinsic::sideeffect: 4620 case Intrinsic::experimental_noalias_scope_decl: 4621 case Intrinsic::assume: 4622 case Intrinsic::lifetime_start: 4623 case Intrinsic::lifetime_end: 4624 if (TheLoop->hasLoopInvariantOperands(&I)) 4625 addToWorklistIfAllowed(&I); 4626 break; 4627 default: 4628 break; 4629 } 4630 } 4631 4632 // ExtractValue instructions must be uniform, because the operands are 4633 // known to be loop-invariant. 
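      // Illustrative example (not from the source): for
      //   %f = extractvalue { float, i32 } %agg, 0
      // with %agg defined outside the loop, every lane would produce the same
      // scalar, so only the lane-0 value is ever demanded.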
4634 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) { 4635 assert(isOutOfScope(EVI->getAggregateOperand()) && 4636 "Expected aggregate value to be loop invariant"); 4637 addToWorklistIfAllowed(EVI); 4638 continue; 4639 } 4640 4641 // If there's no pointer operand, there's nothing to do. 4642 auto *Ptr = getLoadStorePointerOperand(&I); 4643 if (!Ptr) 4644 continue; 4645 4646 // A uniform memory op is itself uniform. We exclude uniform stores 4647 // here as they demand the last lane, not the first one. 4648 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 4649 addToWorklistIfAllowed(&I); 4650 4651 if (isUniformDecision(&I, VF)) { 4652 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 4653 HasUniformUse.insert(Ptr); 4654 } 4655 } 4656 4657 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 4658 // demanding) users. Since loops are assumed to be in LCSSA form, this 4659 // disallows uses outside the loop as well. 4660 for (auto *V : HasUniformUse) { 4661 if (isOutOfScope(V)) 4662 continue; 4663 auto *I = cast<Instruction>(V); 4664 auto UsersAreMemAccesses = 4665 llvm::all_of(I->users(), [&](User *U) -> bool { 4666 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 4667 }); 4668 if (UsersAreMemAccesses) 4669 addToWorklistIfAllowed(I); 4670 } 4671 4672 // Expand Worklist in topological order: whenever a new instruction 4673 // is added , its users should be already inside Worklist. It ensures 4674 // a uniform instruction will only be used by uniform instructions. 4675 unsigned idx = 0; 4676 while (idx != Worklist.size()) { 4677 Instruction *I = Worklist[idx++]; 4678 4679 for (auto OV : I->operand_values()) { 4680 // isOutOfScope operands cannot be uniform instructions. 4681 if (isOutOfScope(OV)) 4682 continue; 4683 // First order recurrence Phi's should typically be considered 4684 // non-uniform. 4685 auto *OP = dyn_cast<PHINode>(OV); 4686 if (OP && Legal->isFirstOrderRecurrence(OP)) 4687 continue; 4688 // If all the users of the operand are uniform, then add the 4689 // operand into the uniform worklist. 4690 auto *OI = cast<Instruction>(OV); 4691 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4692 auto *J = cast<Instruction>(U); 4693 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 4694 })) 4695 addToWorklistIfAllowed(OI); 4696 } 4697 } 4698 4699 // For an instruction to be added into Worklist above, all its users inside 4700 // the loop should also be in Worklist. However, this condition cannot be 4701 // true for phi nodes that form a cyclic dependence. We must process phi 4702 // nodes separately. An induction variable will remain uniform if all users 4703 // of the induction variable and induction variable update remain uniform. 4704 // The code below handles both pointer and non-pointer induction variables. 4705 for (auto &Induction : Legal->getInductionVars()) { 4706 auto *Ind = Induction.first; 4707 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4708 4709 // Determine if all users of the induction variable are uniform after 4710 // vectorization. 4711 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4712 auto *I = cast<Instruction>(U); 4713 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4714 isVectorizedMemAccessUse(I, Ind); 4715 }); 4716 if (!UniformInd) 4717 continue; 4718 4719 // Determine if all users of the induction variable update instruction are 4720 // uniform after vectorization. 
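    // Illustrative example (hypothetical): a pointer induction %p whose only
    // users are consecutive, widened loads (which consume just the lane-0
    // address) and its own update %p.next forms exactly such a cycle, and
    // both the phi and the update are then added as uniform below.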
4721 auto UniformIndUpdate = 4722 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4723 auto *I = cast<Instruction>(U); 4724 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4725 isVectorizedMemAccessUse(I, IndUpdate); 4726 }); 4727 if (!UniformIndUpdate) 4728 continue; 4729 4730 // The induction variable and its update instruction will remain uniform. 4731 addToWorklistIfAllowed(Ind); 4732 addToWorklistIfAllowed(IndUpdate); 4733 } 4734 4735 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4736 } 4737 4738 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4739 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4740 4741 if (Legal->getRuntimePointerChecking()->Need) { 4742 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4743 "runtime pointer checks needed. Enable vectorization of this " 4744 "loop with '#pragma clang loop vectorize(enable)' when " 4745 "compiling with -Os/-Oz", 4746 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4747 return true; 4748 } 4749 4750 if (!PSE.getPredicate().isAlwaysTrue()) { 4751 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4752 "runtime SCEV checks needed. Enable vectorization of this " 4753 "loop with '#pragma clang loop vectorize(enable)' when " 4754 "compiling with -Os/-Oz", 4755 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4756 return true; 4757 } 4758 4759 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4760 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4761 reportVectorizationFailure("Runtime stride check for small trip count", 4762 "runtime stride == 1 checks needed. Enable vectorization of " 4763 "this loop without such check by compiling with -Os/-Oz", 4764 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4765 return true; 4766 } 4767 4768 return false; 4769 } 4770 4771 ElementCount 4772 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 4773 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) 4774 return ElementCount::getScalable(0); 4775 4776 if (Hints->isScalableVectorizationDisabled()) { 4777 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 4778 "ScalableVectorizationDisabled", ORE, TheLoop); 4779 return ElementCount::getScalable(0); 4780 } 4781 4782 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); 4783 4784 auto MaxScalableVF = ElementCount::getScalable( 4785 std::numeric_limits<ElementCount::ScalarTy>::max()); 4786 4787 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 4788 // FIXME: While for scalable vectors this is currently sufficient, this should 4789 // be replaced by a more detailed mechanism that filters out specific VFs, 4790 // instead of invalidating vectorization for a whole set of VFs based on the 4791 // MaxVF. 4792 4793 // Disable scalable vectorization if the loop contains unsupported reductions. 4794 if (!canVectorizeReductions(MaxScalableVF)) { 4795 reportVectorizationInfo( 4796 "Scalable vectorization not supported for the reduction " 4797 "operations found in this loop.", 4798 "ScalableVFUnfeasible", ORE, TheLoop); 4799 return ElementCount::getScalable(0); 4800 } 4801 4802 // Disable scalable vectorization if the loop contains any instructions 4803 // with element types not supported for scalable vectors. 
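  // Illustrative example (target dependent): a target whose scalable vectors
  // only hold 8/16/32/64-bit integers and 16/32/64-bit floats would reject a
  // loop operating on fp128 or i128 values here.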
4804 if (any_of(ElementTypesInLoop, [&](Type *Ty) { 4805 return !Ty->isVoidTy() && 4806 !this->TTI.isElementTypeLegalForScalableVector(Ty); 4807 })) { 4808 reportVectorizationInfo("Scalable vectorization is not supported " 4809 "for all element types found in this loop.", 4810 "ScalableVFUnfeasible", ORE, TheLoop); 4811 return ElementCount::getScalable(0); 4812 } 4813 4814 if (Legal->isSafeForAnyVectorWidth()) 4815 return MaxScalableVF; 4816 4817 // Limit MaxScalableVF by the maximum safe dependence distance. 4818 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 4819 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) 4820 MaxVScale = 4821 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 4822 MaxScalableVF = ElementCount::getScalable( 4823 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 4824 if (!MaxScalableVF) 4825 reportVectorizationInfo( 4826 "Max legal vector width too small, scalable vectorization " 4827 "unfeasible.", 4828 "ScalableVFUnfeasible", ORE, TheLoop); 4829 4830 return MaxScalableVF; 4831 } 4832 4833 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( 4834 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { 4835 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4836 unsigned SmallestType, WidestType; 4837 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4838 4839 // Get the maximum safe dependence distance in bits computed by LAA. 4840 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4841 // the memory accesses that is most restrictive (involved in the smallest 4842 // dependence distance). 4843 unsigned MaxSafeElements = 4844 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 4845 4846 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 4847 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 4848 4849 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 4850 << ".\n"); 4851 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 4852 << ".\n"); 4853 4854 // First analyze the UserVF, fall back if the UserVF should be ignored. 4855 if (UserVF) { 4856 auto MaxSafeUserVF = 4857 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 4858 4859 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { 4860 // If `VF=vscale x N` is safe, then so is `VF=N` 4861 if (UserVF.isScalable()) 4862 return FixedScalableVFPair( 4863 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); 4864 else 4865 return UserVF; 4866 } 4867 4868 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 4869 4870 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 4871 // is better to ignore the hint and let the compiler choose a suitable VF. 
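    // Illustrative example (assumed pragma usage): with a max safe fixed VF
    // of 8, '#pragma clang loop vectorize_width(16)' is clamped to 8 below,
    // while 'vectorize_width(16, scalable)' is dropped entirely so the
    // compiler can pick a suitable VF on its own.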
4872 if (!UserVF.isScalable()) { 4873 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4874 << " is unsafe, clamping to max safe VF=" 4875 << MaxSafeFixedVF << ".\n"); 4876 ORE->emit([&]() { 4877 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4878 TheLoop->getStartLoc(), 4879 TheLoop->getHeader()) 4880 << "User-specified vectorization factor " 4881 << ore::NV("UserVectorizationFactor", UserVF) 4882 << " is unsafe, clamping to maximum safe vectorization factor " 4883 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 4884 }); 4885 return MaxSafeFixedVF; 4886 } 4887 4888 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 4889 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4890 << " is ignored because scalable vectors are not " 4891 "available.\n"); 4892 ORE->emit([&]() { 4893 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4894 TheLoop->getStartLoc(), 4895 TheLoop->getHeader()) 4896 << "User-specified vectorization factor " 4897 << ore::NV("UserVectorizationFactor", UserVF) 4898 << " is ignored because the target does not support scalable " 4899 "vectors. The compiler will pick a more suitable value."; 4900 }); 4901 } else { 4902 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 4903 << " is unsafe. Ignoring scalable UserVF.\n"); 4904 ORE->emit([&]() { 4905 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 4906 TheLoop->getStartLoc(), 4907 TheLoop->getHeader()) 4908 << "User-specified vectorization factor " 4909 << ore::NV("UserVectorizationFactor", UserVF) 4910 << " is unsafe. Ignoring the hint to let the compiler pick a " 4911 "more suitable value."; 4912 }); 4913 } 4914 } 4915 4916 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4917 << " / " << WidestType << " bits.\n"); 4918 4919 FixedScalableVFPair Result(ElementCount::getFixed(1), 4920 ElementCount::getScalable(0)); 4921 if (auto MaxVF = 4922 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4923 MaxSafeFixedVF, FoldTailByMasking)) 4924 Result.FixedVF = MaxVF; 4925 4926 if (auto MaxVF = 4927 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, 4928 MaxSafeScalableVF, FoldTailByMasking)) 4929 if (MaxVF.isScalable()) { 4930 Result.ScalableVF = MaxVF; 4931 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 4932 << "\n"); 4933 } 4934 4935 return Result; 4936 } 4937 4938 FixedScalableVFPair 4939 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 4940 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4941 // TODO: It may by useful to do since it's still likely to be dynamically 4942 // uniform if the target can skip. 4943 reportVectorizationFailure( 4944 "Not inserting runtime ptr check for divergent target", 4945 "runtime pointer checks needed. 
Not enabled for divergent target", 4946 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4947 return FixedScalableVFPair::getNone(); 4948 } 4949 4950 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4951 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4952 if (TC == 1) { 4953 reportVectorizationFailure("Single iteration (non) loop", 4954 "loop trip count is one, irrelevant for vectorization", 4955 "SingleIterationLoop", ORE, TheLoop); 4956 return FixedScalableVFPair::getNone(); 4957 } 4958 4959 switch (ScalarEpilogueStatus) { 4960 case CM_ScalarEpilogueAllowed: 4961 return computeFeasibleMaxVF(TC, UserVF, false); 4962 case CM_ScalarEpilogueNotAllowedUsePredicate: 4963 LLVM_FALLTHROUGH; 4964 case CM_ScalarEpilogueNotNeededUsePredicate: 4965 LLVM_DEBUG( 4966 dbgs() << "LV: vector predicate hint/switch found.\n" 4967 << "LV: Not allowing scalar epilogue, creating predicated " 4968 << "vector loop.\n"); 4969 break; 4970 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4971 // fallthrough as a special case of OptForSize 4972 case CM_ScalarEpilogueNotAllowedOptSize: 4973 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4974 LLVM_DEBUG( 4975 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4976 else 4977 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4978 << "count.\n"); 4979 4980 // Bail if runtime checks are required, which are not good when optimising 4981 // for size. 4982 if (runtimeChecksRequired()) 4983 return FixedScalableVFPair::getNone(); 4984 4985 break; 4986 } 4987 4988 // The only loops we can vectorize without a scalar epilogue, are loops with 4989 // a bottom-test and a single exiting block. We'd have to handle the fact 4990 // that not every instruction executes on the last iteration. This will 4991 // require a lane mask which varies through the vector loop body. (TODO) 4992 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 4993 // If there was a tail-folding hint/switch, but we can't fold the tail by 4994 // masking, fallback to a vectorization with a scalar epilogue. 4995 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4996 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4997 "scalar epilogue instead.\n"); 4998 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4999 return computeFeasibleMaxVF(TC, UserVF, false); 5000 } 5001 return FixedScalableVFPair::getNone(); 5002 } 5003 5004 // Now try the tail folding 5005 5006 // Invalidate interleave groups that require an epilogue if we can't mask 5007 // the interleave-group. 5008 if (!useMaskedInterleavedAccesses(TTI)) { 5009 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5010 "No decisions should have been taken at this point"); 5011 // Note: There is no need to invalidate any cost modeling decisions here, as 5012 // non where taken so far. 5013 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5014 } 5015 5016 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); 5017 // Avoid tail folding if the trip count is known to be a multiple of any VF 5018 // we chose. 5019 // FIXME: The condition below pessimises the case for fixed-width vectors, 5020 // when scalable VFs are also candidates for vectorization. 
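  // Worked example (illustrative numbers): with MaxFixedVF = 8 and UserIC = 2,
  // a trip count that the loop guards prove to be a multiple of 16 makes the
  // remainder computed below zero, so the tail can be skipped and the fixed
  // factors are returned without folding.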
5021 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5022 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5023 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5024 "MaxFixedVF must be a power of 2"); 5025 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5026 : MaxFixedVF.getFixedValue(); 5027 ScalarEvolution *SE = PSE.getSE(); 5028 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5029 const SCEV *ExitCount = SE->getAddExpr( 5030 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5031 const SCEV *Rem = SE->getURemExpr( 5032 SE->applyLoopGuards(ExitCount, TheLoop), 5033 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5034 if (Rem->isZero()) { 5035 // Accept MaxFixedVF if we do not have a tail. 5036 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5037 return MaxFactors; 5038 } 5039 } 5040 5041 // If we don't know the precise trip count, or if the trip count that we 5042 // found modulo the vectorization factor is not zero, try to fold the tail 5043 // by masking. 5044 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5045 if (Legal->prepareToFoldTailByMasking()) { 5046 FoldTailByMasking = true; 5047 return MaxFactors; 5048 } 5049 5050 // If there was a tail-folding hint/switch, but we can't fold the tail by 5051 // masking, fallback to a vectorization with a scalar epilogue. 5052 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5053 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5054 "scalar epilogue instead.\n"); 5055 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5056 return MaxFactors; 5057 } 5058 5059 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5060 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5061 return FixedScalableVFPair::getNone(); 5062 } 5063 5064 if (TC == 0) { 5065 reportVectorizationFailure( 5066 "Unable to calculate the loop count due to complex control flow", 5067 "unable to calculate the loop count due to complex control flow", 5068 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5069 return FixedScalableVFPair::getNone(); 5070 } 5071 5072 reportVectorizationFailure( 5073 "Cannot optimize for size and vectorize at the same time.", 5074 "cannot optimize for size and vectorize at the same time. " 5075 "Enable vectorization of this loop with '#pragma clang loop " 5076 "vectorize(enable)' when compiling with -Os/-Oz", 5077 "NoTailLoopWithOptForSize", ORE, TheLoop); 5078 return FixedScalableVFPair::getNone(); 5079 } 5080 5081 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5082 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5083 ElementCount MaxSafeVF, bool FoldTailByMasking) { 5084 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5085 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5086 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5087 : TargetTransformInfo::RGK_FixedWidthVector); 5088 5089 // Convenience function to return the minimum of two ElementCounts. 5090 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5091 assert((LHS.isScalable() == RHS.isScalable()) && 5092 "Scalable flags must match"); 5093 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5094 }; 5095 5096 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5097 // Note that both WidestRegister and WidestType may not be a powers of 2. 
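  // Worked example (illustrative numbers): WidestRegister = 256 bits and
  // WidestType = 24 bits give 256 / 24 = 10 elements, which PowerOf2Floor
  // rounds down to a MaxVectorElementCount of 8 (scalable or fixed according
  // to ComputeScalableMaxVF).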
5098 auto MaxVectorElementCount = ElementCount::get( 5099 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5100 ComputeScalableMaxVF); 5101 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5102 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5103 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5104 5105 if (!MaxVectorElementCount) { 5106 LLVM_DEBUG(dbgs() << "LV: The target has no " 5107 << (ComputeScalableMaxVF ? "scalable" : "fixed") 5108 << " vector registers.\n"); 5109 return ElementCount::getFixed(1); 5110 } 5111 5112 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5113 if (ConstTripCount && 5114 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5115 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { 5116 // If loop trip count (TC) is known at compile time there is no point in 5117 // choosing VF greater than TC (as done in the loop below). Select maximum 5118 // power of two which doesn't exceed TC. 5119 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF 5120 // when the TC is less than or equal to the known number of lanes. 5121 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); 5122 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 5123 "exceeding the constant trip count: " 5124 << ClampedConstTripCount << "\n"); 5125 return ElementCount::getFixed(ClampedConstTripCount); 5126 } 5127 5128 TargetTransformInfo::RegisterKind RegKind = 5129 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5130 : TargetTransformInfo::RGK_FixedWidthVector; 5131 ElementCount MaxVF = MaxVectorElementCount; 5132 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && 5133 TTI.shouldMaximizeVectorBandwidth(RegKind))) { 5134 auto MaxVectorElementCountMaxBW = ElementCount::get( 5135 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5136 ComputeScalableMaxVF); 5137 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5138 5139 // Collect all viable vectorization factors larger than the default MaxVF 5140 // (i.e. MaxVectorElementCount). 5141 SmallVector<ElementCount, 8> VFs; 5142 for (ElementCount VS = MaxVectorElementCount * 2; 5143 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5144 VFs.push_back(VS); 5145 5146 // For each VF calculate its register usage. 5147 auto RUs = calculateRegisterUsage(VFs); 5148 5149 // Select the largest VF which doesn't require more registers than existing 5150 // ones. 5151 for (int i = RUs.size() - 1; i >= 0; --i) { 5152 bool Selected = true; 5153 for (auto &pair : RUs[i].MaxLocalUsers) { 5154 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5155 if (pair.second > TargetNumRegisters) 5156 Selected = false; 5157 } 5158 if (Selected) { 5159 MaxVF = VFs[i]; 5160 break; 5161 } 5162 } 5163 if (ElementCount MinVF = 5164 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5165 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5166 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5167 << ") with target's minimum: " << MinVF << '\n'); 5168 MaxVF = MinVF; 5169 } 5170 } 5171 5172 // Invalidate any widening decisions we might have made, in case the loop 5173 // requires prediction (decided later), but we have already made some 5174 // load/store widening decisions. 
5175 invalidateCostModelingDecisions(); 5176 } 5177 return MaxVF; 5178 } 5179 5180 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const { 5181 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 5182 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 5183 auto Min = Attr.getVScaleRangeMin(); 5184 auto Max = Attr.getVScaleRangeMax(); 5185 if (Max && Min == Max) 5186 return Max; 5187 } 5188 5189 return TTI.getVScaleForTuning(); 5190 } 5191 5192 bool LoopVectorizationCostModel::isMoreProfitable( 5193 const VectorizationFactor &A, const VectorizationFactor &B) const { 5194 InstructionCost CostA = A.Cost; 5195 InstructionCost CostB = B.Cost; 5196 5197 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5198 5199 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5200 MaxTripCount) { 5201 // If we are folding the tail and the trip count is a known (possibly small) 5202 // constant, the trip count will be rounded up to an integer number of 5203 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5204 // which we compare directly. When not folding the tail, the total cost will 5205 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 5206 // approximated with the per-lane cost below instead of using the tripcount 5207 // as here. 5208 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 5209 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 5210 return RTCostA < RTCostB; 5211 } 5212 5213 // Improve estimate for the vector width if it is scalable. 5214 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 5215 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 5216 if (Optional<unsigned> VScale = getVScaleForTuning()) { 5217 if (A.Width.isScalable()) 5218 EstimatedWidthA *= VScale.getValue(); 5219 if (B.Width.isScalable()) 5220 EstimatedWidthB *= VScale.getValue(); 5221 } 5222 5223 // Assume vscale may be larger than 1 (or the value being tuned for), 5224 // so that scalable vectorization is slightly favorable over fixed-width 5225 // vectorization. 5226 if (A.Width.isScalable() && !B.Width.isScalable()) 5227 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 5228 5229 // To avoid the need for FP division: 5230 // (CostA / A.Width) < (CostB / B.Width) 5231 // <=> (CostA * B.Width) < (CostB * A.Width) 5232 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 5233 } 5234 5235 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( 5236 const ElementCountSet &VFCandidates) { 5237 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5238 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5239 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5240 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5241 "Expected Scalar VF to be a candidate"); 5242 5243 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5244 ExpectedCost); 5245 VectorizationFactor ChosenFactor = ScalarCost; 5246 5247 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5248 if (ForceVectorization && VFCandidates.size() > 1) { 5249 // Ignore scalar width, because the user explicitly wants vectorization. 5250 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5251 // evaluation. 
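    // Starting from the maximum representable cost means any candidate VF
    // with a finite cost compares as more profitable, so a vector factor is
    // selected below even if its cost exceeds that of the scalar loop.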
5252 ChosenFactor.Cost = InstructionCost::getMax(); 5253 } 5254 5255 SmallVector<InstructionVFPair> InvalidCosts; 5256 for (const auto &i : VFCandidates) { 5257 // The cost for scalar VF=1 is already calculated, so ignore it. 5258 if (i.isScalar()) 5259 continue; 5260 5261 VectorizationCostTy C = expectedCost(i, &InvalidCosts); 5262 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5263 5264 #ifndef NDEBUG 5265 unsigned AssumedMinimumVscale = 1; 5266 if (Optional<unsigned> VScale = getVScaleForTuning()) 5267 AssumedMinimumVscale = *VScale; 5268 unsigned Width = 5269 Candidate.Width.isScalable() 5270 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5271 : Candidate.Width.getFixedValue(); 5272 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5273 << " costs: " << (Candidate.Cost / Width)); 5274 if (i.isScalable()) 5275 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5276 << AssumedMinimumVscale << ")"); 5277 LLVM_DEBUG(dbgs() << ".\n"); 5278 #endif 5279 5280 if (!C.second && !ForceVectorization) { 5281 LLVM_DEBUG( 5282 dbgs() << "LV: Not considering vector loop of width " << i 5283 << " because it will not generate any vector instructions.\n"); 5284 continue; 5285 } 5286 5287 // If profitable add it to ProfitableVF list. 5288 if (isMoreProfitable(Candidate, ScalarCost)) 5289 ProfitableVFs.push_back(Candidate); 5290 5291 if (isMoreProfitable(Candidate, ChosenFactor)) 5292 ChosenFactor = Candidate; 5293 } 5294 5295 // Emit a report of VFs with invalid costs in the loop. 5296 if (!InvalidCosts.empty()) { 5297 // Group the remarks per instruction, keeping the instruction order from 5298 // InvalidCosts. 5299 std::map<Instruction *, unsigned> Numbering; 5300 unsigned I = 0; 5301 for (auto &Pair : InvalidCosts) 5302 if (!Numbering.count(Pair.first)) 5303 Numbering[Pair.first] = I++; 5304 5305 // Sort the list, first on instruction(number) then on VF. 5306 llvm::sort(InvalidCosts, 5307 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 5308 if (Numbering[A.first] != Numbering[B.first]) 5309 return Numbering[A.first] < Numbering[B.first]; 5310 ElementCountComparator ECC; 5311 return ECC(A.second, B.second); 5312 }); 5313 5314 // For a list of ordered instruction-vf pairs: 5315 // [(load, vf1), (load, vf2), (store, vf1)] 5316 // Group the instructions together to emit separate remarks for: 5317 // load (vf1, vf2) 5318 // store (vf1) 5319 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 5320 auto Subset = ArrayRef<InstructionVFPair>(); 5321 do { 5322 if (Subset.empty()) 5323 Subset = Tail.take_front(1); 5324 5325 Instruction *I = Subset.front().first; 5326 5327 // If the next instruction is different, or if there are no other pairs, 5328 // emit a remark for the collated subset. e.g. 5329 // [(load, vf1), (load, vf2))] 5330 // to emit: 5331 // remark: invalid costs for 'load' at VF=(vf, vf2) 5332 if (Subset == Tail || Tail[Subset.size()].first != I) { 5333 std::string OutString; 5334 raw_string_ostream OS(OutString); 5335 assert(!Subset.empty() && "Unexpected empty range"); 5336 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5337 for (auto &Pair : Subset) 5338 OS << (Pair.second == Subset.front().second ? 
"" : ", ") 5339 << Pair.second; 5340 OS << "):"; 5341 if (auto *CI = dyn_cast<CallInst>(I)) 5342 OS << " call to " << CI->getCalledFunction()->getName(); 5343 else 5344 OS << " " << I->getOpcodeName(); 5345 OS.flush(); 5346 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5347 Tail = Tail.drop_front(Subset.size()); 5348 Subset = {}; 5349 } else 5350 // Grow the subset by one element 5351 Subset = Tail.take_front(Subset.size() + 1); 5352 } while (!Tail.empty()); 5353 } 5354 5355 if (!EnableCondStoresVectorization && NumPredStores) { 5356 reportVectorizationFailure("There are conditional stores.", 5357 "store that is conditionally executed prevents vectorization", 5358 "ConditionalStore", ORE, TheLoop); 5359 ChosenFactor = ScalarCost; 5360 } 5361 5362 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5363 ChosenFactor.Cost >= ScalarCost.Cost) dbgs() 5364 << "LV: Vectorization seems to be not beneficial, " 5365 << "but was forced by a user.\n"); 5366 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5367 return ChosenFactor; 5368 } 5369 5370 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5371 const Loop &L, ElementCount VF) const { 5372 // Cross iteration phis such as reductions need special handling and are 5373 // currently unsupported. 5374 if (any_of(L.getHeader()->phis(), 5375 [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); })) 5376 return false; 5377 5378 // Phis with uses outside of the loop require special handling and are 5379 // currently unsupported. 5380 for (auto &Entry : Legal->getInductionVars()) { 5381 // Look for uses of the value of the induction at the last iteration. 5382 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5383 for (User *U : PostInc->users()) 5384 if (!L.contains(cast<Instruction>(U))) 5385 return false; 5386 // Look for uses of penultimate value of the induction. 5387 for (User *U : Entry.first->users()) 5388 if (!L.contains(cast<Instruction>(U))) 5389 return false; 5390 } 5391 5392 // Induction variables that are widened require special handling that is 5393 // currently not supported. 5394 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5395 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5396 this->isProfitableToScalarize(Entry.first, VF)); 5397 })) 5398 return false; 5399 5400 // Epilogue vectorization code has not been auditted to ensure it handles 5401 // non-latch exits properly. It may be fine, but it needs auditted and 5402 // tested. 5403 if (L.getExitingBlock() != L.getLoopLatch()) 5404 return false; 5405 5406 return true; 5407 } 5408 5409 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5410 const ElementCount VF) const { 5411 // FIXME: We need a much better cost-model to take different parameters such 5412 // as register pressure, code size increase and cost of extra branches into 5413 // account. For now we apply a very crude heuristic and only consider loops 5414 // with vectorization factors larger than a certain value. 5415 // We also consider epilogue vectorization unprofitable for targets that don't 5416 // consider interleaving beneficial (eg. MVE). 5417 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5418 return false; 5419 // FIXME: We should consider changing the threshold for scalable 5420 // vectors to take VScaleForTuning into account. 
5421 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) 5422 return true; 5423 return false; 5424 } 5425 5426 VectorizationFactor 5427 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5428 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5429 VectorizationFactor Result = VectorizationFactor::Disabled(); 5430 if (!EnableEpilogueVectorization) { 5431 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5432 return Result; 5433 } 5434 5435 if (!isScalarEpilogueAllowed()) { 5436 LLVM_DEBUG( 5437 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5438 "allowed.\n";); 5439 return Result; 5440 } 5441 5442 // Not really a cost consideration, but check for unsupported cases here to 5443 // simplify the logic. 5444 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5445 LLVM_DEBUG( 5446 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5447 "not a supported candidate.\n";); 5448 return Result; 5449 } 5450 5451 if (EpilogueVectorizationForceVF > 1) { 5452 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5453 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5454 if (LVP.hasPlanWithVF(ForcedEC)) 5455 return {ForcedEC, 0, 0}; 5456 else { 5457 LLVM_DEBUG( 5458 dbgs() 5459 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5460 return Result; 5461 } 5462 } 5463 5464 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5465 TheLoop->getHeader()->getParent()->hasMinSize()) { 5466 LLVM_DEBUG( 5467 dbgs() 5468 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5469 return Result; 5470 } 5471 5472 if (!isEpilogueVectorizationProfitable(MainLoopVF)) { 5473 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5474 "this loop\n"); 5475 return Result; 5476 } 5477 5478 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5479 // the main loop handles 8 lanes per iteration. We could still benefit from 5480 // vectorizing the epilogue loop with VF=4. 5481 ElementCount EstimatedRuntimeVF = MainLoopVF; 5482 if (MainLoopVF.isScalable()) { 5483 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5484 if (Optional<unsigned> VScale = getVScaleForTuning()) 5485 EstimatedRuntimeVF *= *VScale; 5486 } 5487 5488 for (auto &NextVF : ProfitableVFs) 5489 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5490 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || 5491 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && 5492 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && 5493 LVP.hasPlanWithVF(NextVF.Width)) 5494 Result = NextVF; 5495 5496 if (Result != VectorizationFactor::Disabled()) 5497 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5498 << Result.Width << "\n";); 5499 return Result; 5500 } 5501 5502 std::pair<unsigned, unsigned> 5503 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5504 unsigned MinWidth = -1U; 5505 unsigned MaxWidth = 8; 5506 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5507 // For in-loop reductions, no element types are added to ElementTypesInLoop 5508 // if there are no loads/stores in the loop. In this case, check through the 5509 // reduction variables to determine the maximum width. 5510 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5511 // Reset MaxWidth so that we can find the smallest type used by recurrences 5512 // in the loop. 
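    // Illustrative example (hypothetical loop): an in-loop reduction that
    // sums i8 values after they are sign-extended to an i32 accumulator
    // contributes min(8, 32) = 8 bits to the computation below, so a narrow
    // width is still found even though ElementTypesInLoop is empty.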
5513 MaxWidth = -1U; 5514 for (auto &PhiDescriptorPair : Legal->getReductionVars()) { 5515 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5516 // When finding the min width used by the recurrence we need to account 5517 // for casts on the input operands of the recurrence. 5518 MaxWidth = std::min<unsigned>( 5519 MaxWidth, std::min<unsigned>( 5520 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5521 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5522 } 5523 } else { 5524 for (Type *T : ElementTypesInLoop) { 5525 MinWidth = std::min<unsigned>( 5526 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5527 MaxWidth = std::max<unsigned>( 5528 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize()); 5529 } 5530 } 5531 return {MinWidth, MaxWidth}; 5532 } 5533 5534 void LoopVectorizationCostModel::collectElementTypesForWidening() { 5535 ElementTypesInLoop.clear(); 5536 // For each block. 5537 for (BasicBlock *BB : TheLoop->blocks()) { 5538 // For each instruction in the loop. 5539 for (Instruction &I : BB->instructionsWithoutDebug()) { 5540 Type *T = I.getType(); 5541 5542 // Skip ignored values. 5543 if (ValuesToIgnore.count(&I)) 5544 continue; 5545 5546 // Only examine Loads, Stores and PHINodes. 5547 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5548 continue; 5549 5550 // Examine PHI nodes that are reduction variables. Update the type to 5551 // account for the recurrence type. 5552 if (auto *PN = dyn_cast<PHINode>(&I)) { 5553 if (!Legal->isReductionVariable(PN)) 5554 continue; 5555 const RecurrenceDescriptor &RdxDesc = 5556 Legal->getReductionVars().find(PN)->second; 5557 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5558 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5559 RdxDesc.getRecurrenceType(), 5560 TargetTransformInfo::ReductionFlags())) 5561 continue; 5562 T = RdxDesc.getRecurrenceType(); 5563 } 5564 5565 // Examine the stored values. 5566 if (auto *ST = dyn_cast<StoreInst>(&I)) 5567 T = ST->getValueOperand()->getType(); 5568 5569 assert(T->isSized() && 5570 "Expected the load/store/recurrence type to be sized"); 5571 5572 ElementTypesInLoop.insert(T); 5573 } 5574 } 5575 } 5576 5577 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5578 unsigned LoopCost) { 5579 // -- The interleave heuristics -- 5580 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5581 // There are many micro-architectural considerations that we can't predict 5582 // at this level. For example, frontend pressure (on decode or fetch) due to 5583 // code size, or the number and capabilities of the execution ports. 5584 // 5585 // We use the following heuristics to select the interleave count: 5586 // 1. If the code has reductions, then we interleave to break the cross 5587 // iteration dependency. 5588 // 2. If the loop is really small, then we interleave to reduce the loop 5589 // overhead. 5590 // 3. We don't interleave if we think that we will spill registers to memory 5591 // due to the increased register pressure. 5592 5593 if (!isScalarEpilogueAllowed()) 5594 return 1; 5595 5596 // We used the distance for the interleave count. 5597 if (Legal->getMaxSafeDepDistBytes() != -1U) 5598 return 1; 5599 5600 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5601 const bool HasReductions = !Legal->getReductionVars().empty(); 5602 // Do not interleave loops with a relatively small known or estimated trip 5603 // count. 
But we will interleave when InterleaveSmallLoopScalarReduction is 5604 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5605 // because with the above conditions interleaving can expose ILP and break 5606 // cross iteration dependences for reductions. 5607 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5608 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5609 return 1; 5610 5611 // If we did not calculate the cost for VF (because the user selected the VF) 5612 // then we calculate the cost of VF here. 5613 if (LoopCost == 0) { 5614 InstructionCost C = expectedCost(VF).first; 5615 assert(C.isValid() && "Expected to have chosen a VF with valid cost"); 5616 LoopCost = *C.getValue(); 5617 5618 // Loop body is free and there is no need for interleaving. 5619 if (LoopCost == 0) 5620 return 1; 5621 } 5622 5623 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5624 // We divide by these constants so assume that we have at least one 5625 // instruction that uses at least one register. 5626 for (auto& pair : R.MaxLocalUsers) { 5627 pair.second = std::max(pair.second, 1U); 5628 } 5629 5630 // We calculate the interleave count using the following formula. 5631 // Subtract the number of loop invariants from the number of available 5632 // registers. These registers are used by all of the interleaved instances. 5633 // Next, divide the remaining registers by the number of registers that is 5634 // required by the loop, in order to estimate how many parallel instances 5635 // fit without causing spills. All of this is rounded down if necessary to be 5636 // a power of two. We want power of two interleave count to simplify any 5637 // addressing operations or alignment considerations. 5638 // We also want power of two interleave counts to ensure that the induction 5639 // variable of the vector loop wraps to zero, when tail is folded by masking; 5640 // this currently happens when OptForSize, in which case IC is set to 1 above. 5641 unsigned IC = UINT_MAX; 5642 5643 for (auto& pair : R.MaxLocalUsers) { 5644 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5645 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5646 << " registers of " 5647 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5648 if (VF.isScalar()) { 5649 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5650 TargetNumRegisters = ForceTargetNumScalarRegs; 5651 } else { 5652 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5653 TargetNumRegisters = ForceTargetNumVectorRegs; 5654 } 5655 unsigned MaxLocalUsers = pair.second; 5656 unsigned LoopInvariantRegs = 0; 5657 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5658 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5659 5660 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5661 // Don't count the induction variable as interleaved. 5662 if (EnableIndVarRegisterHeur) { 5663 TmpIC = 5664 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5665 std::max(1U, (MaxLocalUsers - 1))); 5666 } 5667 5668 IC = std::min(IC, TmpIC); 5669 } 5670 5671 // Clamp the interleave ranges to reasonable counts. 5672 unsigned MaxInterleaveCount = 5673 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 5674 5675 // Check if the user has overridden the max. 
5676 if (VF.isScalar()) { 5677 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5678 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5679 } else { 5680 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5681 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5682 } 5683 5684 // If trip count is known or estimated compile time constant, limit the 5685 // interleave count to be less than the trip count divided by VF, provided it 5686 // is at least 1. 5687 // 5688 // For scalable vectors we can't know if interleaving is beneficial. It may 5689 // not be beneficial for small loops if none of the lanes in the second vector 5690 // iterations is enabled. However, for larger loops, there is likely to be a 5691 // similar benefit as for fixed-width vectors. For now, we choose to leave 5692 // the InterleaveCount as if vscale is '1', although if some information about 5693 // the vector is known (e.g. min vector size), we can make a better decision. 5694 if (BestKnownTC) { 5695 MaxInterleaveCount = 5696 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 5697 // Make sure MaxInterleaveCount is greater than 0. 5698 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 5699 } 5700 5701 assert(MaxInterleaveCount > 0 && 5702 "Maximum interleave count must be greater than 0"); 5703 5704 // Clamp the calculated IC to be between the 1 and the max interleave count 5705 // that the target and trip count allows. 5706 if (IC > MaxInterleaveCount) 5707 IC = MaxInterleaveCount; 5708 else 5709 // Make sure IC is greater than 0. 5710 IC = std::max(1u, IC); 5711 5712 assert(IC > 0 && "Interleave count must be greater than 0."); 5713 5714 // Interleave if we vectorized this loop and there is a reduction that could 5715 // benefit from interleaving. 5716 if (VF.isVector() && HasReductions) { 5717 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5718 return IC; 5719 } 5720 5721 // For any scalar loop that either requires runtime checks or predication we 5722 // are better off leaving this to the unroller. Note that if we've already 5723 // vectorized the loop we will have done the runtime check and so interleaving 5724 // won't require further checks. 5725 bool ScalarInterleavingRequiresPredication = 5726 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5727 return Legal->blockNeedsPredication(BB); 5728 })); 5729 bool ScalarInterleavingRequiresRuntimePointerCheck = 5730 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5731 5732 // We want to interleave small loops in order to reduce the loop overhead and 5733 // potentially expose ILP opportunities. 5734 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5735 << "LV: IC is " << IC << '\n' 5736 << "LV: VF is " << VF << '\n'); 5737 const bool AggressivelyInterleaveReductions = 5738 TTI.enableAggressiveInterleaving(HasReductions); 5739 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5740 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5741 // We assume that the cost overhead is 1 and we use the cost model 5742 // to estimate the cost of the loop and interleave until the cost of the 5743 // loop overhead is about 5% of the cost of the loop. 5744 unsigned SmallIC = 5745 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5746 5747 // Interleave until store/load ports (estimated by max interleave count) are 5748 // saturated. 
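// Illustrative example (assumed numbers, not from the original source): with
// IC = 8 and a loop containing 2 stores and 4 loads, StoresIC = 8 / 2 = 4 and
// LoadsIC = 8 / 4 = 2, so an interleave count of 4 is already enough to
// saturate the store ports in the check below.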
5749 unsigned NumStores = Legal->getNumStores(); 5750 unsigned NumLoads = Legal->getNumLoads(); 5751 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5752 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5753 5754 // There is little point in interleaving for reductions containing selects 5755 // and compares when VF=1 since it may just create more overhead than it's 5756 // worth for loops with small trip counts. This is because we still have to 5757 // do the final reduction after the loop. 5758 bool HasSelectCmpReductions = 5759 HasReductions && 5760 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5761 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5762 return RecurrenceDescriptor::isSelectCmpRecurrenceKind( 5763 RdxDesc.getRecurrenceKind()); 5764 }); 5765 if (HasSelectCmpReductions) { 5766 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5767 return 1; 5768 } 5769 5770 // If we have a scalar reduction (vector reductions are already dealt with 5771 // by this point), we can increase the critical path length if the loop 5772 // we're interleaving is inside another loop. For tree-wise reductions 5773 // set the limit to 2, and for ordered reductions it's best to disable 5774 // interleaving entirely. 5775 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5776 bool HasOrderedReductions = 5777 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5778 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5779 return RdxDesc.isOrdered(); 5780 }); 5781 if (HasOrderedReductions) { 5782 LLVM_DEBUG( 5783 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5784 return 1; 5785 } 5786 5787 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5788 SmallIC = std::min(SmallIC, F); 5789 StoresIC = std::min(StoresIC, F); 5790 LoadsIC = std::min(LoadsIC, F); 5791 } 5792 5793 if (EnableLoadStoreRuntimeInterleave && 5794 std::max(StoresIC, LoadsIC) > SmallIC) { 5795 LLVM_DEBUG( 5796 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5797 return std::max(StoresIC, LoadsIC); 5798 } 5799 5800 // If there are scalar reductions and TTI has enabled aggressive 5801 // interleaving for reductions, we will interleave to expose ILP. 5802 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5803 AggressivelyInterleaveReductions) { 5804 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5805 // Interleave no less than SmallIC but not as aggressive as the normal IC 5806 // to satisfy the rare situation when resources are too limited. 5807 return std::max(IC / 2, SmallIC); 5808 } else { 5809 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5810 return SmallIC; 5811 } 5812 } 5813 5814 // Interleave if this is a large loop (small loops are already dealt with by 5815 // this point) that could benefit from interleaving. 5816 if (AggressivelyInterleaveReductions) { 5817 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5818 return IC; 5819 } 5820 5821 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5822 return 1; 5823 } 5824 5825 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5826 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5827 // This function calculates the register usage by measuring the highest number 5828 // of values that are alive at a single location. Obviously, this is a very 5829 // rough estimation. We scan the loop in a topological order in order and 5830 // assign a number to each instruction. 
We use RPO to ensure that defs are 5831 // met before their users. We assume that each instruction that has in-loop 5832 // users starts an interval. We record every time that an in-loop value is 5833 // used, so we have a list of the first and last occurrences of each 5834 // instruction. Next, we transpose this data structure into a multi map that 5835 // holds the list of intervals that *end* at a specific location. This multi 5836 // map allows us to perform a linear search. We scan the instructions linearly 5837 // and record each time that a new interval starts, by placing it in a set. 5838 // If we find this value in the multi-map then we remove it from the set. 5839 // The max register usage is the maximum size of the set. 5840 // We also search for instructions that are defined outside the loop, but are 5841 // used inside the loop. We need this number separately from the max-interval 5842 // usage number because when we unroll, loop-invariant values do not take 5843 // more register. 5844 LoopBlocksDFS DFS(TheLoop); 5845 DFS.perform(LI); 5846 5847 RegisterUsage RU; 5848 5849 // Each 'key' in the map opens a new interval. The values 5850 // of the map are the index of the 'last seen' usage of the 5851 // instruction that is the key. 5852 using IntervalMap = DenseMap<Instruction *, unsigned>; 5853 5854 // Maps instruction to its index. 5855 SmallVector<Instruction *, 64> IdxToInstr; 5856 // Marks the end of each interval. 5857 IntervalMap EndPoint; 5858 // Saves the list of instruction indices that are used in the loop. 5859 SmallPtrSet<Instruction *, 8> Ends; 5860 // Saves the list of values that are used in the loop but are 5861 // defined outside the loop, such as arguments and constants. 5862 SmallPtrSet<Value *, 8> LoopInvariants; 5863 5864 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5865 for (Instruction &I : BB->instructionsWithoutDebug()) { 5866 IdxToInstr.push_back(&I); 5867 5868 // Save the end location of each USE. 5869 for (Value *U : I.operands()) { 5870 auto *Instr = dyn_cast<Instruction>(U); 5871 5872 // Ignore non-instruction values such as arguments, constants, etc. 5873 if (!Instr) 5874 continue; 5875 5876 // If this instruction is outside the loop then record it and continue. 5877 if (!TheLoop->contains(Instr)) { 5878 LoopInvariants.insert(Instr); 5879 continue; 5880 } 5881 5882 // Overwrite previous end points. 5883 EndPoint[Instr] = IdxToInstr.size(); 5884 Ends.insert(Instr); 5885 } 5886 } 5887 } 5888 5889 // Saves the list of intervals that end with the index in 'key'. 5890 using InstrList = SmallVector<Instruction *, 2>; 5891 DenseMap<unsigned, InstrList> TransposeEnds; 5892 5893 // Transpose the EndPoints to a list of values that end at each index. 5894 for (auto &Interval : EndPoint) 5895 TransposeEnds[Interval.second].push_back(Interval.first); 5896 5897 SmallPtrSet<Instruction *, 8> OpenIntervals; 5898 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5899 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5900 5901 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5902 5903 auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned { 5904 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5905 return 0; 5906 return TTI.getRegUsageForType(VectorType::get(Ty, VF)); 5907 }; 5908 5909 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5910 Instruction *I = IdxToInstr[i]; 5911 5912 // Remove all of the instructions that end at this location. 
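// Illustrative example (not from the original source): for a chain
//   %a = load ...; %b = add %a, 1; %c = mul %b, %b
// the interval for %a ends right after its last use in %b, so by the time the
// scan reaches %c only %b remains open (with %c added next); the peak size of
// OpenIntervals approximates the number of simultaneously live values.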
5913 InstrList &List = TransposeEnds[i]; 5914 for (Instruction *ToRemove : List) 5915 OpenIntervals.erase(ToRemove); 5916 5917 // Ignore instructions that are never used within the loop. 5918 if (!Ends.count(I)) 5919 continue; 5920 5921 // Skip ignored values. 5922 if (ValuesToIgnore.count(I)) 5923 continue; 5924 5925 // For each VF find the maximum usage of registers. 5926 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5927 // Count the number of live intervals. 5928 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5929 5930 if (VFs[j].isScalar()) { 5931 for (auto Inst : OpenIntervals) { 5932 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5933 if (RegUsage.find(ClassID) == RegUsage.end()) 5934 RegUsage[ClassID] = 1; 5935 else 5936 RegUsage[ClassID] += 1; 5937 } 5938 } else { 5939 collectUniformsAndScalars(VFs[j]); 5940 for (auto Inst : OpenIntervals) { 5941 // Skip ignored values for VF > 1. 5942 if (VecValuesToIgnore.count(Inst)) 5943 continue; 5944 if (isScalarAfterVectorization(Inst, VFs[j])) { 5945 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5946 if (RegUsage.find(ClassID) == RegUsage.end()) 5947 RegUsage[ClassID] = 1; 5948 else 5949 RegUsage[ClassID] += 1; 5950 } else { 5951 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5952 if (RegUsage.find(ClassID) == RegUsage.end()) 5953 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5954 else 5955 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5956 } 5957 } 5958 } 5959 5960 for (auto& pair : RegUsage) { 5961 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5962 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5963 else 5964 MaxUsages[j][pair.first] = pair.second; 5965 } 5966 } 5967 5968 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5969 << OpenIntervals.size() << '\n'); 5970 5971 // Add the current instruction to the list of open intervals. 5972 OpenIntervals.insert(I); 5973 } 5974 5975 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5976 SmallMapVector<unsigned, unsigned, 4> Invariant; 5977 5978 for (auto Inst : LoopInvariants) { 5979 unsigned Usage = 5980 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5981 unsigned ClassID = 5982 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 5983 if (Invariant.find(ClassID) == Invariant.end()) 5984 Invariant[ClassID] = Usage; 5985 else 5986 Invariant[ClassID] += Usage; 5987 } 5988 5989 LLVM_DEBUG({ 5990 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5991 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5992 << " item\n"; 5993 for (const auto &pair : MaxUsages[i]) { 5994 dbgs() << "LV(REG): RegisterClass: " 5995 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5996 << " registers\n"; 5997 } 5998 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5999 << " item\n"; 6000 for (const auto &pair : Invariant) { 6001 dbgs() << "LV(REG): RegisterClass: " 6002 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6003 << " registers\n"; 6004 } 6005 }); 6006 6007 RU.LoopInvariantRegs = Invariant; 6008 RU.MaxLocalUsers = MaxUsages[i]; 6009 RUs[i] = RU; 6010 } 6011 6012 return RUs; 6013 } 6014 6015 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 6016 ElementCount VF) { 6017 // TODO: Cost model for emulated masked load/store is completely 6018 // broken. 
This hack guides the cost model to use an artificially
6019 // high enough value to practically disable vectorization with such
6020 // operations, except where the previously deployed legality hack allowed
6021 // using very low cost values. This is to avoid regressions coming simply
6022 // from moving "masked load/store" check from legality to cost model.
6023 // Masked Load/Gather emulation was previously never allowed.
6024 // Limited number of Masked Store/Scatter emulation was allowed.
6025 assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6026 return isa<LoadInst>(I) ||
6027 (isa<StoreInst>(I) &&
6028 NumPredStores > NumberOfStoresToPredicate);
6029 }
6030
6031 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6032 // If we aren't vectorizing the loop, or if we've already collected the
6033 // instructions to scalarize, there's nothing to do. Collection may already
6034 // have occurred if we have a user-selected VF and are now computing the
6035 // expected cost for interleaving.
6036 if (VF.isScalar() || VF.isZero() ||
6037 InstsToScalarize.find(VF) != InstsToScalarize.end())
6038 return;
6039
6040 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6041 // not profitable to scalarize any instructions, the presence of VF in the
6042 // map will indicate that we've analyzed it already.
6043 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6044
6045 // Find all the instructions that are scalar with predication in the loop and
6046 // determine if it would be better to not if-convert the blocks they are in.
6047 // If so, we also record the instructions to scalarize.
6048 for (BasicBlock *BB : TheLoop->blocks()) {
6049 if (!blockNeedsPredicationForAnyReason(BB))
6050 continue;
6051 for (Instruction &I : *BB)
6052 if (isScalarWithPredication(&I, VF)) {
6053 ScalarCostsTy ScalarCosts;
6054 // Do not apply discount if scalable, because that would lead to
6055 // invalid scalarization costs.
6056 // Do not apply discount logic if hacked cost is needed
6057 // for emulated masked memrefs.
6058 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6059 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6060 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6061 // Remember that BB will remain after vectorization.
6062 PredicatedBBsAfterVectorization.insert(BB);
6063 }
6064 }
6065 }
6066
6067 int LoopVectorizationCostModel::computePredInstDiscount(
6068 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6069 assert(!isUniformAfterVectorization(PredInst, VF) &&
6070 "Instruction marked uniform-after-vectorization will be predicated");
6071
6072 // Initialize the discount to zero, meaning that the scalar version and the
6073 // vector version cost the same.
6074 InstructionCost Discount = 0;
6075
6076 // Holds instructions to analyze. The instructions we visit are mapped in
6077 // ScalarCosts. Those instructions are the ones that would be scalarized if
6078 // we find that the scalar version costs less.
6079 SmallVector<Instruction *, 8> Worklist;
6080
6081 // Returns true if the given instruction can be scalarized.
6082 auto canBeScalarized = [&](Instruction *I) -> bool {
6083 // We only attempt to scalarize instructions forming a single-use chain
6084 // from the original predicated block that would otherwise be vectorized.
6085 // Although not strictly necessary, we give up on instructions we know will 6086 // already be scalar to avoid traversing chains that are unlikely to be 6087 // beneficial. 6088 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6089 isScalarAfterVectorization(I, VF)) 6090 return false; 6091 6092 // If the instruction is scalar with predication, it will be analyzed 6093 // separately. We ignore it within the context of PredInst. 6094 if (isScalarWithPredication(I, VF)) 6095 return false; 6096 6097 // If any of the instruction's operands are uniform after vectorization, 6098 // the instruction cannot be scalarized. This prevents, for example, a 6099 // masked load from being scalarized. 6100 // 6101 // We assume we will only emit a value for lane zero of an instruction 6102 // marked uniform after vectorization, rather than VF identical values. 6103 // Thus, if we scalarize an instruction that uses a uniform, we would 6104 // create uses of values corresponding to the lanes we aren't emitting code 6105 // for. This behavior can be changed by allowing getScalarValue to clone 6106 // the lane zero values for uniforms rather than asserting. 6107 for (Use &U : I->operands()) 6108 if (auto *J = dyn_cast<Instruction>(U.get())) 6109 if (isUniformAfterVectorization(J, VF)) 6110 return false; 6111 6112 // Otherwise, we can scalarize the instruction. 6113 return true; 6114 }; 6115 6116 // Compute the expected cost discount from scalarizing the entire expression 6117 // feeding the predicated instruction. We currently only consider expressions 6118 // that are single-use instruction chains. 6119 Worklist.push_back(PredInst); 6120 while (!Worklist.empty()) { 6121 Instruction *I = Worklist.pop_back_val(); 6122 6123 // If we've already analyzed the instruction, there's nothing to do. 6124 if (ScalarCosts.find(I) != ScalarCosts.end()) 6125 continue; 6126 6127 // Compute the cost of the vector instruction. Note that this cost already 6128 // includes the scalarization overhead of the predicated instruction. 6129 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6130 6131 // Compute the cost of the scalarized instruction. This cost is the cost of 6132 // the instruction as if it wasn't if-converted and instead remained in the 6133 // predicated block. We will scale this cost by block probability after 6134 // computing the scalarization overhead. 6135 InstructionCost ScalarCost = 6136 VF.getFixedValue() * 6137 getInstructionCost(I, ElementCount::getFixed(1)).first; 6138 6139 // Compute the scalarization overhead of needed insertelement instructions 6140 // and phi nodes. 6141 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 6142 ScalarCost += TTI.getScalarizationOverhead( 6143 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6144 APInt::getAllOnes(VF.getFixedValue()), true, false); 6145 ScalarCost += 6146 VF.getFixedValue() * 6147 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6148 } 6149 6150 // Compute the scalarization overhead of needed extractelement 6151 // instructions. For each of the instruction's operands, if the operand can 6152 // be scalarized, add it to the worklist; otherwise, account for the 6153 // overhead. 
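// Illustrative example (assumed costs, not from the original source): at
// VF = 4, an operand produced by a vectorized instruction outside this chain
// costs 4 extractelements; if the whole chain is scalarized instead, the
// accumulated ScalarCost is divided by getReciprocalPredBlockProb() below
// (assuming the predicated block runs for roughly half the lanes), and a
// positive VectorCost - ScalarCost difference counts in favour of
// scalarization.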
6154 for (Use &U : I->operands()) 6155 if (auto *J = dyn_cast<Instruction>(U.get())) { 6156 assert(VectorType::isValidElementType(J->getType()) && 6157 "Instruction has non-scalar type"); 6158 if (canBeScalarized(J)) 6159 Worklist.push_back(J); 6160 else if (needsExtract(J, VF)) { 6161 ScalarCost += TTI.getScalarizationOverhead( 6162 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6163 APInt::getAllOnes(VF.getFixedValue()), false, true); 6164 } 6165 } 6166 6167 // Scale the total scalar cost by block probability. 6168 ScalarCost /= getReciprocalPredBlockProb(); 6169 6170 // Compute the discount. A non-negative discount means the vector version 6171 // of the instruction costs more, and scalarizing would be beneficial. 6172 Discount += VectorCost - ScalarCost; 6173 ScalarCosts[I] = ScalarCost; 6174 } 6175 6176 return *Discount.getValue(); 6177 } 6178 6179 LoopVectorizationCostModel::VectorizationCostTy 6180 LoopVectorizationCostModel::expectedCost( 6181 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 6182 VectorizationCostTy Cost; 6183 6184 // For each block. 6185 for (BasicBlock *BB : TheLoop->blocks()) { 6186 VectorizationCostTy BlockCost; 6187 6188 // For each instruction in the old loop. 6189 for (Instruction &I : BB->instructionsWithoutDebug()) { 6190 // Skip ignored values. 6191 if (ValuesToIgnore.count(&I) || 6192 (VF.isVector() && VecValuesToIgnore.count(&I))) 6193 continue; 6194 6195 VectorizationCostTy C = getInstructionCost(&I, VF); 6196 6197 // Check if we should override the cost. 6198 if (C.first.isValid() && 6199 ForceTargetInstructionCost.getNumOccurrences() > 0) 6200 C.first = InstructionCost(ForceTargetInstructionCost); 6201 6202 // Keep a list of instructions with invalid costs. 6203 if (Invalid && !C.first.isValid()) 6204 Invalid->emplace_back(&I, VF); 6205 6206 BlockCost.first += C.first; 6207 BlockCost.second |= C.second; 6208 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6209 << " for VF " << VF << " For instruction: " << I 6210 << '\n'); 6211 } 6212 6213 // If we are vectorizing a predicated block, it will have been 6214 // if-converted. This means that the block's instructions (aside from 6215 // stores and instructions that may divide by zero) will now be 6216 // unconditionally executed. For the scalar case, we may not always execute 6217 // the predicated block, if it is an if-else block. Thus, scale the block's 6218 // cost by the probability of executing it. blockNeedsPredication from 6219 // Legal is used so as to not include all blocks in tail folded loops. 6220 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6221 BlockCost.first /= getReciprocalPredBlockProb(); 6222 6223 Cost.first += BlockCost.first; 6224 Cost.second |= BlockCost.second; 6225 } 6226 6227 return Cost; 6228 } 6229 6230 /// Gets Address Access SCEV after verifying that the access pattern 6231 /// is loop invariant except the induction variable dependence. 6232 /// 6233 /// This SCEV can be sent to the Target in order to estimate the address 6234 /// calculation cost. 6235 static const SCEV *getAddressAccessSCEV( 6236 Value *Ptr, 6237 LoopVectorizationLegality *Legal, 6238 PredicatedScalarEvolution &PSE, 6239 const Loop *TheLoop) { 6240 6241 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6242 if (!Gep) 6243 return nullptr; 6244 6245 // We are looking for a gep with all loop invariant indices except for one 6246 // which should be an induction variable. 
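// For example (illustrative, not from the original source), a gep such as
//   getelementptr %A, i64 %inv, i64 %iv
// qualifies, whereas an index that is itself computed inside the loop
// (neither loop invariant nor an induction variable) makes us return nullptr
// below.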
6247 auto SE = PSE.getSE(); 6248 unsigned NumOperands = Gep->getNumOperands(); 6249 for (unsigned i = 1; i < NumOperands; ++i) { 6250 Value *Opd = Gep->getOperand(i); 6251 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6252 !Legal->isInductionVariable(Opd)) 6253 return nullptr; 6254 } 6255 6256 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6257 return PSE.getSCEV(Ptr); 6258 } 6259 6260 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6261 return Legal->hasStride(I->getOperand(0)) || 6262 Legal->hasStride(I->getOperand(1)); 6263 } 6264 6265 InstructionCost 6266 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6267 ElementCount VF) { 6268 assert(VF.isVector() && 6269 "Scalarization cost of instruction implies vectorization."); 6270 if (VF.isScalable()) 6271 return InstructionCost::getInvalid(); 6272 6273 Type *ValTy = getLoadStoreType(I); 6274 auto SE = PSE.getSE(); 6275 6276 unsigned AS = getLoadStoreAddressSpace(I); 6277 Value *Ptr = getLoadStorePointerOperand(I); 6278 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6279 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6280 // that it is being called from this specific place. 6281 6282 // Figure out whether the access is strided and get the stride value 6283 // if it's known in compile time 6284 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6285 6286 // Get the cost of the scalar memory instruction and address computation. 6287 InstructionCost Cost = 6288 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6289 6290 // Don't pass *I here, since it is scalar but will actually be part of a 6291 // vectorized loop where the user of it is a vectorized instruction. 6292 const Align Alignment = getLoadStoreAlignment(I); 6293 Cost += VF.getKnownMinValue() * 6294 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6295 AS, TTI::TCK_RecipThroughput); 6296 6297 // Get the overhead of the extractelement and insertelement instructions 6298 // we might create due to scalarization. 6299 Cost += getScalarizationOverhead(I, VF); 6300 6301 // If we have a predicated load/store, it will need extra i1 extracts and 6302 // conditional branches, but may not be executed for each vector lane. Scale 6303 // the cost by the probability of executing the predicated block. 6304 if (isPredicatedInst(I, VF)) { 6305 Cost /= getReciprocalPredBlockProb(); 6306 6307 // Add the cost of an i1 extract and a branch 6308 auto *Vec_i1Ty = 6309 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6310 Cost += TTI.getScalarizationOverhead( 6311 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6312 /*Insert=*/false, /*Extract=*/true); 6313 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6314 6315 if (useEmulatedMaskMemRefHack(I, VF)) 6316 // Artificially setting to a high enough value to practically disable 6317 // vectorization with such operations. 
6318 Cost = 3000000; 6319 } 6320 6321 return Cost; 6322 } 6323 6324 InstructionCost 6325 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6326 ElementCount VF) { 6327 Type *ValTy = getLoadStoreType(I); 6328 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6329 Value *Ptr = getLoadStorePointerOperand(I); 6330 unsigned AS = getLoadStoreAddressSpace(I); 6331 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6332 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6333 6334 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6335 "Stride should be 1 or -1 for consecutive memory access"); 6336 const Align Alignment = getLoadStoreAlignment(I); 6337 InstructionCost Cost = 0; 6338 if (Legal->isMaskRequired(I)) 6339 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6340 CostKind); 6341 else 6342 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6343 CostKind, I); 6344 6345 bool Reverse = ConsecutiveStride < 0; 6346 if (Reverse) 6347 Cost += 6348 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6349 return Cost; 6350 } 6351 6352 InstructionCost 6353 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6354 ElementCount VF) { 6355 assert(Legal->isUniformMemOp(*I)); 6356 6357 Type *ValTy = getLoadStoreType(I); 6358 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6359 const Align Alignment = getLoadStoreAlignment(I); 6360 unsigned AS = getLoadStoreAddressSpace(I); 6361 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6362 if (isa<LoadInst>(I)) { 6363 return TTI.getAddressComputationCost(ValTy) + 6364 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6365 CostKind) + 6366 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6367 } 6368 StoreInst *SI = cast<StoreInst>(I); 6369 6370 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6371 return TTI.getAddressComputationCost(ValTy) + 6372 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6373 CostKind) + 6374 (isLoopInvariantStoreValue 6375 ? 0 6376 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6377 VF.getKnownMinValue() - 1)); 6378 } 6379 6380 InstructionCost 6381 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6382 ElementCount VF) { 6383 Type *ValTy = getLoadStoreType(I); 6384 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6385 const Align Alignment = getLoadStoreAlignment(I); 6386 const Value *Ptr = getLoadStorePointerOperand(I); 6387 6388 return TTI.getAddressComputationCost(VectorTy) + 6389 TTI.getGatherScatterOpCost( 6390 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6391 TargetTransformInfo::TCK_RecipThroughput, I); 6392 } 6393 6394 InstructionCost 6395 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6396 ElementCount VF) { 6397 // TODO: Once we have support for interleaving with scalable vectors 6398 // we can calculate the cost properly here. 
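// Illustrative example for fixed-width VFs (not from the original source): a
// factor-2 load group at VF = 4 is costed as one wide access of 8 elements
// plus the work needed to split out the members recorded in Indices below;
// gaps in the group may additionally require masking.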
6399 if (VF.isScalable())
6400 return InstructionCost::getInvalid();
6401
6402 Type *ValTy = getLoadStoreType(I);
6403 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6404 unsigned AS = getLoadStoreAddressSpace(I);
6405
6406 auto Group = getInterleavedAccessGroup(I);
6407 assert(Group && "Fail to get an interleaved access group.");
6408
6409 unsigned InterleaveFactor = Group->getFactor();
6410 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6411
6412 // Holds the indices of existing members in the interleaved group.
6413 SmallVector<unsigned, 4> Indices;
6414 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6415 if (Group->getMember(IF))
6416 Indices.push_back(IF);
6417
6418 // Calculate the cost of the whole interleaved group.
6419 bool UseMaskForGaps =
6420 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6421 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6422 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6423 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6424 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6425
6426 if (Group->isReverse()) {
6427 // TODO: Add support for reversed masked interleaved access.
6428 assert(!Legal->isMaskRequired(I) &&
6429 "Reverse masked interleaved access not supported.");
6430 Cost +=
6431 Group->getNumMembers() *
6432 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6433 }
6434 return Cost;
6435 }
6436
6437 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6438 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6439 using namespace llvm::PatternMatch;
6440 // Early exit for no in-loop reductions.
6441 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6442 return None;
6443 auto *VectorTy = cast<VectorType>(Ty);
6444
6445 // We are looking for one of the following patterns, and for its minimal acceptable cost:
6446 // reduce(mul(ext(A), ext(B))) or
6447 // reduce(mul(A, B)) or
6448 // reduce(ext(A)) or
6449 // reduce(A).
6450 // The basic idea is that we walk down the tree to do that, finding the root
6451 // reduction instruction in InLoopReductionImmediateChains. From there we find
6452 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6453 // of the components. If the reduction cost is lower, then we return it for the
6454 // reduction instruction and 0 for the other instructions in the pattern. If
6455 // it is not, we return an invalid cost specifying the original cost method
6456 // should be used.
6457 Instruction *RetI = I;
6458 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6459 if (!RetI->hasOneUser())
6460 return None;
6461 RetI = RetI->user_back();
6462 }
6463 if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6464 RetI->user_back()->getOpcode() == Instruction::Add) {
6465 if (!RetI->hasOneUser())
6466 return None;
6467 RetI = RetI->user_back();
6468 }
6469
6470 // Test if the found instruction is a reduction, and if not return an invalid
6471 // cost specifying the parent to use the original cost modelling.
6472 if (!InLoopReductionImmediateChains.count(RetI))
6473 return None;
6474
6475 // Find the reduction this chain is a part of and calculate the basic cost of
6476 // the reduction on its own.
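// Illustrative example (not from the original source): for an in-loop
// reduction such as s += (int32_t)a[i] * (int32_t)b[i] over i16 arrays,
// RetI ends up on the 'add'; walking InLoopReductionImmediateChains from
// there leads back to the reduction phi, whose RecurrenceDescriptor is
// queried next.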
6477 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6478 Instruction *ReductionPhi = LastChain; 6479 while (!isa<PHINode>(ReductionPhi)) 6480 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6481 6482 const RecurrenceDescriptor &RdxDesc = 6483 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6484 6485 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6486 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6487 6488 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6489 // normal fmul instruction to the cost of the fadd reduction. 6490 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6491 BaseCost += 6492 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6493 6494 // If we're using ordered reductions then we can just return the base cost 6495 // here, since getArithmeticReductionCost calculates the full ordered 6496 // reduction cost when FP reassociation is not allowed. 6497 if (useOrderedReductions(RdxDesc)) 6498 return BaseCost; 6499 6500 // Get the operand that was not the reduction chain and match it to one of the 6501 // patterns, returning the better cost if it is found. 6502 Instruction *RedOp = RetI->getOperand(1) == LastChain 6503 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6504 : dyn_cast<Instruction>(RetI->getOperand(1)); 6505 6506 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6507 6508 Instruction *Op0, *Op1; 6509 if (RedOp && 6510 match(RedOp, 6511 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6512 match(Op0, m_ZExtOrSExt(m_Value())) && 6513 Op0->getOpcode() == Op1->getOpcode() && 6514 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6515 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6516 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6517 6518 // Matched reduce(ext(mul(ext(A), ext(B))) 6519 // Note that the extend opcodes need to all match, or if A==B they will have 6520 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6521 // which is equally fine. 6522 bool IsUnsigned = isa<ZExtInst>(Op0); 6523 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6524 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6525 6526 InstructionCost ExtCost = 6527 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6528 TTI::CastContextHint::None, CostKind, Op0); 6529 InstructionCost MulCost = 6530 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6531 InstructionCost Ext2Cost = 6532 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6533 TTI::CastContextHint::None, CostKind, RedOp); 6534 6535 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6536 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6537 CostKind); 6538 6539 if (RedCost.isValid() && 6540 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6541 return I == RetI ? 
RedCost : 0; 6542 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6543 !TheLoop->isLoopInvariant(RedOp)) { 6544 // Matched reduce(ext(A)) 6545 bool IsUnsigned = isa<ZExtInst>(RedOp); 6546 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6547 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6548 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6549 CostKind); 6550 6551 InstructionCost ExtCost = 6552 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6553 TTI::CastContextHint::None, CostKind, RedOp); 6554 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6555 return I == RetI ? RedCost : 0; 6556 } else if (RedOp && 6557 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6558 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6559 Op0->getOpcode() == Op1->getOpcode() && 6560 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6561 bool IsUnsigned = isa<ZExtInst>(Op0); 6562 Type *Op0Ty = Op0->getOperand(0)->getType(); 6563 Type *Op1Ty = Op1->getOperand(0)->getType(); 6564 Type *LargestOpTy = 6565 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6566 : Op0Ty; 6567 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6568 6569 // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of 6570 // different sizes. We take the largest type as the ext to reduce, and add 6571 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 6572 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6573 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6574 TTI::CastContextHint::None, CostKind, Op0); 6575 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6576 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6577 TTI::CastContextHint::None, CostKind, Op1); 6578 InstructionCost MulCost = 6579 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6580 6581 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6582 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6583 CostKind); 6584 InstructionCost ExtraExtCost = 0; 6585 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6586 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6587 ExtraExtCost = TTI.getCastInstrCost( 6588 ExtraExtOp->getOpcode(), ExtType, 6589 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6590 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6591 } 6592 6593 if (RedCost.isValid() && 6594 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6595 return I == RetI ? RedCost : 0; 6596 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6597 // Matched reduce(mul()) 6598 InstructionCost MulCost = 6599 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6600 6601 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6602 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6603 CostKind); 6604 6605 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6606 return I == RetI ? RedCost : 0; 6607 } 6608 } 6609 6610 return I == RetI ? Optional<InstructionCost>(BaseCost) : None; 6611 } 6612 6613 InstructionCost 6614 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6615 ElementCount VF) { 6616 // Calculate scalar cost only. Vectorization cost should be ready at this 6617 // moment. 
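// For VF = 1 (illustrative summary, not from the original source) this is
// just the scalar address computation plus the target's scalar load/store
// cost; for vector VFs the widening decision and its cost were already
// recorded by setCostBasedWideningDecision and are returned via
// getWideningCost.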
6618 if (VF.isScalar()) { 6619 Type *ValTy = getLoadStoreType(I); 6620 const Align Alignment = getLoadStoreAlignment(I); 6621 unsigned AS = getLoadStoreAddressSpace(I); 6622 6623 return TTI.getAddressComputationCost(ValTy) + 6624 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6625 TTI::TCK_RecipThroughput, I); 6626 } 6627 return getWideningCost(I, VF); 6628 } 6629 6630 LoopVectorizationCostModel::VectorizationCostTy 6631 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6632 ElementCount VF) { 6633 // If we know that this instruction will remain uniform, check the cost of 6634 // the scalar version. 6635 if (isUniformAfterVectorization(I, VF)) 6636 VF = ElementCount::getFixed(1); 6637 6638 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6639 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6640 6641 // Forced scalars do not have any scalarization overhead. 6642 auto ForcedScalar = ForcedScalars.find(VF); 6643 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6644 auto InstSet = ForcedScalar->second; 6645 if (InstSet.count(I)) 6646 return VectorizationCostTy( 6647 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6648 VF.getKnownMinValue()), 6649 false); 6650 } 6651 6652 Type *VectorTy; 6653 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6654 6655 bool TypeNotScalarized = false; 6656 if (VF.isVector() && VectorTy->isVectorTy()) { 6657 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6658 if (VF.isScalable()) 6659 // <vscale x 1 x iN> is assumed to be profitable over iN because 6660 // scalable registers are a distinct register class from scalar ones. 6661 // If we ever find a target which wants to lower scalable vectors 6662 // back to scalars, we'll need to update this code to explicitly 6663 // ask TTI about the register class uses for each part. 6664 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6665 else 6666 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6667 } else 6668 C = InstructionCost::getInvalid(); 6669 } 6670 return VectorizationCostTy(C, TypeNotScalarized); 6671 } 6672 6673 InstructionCost 6674 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6675 ElementCount VF) const { 6676 6677 // There is no mechanism yet to create a scalable scalarization loop, 6678 // so this is currently Invalid. 6679 if (VF.isScalable()) 6680 return InstructionCost::getInvalid(); 6681 6682 if (VF.isScalar()) 6683 return 0; 6684 6685 InstructionCost Cost = 0; 6686 Type *RetTy = ToVectorTy(I->getType(), VF); 6687 if (!RetTy->isVoidTy() && 6688 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6689 Cost += TTI.getScalarizationOverhead( 6690 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true, 6691 false); 6692 6693 // Some targets keep addresses scalar. 6694 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6695 return Cost; 6696 6697 // Some targets support efficient element stores. 6698 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6699 return Cost; 6700 6701 // Collect operands to consider. 6702 CallInst *CI = dyn_cast<CallInst>(I); 6703 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6704 6705 // Skip operands that do not require extraction/scalarization and do not incur 6706 // any overhead. 
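// For example (an assumption-level sketch, not from the original source): at
// VF = 4 a scalarized user of a widened i32 value pays for 4 extractelements
// on that operand, while operands that do not need extraction are filtered
// out by filterExtractingOperands below and contribute nothing.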
6707 SmallVector<Type *> Tys; 6708 for (auto *V : filterExtractingOperands(Ops, VF)) 6709 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6710 return Cost + TTI.getOperandsScalarizationOverhead( 6711 filterExtractingOperands(Ops, VF), Tys); 6712 } 6713 6714 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6715 if (VF.isScalar()) 6716 return; 6717 NumPredStores = 0; 6718 for (BasicBlock *BB : TheLoop->blocks()) { 6719 // For each instruction in the old loop. 6720 for (Instruction &I : *BB) { 6721 Value *Ptr = getLoadStorePointerOperand(&I); 6722 if (!Ptr) 6723 continue; 6724 6725 // TODO: We should generate better code and update the cost model for 6726 // predicated uniform stores. Today they are treated as any other 6727 // predicated store (see added test cases in 6728 // invariant-store-vectorization.ll). 6729 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6730 NumPredStores++; 6731 6732 if (Legal->isUniformMemOp(I)) { 6733 // TODO: Avoid replicating loads and stores instead of 6734 // relying on instcombine to remove them. 6735 // Load: Scalar load + broadcast 6736 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6737 InstructionCost Cost; 6738 if (isa<StoreInst>(&I) && VF.isScalable() && 6739 isLegalGatherOrScatter(&I, VF)) { 6740 Cost = getGatherScatterCost(&I, VF); 6741 setWideningDecision(&I, VF, CM_GatherScatter, Cost); 6742 } else { 6743 Cost = getUniformMemOpCost(&I, VF); 6744 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6745 } 6746 continue; 6747 } 6748 6749 // We assume that widening is the best solution when possible. 6750 if (memoryInstructionCanBeWidened(&I, VF)) { 6751 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6752 int ConsecutiveStride = Legal->isConsecutivePtr( 6753 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6754 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6755 "Expected consecutive stride."); 6756 InstWidening Decision = 6757 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6758 setWideningDecision(&I, VF, Decision, Cost); 6759 continue; 6760 } 6761 6762 // Choose between Interleaving, Gather/Scatter or Scalarization. 6763 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6764 unsigned NumAccesses = 1; 6765 if (isAccessInterleaved(&I)) { 6766 auto Group = getInterleavedAccessGroup(&I); 6767 assert(Group && "Fail to get an interleaved access group."); 6768 6769 // Make one decision for the whole group. 6770 if (getWideningDecision(&I, VF) != CM_Unknown) 6771 continue; 6772 6773 NumAccesses = Group->getNumMembers(); 6774 if (interleavedAccessCanBeWidened(&I, VF)) 6775 InterleaveCost = getInterleaveGroupCost(&I, VF); 6776 } 6777 6778 InstructionCost GatherScatterCost = 6779 isLegalGatherOrScatter(&I, VF) 6780 ? getGatherScatterCost(&I, VF) * NumAccesses 6781 : InstructionCost::getInvalid(); 6782 6783 InstructionCost ScalarizationCost = 6784 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6785 6786 // Choose better solution for the current VF, 6787 // write down this decision and use it during vectorization. 
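// Illustrative example (assumed costs, not from the original source):
// InterleaveCost = 6, GatherScatterCost = 12 and ScalarizationCost = 20
// selects CM_Interleave with cost 6; if the interleave cost is invalid (e.g.
// for a scalable VF), the comparison falls through to gather/scatter and
// finally to scalarization.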
6788 InstructionCost Cost; 6789 InstWidening Decision; 6790 if (InterleaveCost <= GatherScatterCost && 6791 InterleaveCost < ScalarizationCost) { 6792 Decision = CM_Interleave; 6793 Cost = InterleaveCost; 6794 } else if (GatherScatterCost < ScalarizationCost) { 6795 Decision = CM_GatherScatter; 6796 Cost = GatherScatterCost; 6797 } else { 6798 Decision = CM_Scalarize; 6799 Cost = ScalarizationCost; 6800 } 6801 // If the instructions belongs to an interleave group, the whole group 6802 // receives the same decision. The whole group receives the cost, but 6803 // the cost will actually be assigned to one instruction. 6804 if (auto Group = getInterleavedAccessGroup(&I)) 6805 setWideningDecision(Group, VF, Decision, Cost); 6806 else 6807 setWideningDecision(&I, VF, Decision, Cost); 6808 } 6809 } 6810 6811 // Make sure that any load of address and any other address computation 6812 // remains scalar unless there is gather/scatter support. This avoids 6813 // inevitable extracts into address registers, and also has the benefit of 6814 // activating LSR more, since that pass can't optimize vectorized 6815 // addresses. 6816 if (TTI.prefersVectorizedAddressing()) 6817 return; 6818 6819 // Start with all scalar pointer uses. 6820 SmallPtrSet<Instruction *, 8> AddrDefs; 6821 for (BasicBlock *BB : TheLoop->blocks()) 6822 for (Instruction &I : *BB) { 6823 Instruction *PtrDef = 6824 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6825 if (PtrDef && TheLoop->contains(PtrDef) && 6826 getWideningDecision(&I, VF) != CM_GatherScatter) 6827 AddrDefs.insert(PtrDef); 6828 } 6829 6830 // Add all instructions used to generate the addresses. 6831 SmallVector<Instruction *, 4> Worklist; 6832 append_range(Worklist, AddrDefs); 6833 while (!Worklist.empty()) { 6834 Instruction *I = Worklist.pop_back_val(); 6835 for (auto &Op : I->operands()) 6836 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6837 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6838 AddrDefs.insert(InstOp).second) 6839 Worklist.push_back(InstOp); 6840 } 6841 6842 for (auto *I : AddrDefs) { 6843 if (isa<LoadInst>(I)) { 6844 // Setting the desired widening decision should ideally be handled in 6845 // by cost functions, but since this involves the task of finding out 6846 // if the loaded register is involved in an address computation, it is 6847 // instead changed here when we know this is the case. 6848 InstWidening Decision = getWideningDecision(I, VF); 6849 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6850 // Scalarize a widened load of address. 6851 setWideningDecision( 6852 I, VF, CM_Scalarize, 6853 (VF.getKnownMinValue() * 6854 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6855 else if (auto Group = getInterleavedAccessGroup(I)) { 6856 // Scalarize an interleave group of address loads. 6857 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6858 if (Instruction *Member = Group->getMember(I)) 6859 setWideningDecision( 6860 Member, VF, CM_Scalarize, 6861 (VF.getKnownMinValue() * 6862 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6863 } 6864 } 6865 } else 6866 // Make sure I gets scalarized and a cost estimate without 6867 // scalarization overhead. 
6868 ForcedScalars[VF].insert(I);
6869 }
6870 }
6871
6872 InstructionCost
6873 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6874 Type *&VectorTy) {
6875 Type *RetTy = I->getType();
6876 if (canTruncateToMinimalBitwidth(I, VF))
6877 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6878 auto SE = PSE.getSE();
6879 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6880
6881 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6882 ElementCount VF) -> bool {
6883 if (VF.isScalar())
6884 return true;
6885
6886 auto Scalarized = InstsToScalarize.find(VF);
6887 assert(Scalarized != InstsToScalarize.end() &&
6888 "VF not yet analyzed for scalarization profitability");
6889 return !Scalarized->second.count(I) &&
6890 llvm::all_of(I->users(), [&](User *U) {
6891 auto *UI = cast<Instruction>(U);
6892 return !Scalarized->second.count(UI);
6893 });
6894 };
6895 (void) hasSingleCopyAfterVectorization;
6896
6897 if (isScalarAfterVectorization(I, VF)) {
6898 // With the exception of GEPs and PHIs, after scalarization there should
6899 // only be one copy of the instruction generated in the loop. This is
6900 // because the VF is either 1, or any instructions that need scalarizing
6901 // have already been dealt with by the time we get here. As a result,
6902 // we don't have to multiply the instruction cost by VF.
6903 assert(I->getOpcode() == Instruction::GetElementPtr ||
6904 I->getOpcode() == Instruction::PHI ||
6905 (I->getOpcode() == Instruction::BitCast &&
6906 I->getType()->isPointerTy()) ||
6907 hasSingleCopyAfterVectorization(I, VF));
6908 VectorTy = RetTy;
6909 } else
6910 VectorTy = ToVectorTy(RetTy, VF);
6911
6912 // TODO: We need to estimate the cost of intrinsic calls.
6913 switch (I->getOpcode()) {
6914 case Instruction::GetElementPtr:
6915 // We mark this instruction as zero-cost because the cost of GEPs in
6916 // vectorized code depends on whether the corresponding memory instruction
6917 // is scalarized or not. Therefore, we handle GEPs with the memory
6918 // instruction cost.
6919 return 0;
6920 case Instruction::Br: {
6921 // In cases of scalarized and predicated instructions, there will be VF
6922 // predicated blocks in the vectorized loop. Each branch around these
6923 // blocks also requires an extract of its vector compare i1 element.
6924 bool ScalarPredicatedBB = false;
6925 BranchInst *BI = cast<BranchInst>(I);
6926 if (VF.isVector() && BI->isConditional() &&
6927 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6928 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6929 ScalarPredicatedBB = true;
6930
6931 if (ScalarPredicatedBB) {
6932 // Not possible to scalarize scalable vector with predicated instructions.
6933 if (VF.isScalable())
6934 return InstructionCost::getInvalid();
6935 // Return cost for branches around scalarized and predicated blocks.
6936 auto *Vec_i1Ty =
6937 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6938 return (
6939 TTI.getScalarizationOverhead(
6940 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
6941 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6942 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6943 // The back-edge branch will remain, as will all scalar branches.
6944 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6945 else
6946 // This branch will be eliminated by if-conversion.
6947 return 0; 6948 // Note: We currently assume zero cost for an unconditional branch inside 6949 // a predicated block since it will become a fall-through, although we 6950 // may decide in the future to call TTI for all branches. 6951 } 6952 case Instruction::PHI: { 6953 auto *Phi = cast<PHINode>(I); 6954 6955 // First-order recurrences are replaced by vector shuffles inside the loop. 6956 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6957 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 6958 return TTI.getShuffleCost( 6959 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 6960 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 6961 6962 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6963 // converted into select instructions. We require N - 1 selects per phi 6964 // node, where N is the number of incoming values. 6965 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6966 return (Phi->getNumIncomingValues() - 1) * 6967 TTI.getCmpSelInstrCost( 6968 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6969 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6970 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6971 6972 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6973 } 6974 case Instruction::UDiv: 6975 case Instruction::SDiv: 6976 case Instruction::URem: 6977 case Instruction::SRem: 6978 // If we have a predicated instruction, it may not be executed for each 6979 // vector lane. Get the scalarization cost and scale this amount by the 6980 // probability of executing the predicated block. If the instruction is not 6981 // predicated, we fall through to the next case. 6982 if (VF.isVector() && isScalarWithPredication(I, VF)) { 6983 InstructionCost Cost = 0; 6984 6985 // These instructions have a non-void type, so account for the phi nodes 6986 // that we will create. This cost is likely to be zero. The phi node 6987 // cost, if any, should be scaled by the block probability because it 6988 // models a copy at the end of each predicated block. 6989 Cost += VF.getKnownMinValue() * 6990 TTI.getCFInstrCost(Instruction::PHI, CostKind); 6991 6992 // The cost of the non-predicated instruction. 6993 Cost += VF.getKnownMinValue() * 6994 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 6995 6996 // The cost of insertelement and extractelement instructions needed for 6997 // scalarization. 6998 Cost += getScalarizationOverhead(I, VF); 6999 7000 // Scale the cost by the probability of executing the predicated blocks. 7001 // This assumes the predicated block for each vector lane is equally 7002 // likely. 7003 return Cost / getReciprocalPredBlockProb(); 7004 } 7005 LLVM_FALLTHROUGH; 7006 case Instruction::Add: 7007 case Instruction::FAdd: 7008 case Instruction::Sub: 7009 case Instruction::FSub: 7010 case Instruction::Mul: 7011 case Instruction::FMul: 7012 case Instruction::FDiv: 7013 case Instruction::FRem: 7014 case Instruction::Shl: 7015 case Instruction::LShr: 7016 case Instruction::AShr: 7017 case Instruction::And: 7018 case Instruction::Or: 7019 case Instruction::Xor: { 7020 // Since we will replace the stride by 1 the multiplication should go away. 
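    // For example, once loop versioning specializes a symbolic stride to 1,
    //   %offset = mul i64 %i, %stride
    // folds to plain %i, which is why the multiply is given zero cost here.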
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
        Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
        TargetTransformInfo::OP_None, I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
                        match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      TTI::OperandValueProperties Op1VP = TTI::OP_None;
      TTI::OperandValueProperties Op2VP = TTI::OP_None;
      TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
      TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          match(I, m_LogicalOr()) ?
Instruction::Or : Instruction::And, VectorTy, 7069 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7070 } 7071 7072 Type *CondTy = SI->getCondition()->getType(); 7073 if (!ScalarCond) 7074 CondTy = VectorType::get(CondTy, VF); 7075 7076 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7077 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7078 Pred = Cmp->getPredicate(); 7079 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7080 CostKind, I); 7081 } 7082 case Instruction::ICmp: 7083 case Instruction::FCmp: { 7084 Type *ValTy = I->getOperand(0)->getType(); 7085 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7086 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7087 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7088 VectorTy = ToVectorTy(ValTy, VF); 7089 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7090 cast<CmpInst>(I)->getPredicate(), CostKind, 7091 I); 7092 } 7093 case Instruction::Store: 7094 case Instruction::Load: { 7095 ElementCount Width = VF; 7096 if (Width.isVector()) { 7097 InstWidening Decision = getWideningDecision(I, Width); 7098 assert(Decision != CM_Unknown && 7099 "CM decision should be taken at this point"); 7100 if (Decision == CM_Scalarize) { 7101 if (VF.isScalable() && isa<StoreInst>(I)) 7102 // We can't scalarize a scalable vector store (even a uniform one 7103 // currently), return an invalid cost so as to prevent vectorization. 7104 return InstructionCost::getInvalid(); 7105 Width = ElementCount::getFixed(1); 7106 } 7107 } 7108 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7109 return getMemoryInstructionCost(I, VF); 7110 } 7111 case Instruction::BitCast: 7112 if (I->getType()->isPointerTy()) 7113 return 0; 7114 LLVM_FALLTHROUGH; 7115 case Instruction::ZExt: 7116 case Instruction::SExt: 7117 case Instruction::FPToUI: 7118 case Instruction::FPToSI: 7119 case Instruction::FPExt: 7120 case Instruction::PtrToInt: 7121 case Instruction::IntToPtr: 7122 case Instruction::SIToFP: 7123 case Instruction::UIToFP: 7124 case Instruction::Trunc: 7125 case Instruction::FPTrunc: { 7126 // Computes the CastContextHint from a Load/Store instruction. 7127 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7128 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7129 "Expected a load or a store!"); 7130 7131 if (VF.isScalar() || !TheLoop->contains(I)) 7132 return TTI::CastContextHint::Normal; 7133 7134 switch (getWideningDecision(I, VF)) { 7135 case LoopVectorizationCostModel::CM_GatherScatter: 7136 return TTI::CastContextHint::GatherScatter; 7137 case LoopVectorizationCostModel::CM_Interleave: 7138 return TTI::CastContextHint::Interleave; 7139 case LoopVectorizationCostModel::CM_Scalarize: 7140 case LoopVectorizationCostModel::CM_Widen: 7141 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7142 : TTI::CastContextHint::Normal; 7143 case LoopVectorizationCostModel::CM_Widen_Reverse: 7144 return TTI::CastContextHint::Reversed; 7145 case LoopVectorizationCostModel::CM_Unknown: 7146 llvm_unreachable("Instr did not go through cost modelling?"); 7147 } 7148 7149 llvm_unreachable("Unhandled case!"); 7150 }; 7151 7152 unsigned Opcode = I->getOpcode(); 7153 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7154 // For Trunc, the context is the only user, which must be a StoreInst. 
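    // E.g. for "%t = trunc i32 %x to i16" whose only user is
    // "store i16 %t, i16* %p", the hint is derived from the store's widening
    // decision (normal, masked, reversed, gather/scatter or interleave).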
7155 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7156 if (I->hasOneUse()) 7157 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7158 CCH = ComputeCCH(Store); 7159 } 7160 // For Z/Sext, the context is the operand, which must be a LoadInst. 7161 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7162 Opcode == Instruction::FPExt) { 7163 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7164 CCH = ComputeCCH(Load); 7165 } 7166 7167 // We optimize the truncation of induction variables having constant 7168 // integer steps. The cost of these truncations is the same as the scalar 7169 // operation. 7170 if (isOptimizableIVTruncate(I, VF)) { 7171 auto *Trunc = cast<TruncInst>(I); 7172 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7173 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7174 } 7175 7176 // Detect reduction patterns 7177 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7178 return *RedCost; 7179 7180 Type *SrcScalarTy = I->getOperand(0)->getType(); 7181 Type *SrcVecTy = 7182 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7183 if (canTruncateToMinimalBitwidth(I, VF)) { 7184 // This cast is going to be shrunk. This may remove the cast or it might 7185 // turn it into slightly different cast. For example, if MinBW == 16, 7186 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7187 // 7188 // Calculate the modified src and dest types. 7189 Type *MinVecTy = VectorTy; 7190 if (Opcode == Instruction::Trunc) { 7191 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7192 VectorTy = 7193 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7194 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7195 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7196 VectorTy = 7197 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7198 } 7199 } 7200 7201 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7202 } 7203 case Instruction::Call: { 7204 if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) 7205 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7206 return *RedCost; 7207 bool NeedToScalarize; 7208 CallInst *CI = cast<CallInst>(I); 7209 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7210 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7211 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7212 return std::min(CallCost, IntrinsicCost); 7213 } 7214 return CallCost; 7215 } 7216 case Instruction::ExtractValue: 7217 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7218 case Instruction::Alloca: 7219 // We cannot easily widen alloca to a scalable alloca, as 7220 // the result would need to be a vector of pointers. 7221 if (VF.isScalable()) 7222 return InstructionCost::getInvalid(); 7223 LLVM_FALLTHROUGH; 7224 default: 7225 // This opcode is unknown. Assume that it is the same as 'mul'. 7226 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7227 } // end of switch. 
7228 } 7229 7230 char LoopVectorize::ID = 0; 7231 7232 static const char lv_name[] = "Loop Vectorization"; 7233 7234 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7235 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7236 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7237 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7238 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7239 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7240 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7241 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7242 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7243 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7244 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7245 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7246 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7247 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7248 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7249 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7250 7251 namespace llvm { 7252 7253 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7254 7255 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7256 bool VectorizeOnlyWhenForced) { 7257 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7258 } 7259 7260 } // end namespace llvm 7261 7262 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7263 // Check if the pointer operand of a load or store instruction is 7264 // consecutive. 7265 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7266 return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr); 7267 return false; 7268 } 7269 7270 void LoopVectorizationCostModel::collectValuesToIgnore() { 7271 // Ignore ephemeral values. 7272 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7273 7274 // Find all stores to invariant variables. Since they are going to sink 7275 // outside the loop we do not need calculate cost for them. 7276 for (BasicBlock *BB : TheLoop->blocks()) 7277 for (Instruction &I : *BB) { 7278 StoreInst *SI; 7279 if ((SI = dyn_cast<StoreInst>(&I)) && 7280 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7281 ValuesToIgnore.insert(&I); 7282 } 7283 7284 // Ignore type-promoting instructions we identified during reduction 7285 // detection. 7286 for (auto &Reduction : Legal->getReductionVars()) { 7287 const RecurrenceDescriptor &RedDes = Reduction.second; 7288 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7289 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7290 } 7291 // Ignore type-casting instructions we identified during induction 7292 // detection. 7293 for (auto &Induction : Legal->getInductionVars()) { 7294 const InductionDescriptor &IndDes = Induction.second; 7295 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7296 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7297 } 7298 } 7299 7300 void LoopVectorizationCostModel::collectInLoopReductions() { 7301 for (auto &Reduction : Legal->getReductionVars()) { 7302 PHINode *Phi = Reduction.first; 7303 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7304 7305 // We don't collect reductions that are type promoted (yet). 7306 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7307 continue; 7308 7309 // If the target would prefer this reduction to happen "in-loop", then we 7310 // want to record it as such. 
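    // Schematically (illustrative IR), an in-loop add reduction accumulates a
    // scalar every iteration, e.g.
    //   %r        = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %val)
    //   %acc.next = add i32 %acc, %r
    // rather than keeping a wide accumulator that is reduced only once after
    // the loop.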
7311 unsigned Opcode = RdxDesc.getOpcode(); 7312 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7313 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7314 TargetTransformInfo::ReductionFlags())) 7315 continue; 7316 7317 // Check that we can correctly put the reductions into the loop, by 7318 // finding the chain of operations that leads from the phi to the loop 7319 // exit value. 7320 SmallVector<Instruction *, 4> ReductionOperations = 7321 RdxDesc.getReductionOpChain(Phi, TheLoop); 7322 bool InLoop = !ReductionOperations.empty(); 7323 if (InLoop) { 7324 InLoopReductionChains[Phi] = ReductionOperations; 7325 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7326 Instruction *LastChain = Phi; 7327 for (auto *I : ReductionOperations) { 7328 InLoopReductionImmediateChains[I] = LastChain; 7329 LastChain = I; 7330 } 7331 } 7332 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7333 << " reduction for phi: " << *Phi << "\n"); 7334 } 7335 } 7336 7337 // TODO: we could return a pair of values that specify the max VF and 7338 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7339 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7340 // doesn't have a cost model that can choose which plan to execute if 7341 // more than one is generated. 7342 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7343 LoopVectorizationCostModel &CM) { 7344 unsigned WidestType; 7345 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7346 return WidestVectorRegBits / WidestType; 7347 } 7348 7349 VectorizationFactor 7350 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7351 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7352 ElementCount VF = UserVF; 7353 // Outer loop handling: They may require CFG and instruction level 7354 // transformations before even evaluating whether vectorization is profitable. 7355 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7356 // the vectorization pipeline. 7357 if (!OrigLoop->isInnermost()) { 7358 // If the user doesn't provide a vectorization factor, determine a 7359 // reasonable one. 7360 if (UserVF.isZero()) { 7361 VF = ElementCount::getFixed(determineVPlanVF( 7362 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7363 .getFixedSize(), 7364 CM)); 7365 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7366 7367 // Make sure we have a VF > 1 for stress testing. 7368 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7369 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7370 << "overriding computed VF.\n"); 7371 VF = ElementCount::getFixed(4); 7372 } 7373 } 7374 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7375 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7376 "VF needs to be a power of two"); 7377 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7378 << "VF " << VF << " to build VPlans.\n"); 7379 buildVPlans(VF, VF); 7380 7381 // For VPlan build stress testing, we bail out after VPlan construction. 7382 if (VPlanBuildStressTest) 7383 return VectorizationFactor::Disabled(); 7384 7385 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7386 } 7387 7388 LLVM_DEBUG( 7389 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the " 7390 "VPlan-native path.\n"); 7391 return VectorizationFactor::Disabled(); 7392 } 7393 7394 bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const { 7395 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7396 return (NumRuntimePointerChecks > 7397 VectorizerParams::RuntimeMemoryCheckThreshold && 7398 !Hints.allowReordering()) || 7399 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7400 } 7401 7402 Optional<VectorizationFactor> 7403 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7404 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7405 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7406 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7407 return None; 7408 7409 // Invalidate interleave groups if all blocks of loop will be predicated. 7410 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7411 !useMaskedInterleavedAccesses(*TTI)) { 7412 LLVM_DEBUG( 7413 dbgs() 7414 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7415 "which requires masked-interleaved support.\n"); 7416 if (CM.InterleaveInfo.invalidateGroups()) 7417 // Invalidating interleave groups also requires invalidating all decisions 7418 // based on them, which includes widening decisions and uniform and scalar 7419 // values. 7420 CM.invalidateCostModelingDecisions(); 7421 } 7422 7423 ElementCount MaxUserVF = 7424 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7425 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7426 if (!UserVF.isZero() && UserVFIsLegal) { 7427 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7428 "VF needs to be a power of two"); 7429 // Collect the instructions (and their associated costs) that will be more 7430 // profitable to scalarize. 7431 if (CM.selectUserVectorizationFactor(UserVF)) { 7432 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7433 CM.collectInLoopReductions(); 7434 buildVPlansWithVPRecipes(UserVF, UserVF); 7435 LLVM_DEBUG(printPlans(dbgs())); 7436 return {{UserVF, 0, 0}}; 7437 } else 7438 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7439 "InvalidCost", ORE, OrigLoop); 7440 } 7441 7442 // Populate the set of Vectorization Factor Candidates. 7443 ElementCountSet VFCandidates; 7444 for (auto VF = ElementCount::getFixed(1); 7445 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7446 VFCandidates.insert(VF); 7447 for (auto VF = ElementCount::getScalable(1); 7448 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7449 VFCandidates.insert(VF); 7450 7451 for (const auto &VF : VFCandidates) { 7452 // Collect Uniform and Scalar instructions after vectorization with VF. 7453 CM.collectUniformsAndScalars(VF); 7454 7455 // Collect the instructions (and their associated costs) that will be more 7456 // profitable to scalarize. 7457 if (VF.isVector()) 7458 CM.collectInstsToScalarize(VF); 7459 } 7460 7461 CM.collectInLoopReductions(); 7462 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7463 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7464 7465 LLVM_DEBUG(printPlans(dbgs())); 7466 if (!MaxFactors.hasVector()) 7467 return VectorizationFactor::Disabled(); 7468 7469 // Select the optimal vectorization factor. 
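  // E.g. with MaxFactors.FixedVF == 8 and MaxFactors.ScalableVF == vscale x 4,
  // VFCandidates holds {1, 2, 4, 8} and {vscale x 1, vscale x 2, vscale x 4},
  // and the cost model picks the most profitable of these.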
7470 return CM.selectVectorizationFactor(VFCandidates); 7471 } 7472 7473 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7474 assert(count_if(VPlans, 7475 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7476 1 && 7477 "Best VF has not a single VPlan."); 7478 7479 for (const VPlanPtr &Plan : VPlans) { 7480 if (Plan->hasVF(VF)) 7481 return *Plan.get(); 7482 } 7483 llvm_unreachable("No plan found!"); 7484 } 7485 7486 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7487 SmallVector<Metadata *, 4> MDs; 7488 // Reserve first location for self reference to the LoopID metadata node. 7489 MDs.push_back(nullptr); 7490 bool IsUnrollMetadata = false; 7491 MDNode *LoopID = L->getLoopID(); 7492 if (LoopID) { 7493 // First find existing loop unrolling disable metadata. 7494 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7495 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7496 if (MD) { 7497 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7498 IsUnrollMetadata = 7499 S && S->getString().startswith("llvm.loop.unroll.disable"); 7500 } 7501 MDs.push_back(LoopID->getOperand(i)); 7502 } 7503 } 7504 7505 if (!IsUnrollMetadata) { 7506 // Add runtime unroll disable metadata. 7507 LLVMContext &Context = L->getHeader()->getContext(); 7508 SmallVector<Metadata *, 1> DisableOperands; 7509 DisableOperands.push_back( 7510 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7511 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7512 MDs.push_back(DisableNode); 7513 MDNode *NewLoopID = MDNode::get(Context, MDs); 7514 // Set operand 0 to refer to the loop id itself. 7515 NewLoopID->replaceOperandWith(0, NewLoopID); 7516 L->setLoopID(NewLoopID); 7517 } 7518 } 7519 7520 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF, 7521 VPlan &BestVPlan, 7522 InnerLoopVectorizer &ILV, 7523 DominatorTree *DT) { 7524 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7525 << '\n'); 7526 7527 // Perform the actual loop transformation. 7528 7529 // 1. Set up the skeleton for vectorization, including vector pre-header and 7530 // middle block. The vector loop is created during VPlan execution. 7531 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; 7532 Value *CanonicalIVStartValue; 7533 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7534 ILV.createVectorizedLoopSkeleton(); 7535 7536 // Only use noalias metadata when using memory checks guaranteeing no overlap 7537 // across all iterations. 7538 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7539 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7540 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7541 7542 // We currently don't use LoopVersioning for the actual loop cloning but we 7543 // still use it to add the noalias metadata. 7544 // TODO: Find a better way to re-use LoopVersioning functionality to add 7545 // metadata. 7546 State.LVer = std::make_unique<LoopVersioning>( 7547 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7548 PSE.getSE()); 7549 State.LVer->prepareNoAliasMetadata(); 7550 } 7551 7552 ILV.collectPoisonGeneratingRecipes(State); 7553 7554 ILV.printDebugTracesAtStart(); 7555 7556 //===------------------------------------------------===// 7557 // 7558 // Notice: any optimization or new instruction that go 7559 // into the code below should also be implemented in 7560 // the cost-model. 
7561 // 7562 //===------------------------------------------------===// 7563 7564 // 2. Copy and widen instructions from the old loop into the new loop. 7565 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), 7566 ILV.getOrCreateVectorTripCount(nullptr), 7567 CanonicalIVStartValue, State); 7568 7569 BestVPlan.execute(&State); 7570 7571 // Keep all loop hints from the original loop on the vector loop (we'll 7572 // replace the vectorizer-specific hints below). 7573 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7574 7575 Optional<MDNode *> VectorizedLoopID = 7576 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7577 LLVMLoopVectorizeFollowupVectorized}); 7578 7579 VPBasicBlock *HeaderVPBB = 7580 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7581 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7582 if (VectorizedLoopID) 7583 L->setLoopID(VectorizedLoopID.getValue()); 7584 else { 7585 // Keep all loop hints from the original loop on the vector loop (we'll 7586 // replace the vectorizer-specific hints below). 7587 if (MDNode *LID = OrigLoop->getLoopID()) 7588 L->setLoopID(LID); 7589 7590 LoopVectorizeHints Hints(L, true, *ORE); 7591 Hints.setAlreadyVectorized(); 7592 } 7593 // Disable runtime unrolling when vectorizing the epilogue loop. 7594 if (CanonicalIVStartValue) 7595 AddRuntimeUnrollDisableMetaData(L); 7596 7597 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7598 // predication, updating analyses. 7599 ILV.fixVectorizedLoop(State, BestVPlan); 7600 7601 ILV.printDebugTracesAtEnd(); 7602 } 7603 7604 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7605 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7606 for (const auto &Plan : VPlans) 7607 if (PrintVPlansInDotFormat) 7608 Plan->printDOT(O); 7609 else 7610 Plan->print(O); 7611 } 7612 #endif 7613 7614 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7615 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7616 7617 // We create new control-flow for the vectorized loop, so the original exit 7618 // conditions will be dead after vectorization if it's only used by the 7619 // terminator 7620 SmallVector<BasicBlock*> ExitingBlocks; 7621 OrigLoop->getExitingBlocks(ExitingBlocks); 7622 for (auto *BB : ExitingBlocks) { 7623 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7624 if (!Cmp || !Cmp->hasOneUse()) 7625 continue; 7626 7627 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7628 if (!DeadInstructions.insert(Cmp).second) 7629 continue; 7630 7631 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7632 // TODO: can recurse through operands in general 7633 for (Value *Op : Cmp->operands()) { 7634 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7635 DeadInstructions.insert(cast<Instruction>(Op)); 7636 } 7637 } 7638 7639 // We create new "steps" for induction variable updates to which the original 7640 // induction variables map. An original update instruction will be dead if 7641 // all its users except the induction variable are dead. 7642 auto *Latch = OrigLoop->getLoopLatch(); 7643 for (auto &Induction : Legal->getInductionVars()) { 7644 PHINode *Ind = Induction.first; 7645 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7646 7647 // If the tail is to be folded by masking, the primary induction variable, 7648 // if exists, isn't dead: it will be used for masking. Don't kill it. 
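    // (With tail folding the header-block mask is built from the widened IV,
    //  roughly "icmp ule IV, backedge-taken-count" -- see createBlockInMask --
    //  so that update must stay live.)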
7649 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7650 continue; 7651 7652 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7653 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7654 })) 7655 DeadInstructions.insert(IndUpdate); 7656 } 7657 } 7658 7659 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7660 7661 //===--------------------------------------------------------------------===// 7662 // EpilogueVectorizerMainLoop 7663 //===--------------------------------------------------------------------===// 7664 7665 /// This function is partially responsible for generating the control flow 7666 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7667 std::pair<BasicBlock *, Value *> 7668 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7669 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7670 7671 // Workaround! Compute the trip count of the original loop and cache it 7672 // before we start modifying the CFG. This code has a systemic problem 7673 // wherein it tries to run analysis over partially constructed IR; this is 7674 // wrong, and not simply for SCEV. The trip count of the original loop 7675 // simply happens to be prone to hitting this in practice. In theory, we 7676 // can hit the same issue for any SCEV, or ValueTracking query done during 7677 // mutation. See PR49900. 7678 getOrCreateTripCount(OrigLoop->getLoopPreheader()); 7679 createVectorLoopSkeleton(""); 7680 7681 // Generate the code to check the minimum iteration count of the vector 7682 // epilogue (see below). 7683 EPI.EpilogueIterationCountCheck = 7684 emitIterationCountCheck(LoopScalarPreHeader, true); 7685 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7686 7687 // Generate the code to check any assumptions that we've made for SCEV 7688 // expressions. 7689 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7690 7691 // Generate the code that checks at runtime if arrays overlap. We put the 7692 // checks into a separate block to make the more common case of few elements 7693 // faster. 7694 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7695 7696 // Generate the iteration count check for the main loop, *after* the check 7697 // for the epilogue loop, so that the path-length is shorter for the case 7698 // that goes directly through the vector epilogue. The longer-path length for 7699 // the main loop is compensated for, by the gain from vectorizing the larger 7700 // trip count. Note: the branch will get updated later on when we vectorize 7701 // the epilogue. 7702 EPI.MainLoopIterationCountCheck = 7703 emitIterationCountCheck(LoopScalarPreHeader, false); 7704 7705 // Generate the induction variable. 7706 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7707 7708 // Skip induction resume value creation here because they will be created in 7709 // the second pass. If we created them here, they wouldn't be used anyway, 7710 // because the vplan in the second pass still contains the inductions from the 7711 // original loop. 
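  // (They are created in
  //  EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton,
  //  which calls createInductionResumeValues once the epilogue skeleton is in
  //  place.)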
7712 7713 return {completeLoopSkeleton(OrigLoopID), nullptr}; 7714 } 7715 7716 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7717 LLVM_DEBUG({ 7718 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7719 << "Main Loop VF:" << EPI.MainLoopVF 7720 << ", Main Loop UF:" << EPI.MainLoopUF 7721 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7722 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7723 }); 7724 } 7725 7726 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7727 DEBUG_WITH_TYPE(VerboseDebug, { 7728 dbgs() << "intermediate fn:\n" 7729 << *OrigLoop->getHeader()->getParent() << "\n"; 7730 }); 7731 } 7732 7733 BasicBlock * 7734 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7735 bool ForEpilogue) { 7736 assert(Bypass && "Expected valid bypass basic block."); 7737 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7738 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7739 Value *Count = getOrCreateTripCount(LoopVectorPreHeader); 7740 // Reuse existing vector loop preheader for TC checks. 7741 // Note that new preheader block is generated for vector loop. 7742 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7743 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7744 7745 // Generate code to check if the loop's trip count is less than VF * UF of the 7746 // main vector loop. 7747 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? 7748 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7749 7750 Value *CheckMinIters = Builder.CreateICmp( 7751 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7752 "min.iters.check"); 7753 7754 if (!ForEpilogue) 7755 TCCheckBlock->setName("vector.main.loop.iter.check"); 7756 7757 // Create new preheader for vector loop. 7758 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7759 DT, LI, nullptr, "vector.ph"); 7760 7761 if (ForEpilogue) { 7762 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7763 DT->getNode(Bypass)->getIDom()) && 7764 "TC check is expected to dominate Bypass"); 7765 7766 // Update dominator for Bypass & LoopExit. 7767 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7768 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7769 // For loops with multiple exits, there's no edge from the middle block 7770 // to exit blocks (as the epilogue must run) and thus no need to update 7771 // the immediate dominator of the exit blocks. 7772 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7773 7774 LoopBypassBlocks.push_back(TCCheckBlock); 7775 7776 // Save the trip count so we don't have to regenerate it in the 7777 // vec.epilog.iter.check. This is safe to do because the trip count 7778 // generated here dominates the vector epilog iter check. 7779 EPI.TripCount = Count; 7780 } 7781 7782 ReplaceInstWithInst( 7783 TCCheckBlock->getTerminator(), 7784 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7785 7786 return TCCheckBlock; 7787 } 7788 7789 //===--------------------------------------------------------------------===// 7790 // EpilogueVectorizerEpilogueLoop 7791 //===--------------------------------------------------------------------===// 7792 7793 /// This function is partially responsible for generating the control flow 7794 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
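/// Schematically, the check emitted below either falls through into the
/// epilogue vector loop or bypasses it (block names as created below):
///
///   vec.epilog.iter.check -+-> vec.epilog.ph -> epilogue vector loop
///                          |
///                          +-> scalar preheader -> scalar remainder loop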
7795 std::pair<BasicBlock *, Value *> 7796 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 7797 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7798 createVectorLoopSkeleton("vec.epilog."); 7799 7800 // Now, compare the remaining count and if there aren't enough iterations to 7801 // execute the vectorized epilogue skip to the scalar part. 7802 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 7803 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 7804 LoopVectorPreHeader = 7805 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 7806 LI, nullptr, "vec.epilog.ph"); 7807 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, 7808 VecEpilogueIterationCountCheck); 7809 7810 // Adjust the control flow taking the state info from the main loop 7811 // vectorization into account. 7812 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 7813 "expected this to be saved from the previous pass."); 7814 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 7815 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 7816 7817 DT->changeImmediateDominator(LoopVectorPreHeader, 7818 EPI.MainLoopIterationCountCheck); 7819 7820 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 7821 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7822 7823 if (EPI.SCEVSafetyCheck) 7824 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 7825 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7826 if (EPI.MemSafetyCheck) 7827 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 7828 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 7829 7830 DT->changeImmediateDominator( 7831 VecEpilogueIterationCountCheck, 7832 VecEpilogueIterationCountCheck->getSinglePredecessor()); 7833 7834 DT->changeImmediateDominator(LoopScalarPreHeader, 7835 EPI.EpilogueIterationCountCheck); 7836 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF)) 7837 // If there is an epilogue which must run, there's no edge from the 7838 // middle block to exit blocks and thus no need to update the immediate 7839 // dominator of the exit blocks. 7840 DT->changeImmediateDominator(LoopExitBlock, 7841 EPI.EpilogueIterationCountCheck); 7842 7843 // Keep track of bypass blocks, as they feed start values to the induction 7844 // phis in the scalar loop preheader. 7845 if (EPI.SCEVSafetyCheck) 7846 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 7847 if (EPI.MemSafetyCheck) 7848 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 7849 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 7850 7851 // The vec.epilog.iter.check block may contain Phi nodes from reductions which 7852 // merge control-flow from the latch block and the middle block. Update the 7853 // incoming values here and move the Phi into the preheader. 
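  // Schematically (illustrative names), a merge phi such as
  //   %bc.merge.rdx = phi i32 [ %rdx.result, %middle.block ], [ %init, ... ]
  // has its incoming values from the bypass blocks dropped below and is then
  // moved into the vec.epilog.ph block split off above.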
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues({VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7909 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7910 7911 Value *CheckMinIters = 7912 Builder.CreateICmp(P, Count, 7913 createStepForVF(Builder, Count->getType(), 7914 EPI.EpilogueVF, EPI.EpilogueUF), 7915 "min.epilog.iters.check"); 7916 7917 ReplaceInstWithInst( 7918 Insert->getTerminator(), 7919 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 7920 7921 LoopBypassBlocks.push_back(Insert); 7922 return Insert; 7923 } 7924 7925 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 7926 LLVM_DEBUG({ 7927 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 7928 << "Epilogue Loop VF:" << EPI.EpilogueVF 7929 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7930 }); 7931 } 7932 7933 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 7934 DEBUG_WITH_TYPE(VerboseDebug, { 7935 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n"; 7936 }); 7937 } 7938 7939 bool LoopVectorizationPlanner::getDecisionAndClampRange( 7940 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 7941 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 7942 bool PredicateAtRangeStart = Predicate(Range.Start); 7943 7944 for (ElementCount TmpVF = Range.Start * 2; 7945 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 7946 if (Predicate(TmpVF) != PredicateAtRangeStart) { 7947 Range.End = TmpVF; 7948 break; 7949 } 7950 7951 return PredicateAtRangeStart; 7952 } 7953 7954 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 7955 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 7956 /// of VF's starting at a given VF and extending it as much as possible. Each 7957 /// vectorization decision can potentially shorten this sub-range during 7958 /// buildVPlan(). 7959 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 7960 ElementCount MaxVF) { 7961 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 7962 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 7963 VFRange SubRange = {VF, MaxVFPlusOne}; 7964 VPlans.push_back(buildVPlan(SubRange)); 7965 VF = SubRange.End; 7966 } 7967 } 7968 7969 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 7970 VPlanPtr &Plan) { 7971 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 7972 7973 // Look for cached value. 7974 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 7975 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 7976 if (ECEntryIt != EdgeMaskCache.end()) 7977 return ECEntryIt->second; 7978 7979 VPValue *SrcMask = createBlockInMask(Src, Plan); 7980 7981 // The terminator has to be a branch inst! 7982 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 7983 assert(BI && "Unexpected terminator found"); 7984 7985 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 7986 return EdgeMaskCache[Edge] = SrcMask; 7987 7988 // If source is an exiting block, we know the exit edge is dynamically dead 7989 // in the vector loop, and thus we don't need to restrict the mask. Avoid 7990 // adding uses of an otherwise potentially dead instruction. 
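  // (E.g. the original latch exit compare: the vector loop's latch uses its
  //  own canonical-IV compare, so the scalar exit condition need not feed any
  //  edge mask.)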
7991 if (OrigLoop->isLoopExiting(Src)) 7992 return EdgeMaskCache[Edge] = SrcMask; 7993 7994 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 7995 assert(EdgeMask && "No Edge Mask found for condition"); 7996 7997 if (BI->getSuccessor(0) != Dst) 7998 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); 7999 8000 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8001 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8002 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8003 // The select version does not introduce new UB if SrcMask is false and 8004 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8005 VPValue *False = Plan->getOrAddVPValue( 8006 ConstantInt::getFalse(BI->getCondition()->getType())); 8007 EdgeMask = 8008 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); 8009 } 8010 8011 return EdgeMaskCache[Edge] = EdgeMask; 8012 } 8013 8014 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8015 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8016 8017 // Look for cached value. 8018 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8019 if (BCEntryIt != BlockMaskCache.end()) 8020 return BCEntryIt->second; 8021 8022 // All-one mask is modelled as no-mask following the convention for masked 8023 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8024 VPValue *BlockMask = nullptr; 8025 8026 if (OrigLoop->getHeader() == BB) { 8027 if (!CM.blockNeedsPredicationForAnyReason(BB)) 8028 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8029 8030 // Introduce the early-exit compare IV <= BTC to form header block mask. 8031 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8032 // constructing the desired canonical IV in the header block as its first 8033 // non-phi instructions. 8034 assert(CM.foldTailByMasking() && "must fold the tail"); 8035 VPBasicBlock *HeaderVPBB = 8036 Plan->getVectorLoopRegion()->getEntryBasicBlock(); 8037 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); 8038 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV()); 8039 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi()); 8040 8041 VPBuilder::InsertPointGuard Guard(Builder); 8042 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); 8043 if (CM.TTI.emitGetActiveLaneMask()) { 8044 VPValue *TC = Plan->getOrCreateTripCount(); 8045 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC}); 8046 } else { 8047 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8048 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8049 } 8050 return BlockMaskCache[BB] = BlockMask; 8051 } 8052 8053 // This is the block mask. We OR all incoming edges. 8054 for (auto *Predecessor : predecessors(BB)) { 8055 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8056 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8057 return BlockMaskCache[BB] = EdgeMask; 8058 8059 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
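      // This is the first incoming edge: adopt its mask as the block mask;
      // masks of the remaining edges are OR'd into it below.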
8060 BlockMask = EdgeMask; 8061 continue; 8062 } 8063 8064 BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); 8065 } 8066 8067 return BlockMaskCache[BB] = BlockMask; 8068 } 8069 8070 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8071 ArrayRef<VPValue *> Operands, 8072 VFRange &Range, 8073 VPlanPtr &Plan) { 8074 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8075 "Must be called with either a load or store"); 8076 8077 auto willWiden = [&](ElementCount VF) -> bool { 8078 LoopVectorizationCostModel::InstWidening Decision = 8079 CM.getWideningDecision(I, VF); 8080 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8081 "CM decision should be taken at this point."); 8082 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8083 return true; 8084 if (CM.isScalarAfterVectorization(I, VF) || 8085 CM.isProfitableToScalarize(I, VF)) 8086 return false; 8087 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8088 }; 8089 8090 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8091 return nullptr; 8092 8093 VPValue *Mask = nullptr; 8094 if (Legal->isMaskRequired(I)) 8095 Mask = createBlockInMask(I->getParent(), Plan); 8096 8097 // Determine if the pointer operand of the access is either consecutive or 8098 // reverse consecutive. 8099 LoopVectorizationCostModel::InstWidening Decision = 8100 CM.getWideningDecision(I, Range.Start); 8101 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; 8102 bool Consecutive = 8103 Reverse || Decision == LoopVectorizationCostModel::CM_Widen; 8104 8105 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8106 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, 8107 Consecutive, Reverse); 8108 8109 StoreInst *Store = cast<StoreInst>(I); 8110 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8111 Mask, Consecutive, Reverse); 8112 } 8113 8114 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also 8115 /// insert a recipe to expand the step for the induction recipe. 8116 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( 8117 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, 8118 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM, 8119 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { 8120 // Returns true if an instruction \p I should be scalarized instead of 8121 // vectorized for the chosen vectorization factor. 
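  // Used below to decide whether only a scalar IV and scalar steps are needed,
  // i.e. when the widened user (the phi or its truncate) would be scalarized
  // anyway for every VF in the clamped range.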
8122 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { 8123 return CM.isScalarAfterVectorization(I, VF) || 8124 CM.isProfitableToScalarize(I, VF); 8125 }; 8126 8127 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( 8128 [&](ElementCount VF) { 8129 return ShouldScalarizeInstruction(PhiOrTrunc, VF); 8130 }, 8131 Range); 8132 assert(IndDesc.getStartValue() == 8133 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); 8134 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && 8135 "step must be loop invariant"); 8136 8137 VPValue *Step = 8138 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); 8139 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { 8140 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, 8141 !NeedsScalarIVOnly); 8142 } 8143 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); 8144 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, 8145 !NeedsScalarIVOnly); 8146 } 8147 8148 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( 8149 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) { 8150 8151 // Check if this is an integer or fp induction. If so, build the recipe that 8152 // produces its scalar and vector values. 8153 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) 8154 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan, 8155 *PSE.getSE(), *OrigLoop, Range); 8156 8157 // Check if this is pointer induction. If so, build the recipe for it. 8158 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) 8159 return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II, 8160 *PSE.getSE()); 8161 return nullptr; 8162 } 8163 8164 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8165 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) { 8166 // Optimize the special case where the source is a constant integer 8167 // induction variable. Notice that we can only optimize the 'trunc' case 8168 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8169 // (c) other casts depend on pointer size. 8170 8171 // Determine whether \p K is a truncation based on an induction variable that 8172 // can be optimized. 8173 auto isOptimizableIVTruncate = 8174 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8175 return [=](ElementCount VF) -> bool { 8176 return CM.isOptimizableIVTruncate(K, VF); 8177 }; 8178 }; 8179 8180 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8181 isOptimizableIVTruncate(I), Range)) { 8182 8183 auto *Phi = cast<PHINode>(I->getOperand(0)); 8184 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); 8185 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8186 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan, 8187 *PSE.getSE(), *OrigLoop, Range); 8188 } 8189 return nullptr; 8190 } 8191 8192 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8193 ArrayRef<VPValue *> Operands, 8194 VPlanPtr &Plan) { 8195 // If all incoming values are equal, the incoming VPValue can be used directly 8196 // instead of creating a new VPBlendRecipe. 8197 VPValue *FirstIncoming = Operands[0]; 8198 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8199 return FirstIncoming == Inc; 8200 })) { 8201 return Operands[0]; 8202 } 8203 8204 unsigned NumIncoming = Phi->getNumIncomingValues(); 8205 // For in-loop reductions, we do not need to create an additional select. 
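  // (The mask is handled as part of the in-loop reduction itself when the
  //  reduction is conditional, so blending the phi with the reduction result
  //  here would be redundant; below we simply forward the non-phi incoming
  //  operand.)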
8206 VPValue *InLoopVal = nullptr; 8207 for (unsigned In = 0; In < NumIncoming; In++) { 8208 PHINode *PhiOp = 8209 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue()); 8210 if (PhiOp && CM.isInLoopReduction(PhiOp)) { 8211 assert(!InLoopVal && "Found more than one in-loop reduction!"); 8212 InLoopVal = Operands[In]; 8213 } 8214 } 8215 8216 assert((!InLoopVal || NumIncoming == 2) && 8217 "Found an in-loop reduction for PHI with unexpected number of " 8218 "incoming values"); 8219 if (InLoopVal) 8220 return Operands[Operands[0] == InLoopVal ? 1 : 0]; 8221 8222 // We know that all PHIs in non-header blocks are converted into selects, so 8223 // we don't have to worry about the insertion order and we can just use the 8224 // builder. At this point we generate the predication tree. There may be 8225 // duplications since this is a simple recursive scan, but future 8226 // optimizations will clean it up. 8227 SmallVector<VPValue *, 2> OperandsWithMask; 8228 8229 for (unsigned In = 0; In < NumIncoming; In++) { 8230 VPValue *EdgeMask = 8231 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8232 assert((EdgeMask || NumIncoming == 1) && 8233 "Multiple predecessors with one having a full mask"); 8234 OperandsWithMask.push_back(Operands[In]); 8235 if (EdgeMask) 8236 OperandsWithMask.push_back(EdgeMask); 8237 } 8238 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8239 } 8240 8241 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8242 ArrayRef<VPValue *> Operands, 8243 VFRange &Range) const { 8244 8245 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8246 [this, CI](ElementCount VF) { 8247 return CM.isScalarWithPredication(CI, VF); 8248 }, 8249 Range); 8250 8251 if (IsPredicated) 8252 return nullptr; 8253 8254 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8255 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8256 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8257 ID == Intrinsic::pseudoprobe || 8258 ID == Intrinsic::experimental_noalias_scope_decl)) 8259 return nullptr; 8260 8261 auto willWiden = [&](ElementCount VF) -> bool { 8262 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8263 // The following case may be scalarized depending on the VF. 8264 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8265 // version of the instruction. 8266 // Is it beneficial to perform intrinsic call compared to lib call? 8267 bool NeedToScalarize = false; 8268 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8269 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8270 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8271 return UseVectorIntrinsic || !NeedToScalarize; 8272 }; 8273 8274 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8275 return nullptr; 8276 8277 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size()); 8278 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8279 } 8280 8281 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8282 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8283 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8284 // Instruction should be widened, unless it is scalar after vectorization, 8285 // scalarization is profitable or it is predicated. 
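  // Note that the answer must hold for the entire (possibly clamped) VF
  // sub-range; getDecisionAndClampRange shrinks Range at the first VF where
  // the decision flips.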
8286 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8287 return CM.isScalarAfterVectorization(I, VF) || 8288 CM.isProfitableToScalarize(I, VF) || 8289 CM.isScalarWithPredication(I, VF); 8290 }; 8291 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8292 Range); 8293 } 8294 8295 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8296 ArrayRef<VPValue *> Operands) const { 8297 auto IsVectorizableOpcode = [](unsigned Opcode) { 8298 switch (Opcode) { 8299 case Instruction::Add: 8300 case Instruction::And: 8301 case Instruction::AShr: 8302 case Instruction::BitCast: 8303 case Instruction::FAdd: 8304 case Instruction::FCmp: 8305 case Instruction::FDiv: 8306 case Instruction::FMul: 8307 case Instruction::FNeg: 8308 case Instruction::FPExt: 8309 case Instruction::FPToSI: 8310 case Instruction::FPToUI: 8311 case Instruction::FPTrunc: 8312 case Instruction::FRem: 8313 case Instruction::FSub: 8314 case Instruction::ICmp: 8315 case Instruction::IntToPtr: 8316 case Instruction::LShr: 8317 case Instruction::Mul: 8318 case Instruction::Or: 8319 case Instruction::PtrToInt: 8320 case Instruction::SDiv: 8321 case Instruction::Select: 8322 case Instruction::SExt: 8323 case Instruction::Shl: 8324 case Instruction::SIToFP: 8325 case Instruction::SRem: 8326 case Instruction::Sub: 8327 case Instruction::Trunc: 8328 case Instruction::UDiv: 8329 case Instruction::UIToFP: 8330 case Instruction::URem: 8331 case Instruction::Xor: 8332 case Instruction::ZExt: 8333 case Instruction::Freeze: 8334 return true; 8335 } 8336 return false; 8337 }; 8338 8339 if (!IsVectorizableOpcode(I->getOpcode())) 8340 return nullptr; 8341 8342 // Success: widen this instruction. 8343 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8344 } 8345 8346 void VPRecipeBuilder::fixHeaderPhis() { 8347 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8348 for (VPHeaderPHIRecipe *R : PhisToFix) { 8349 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8350 VPRecipeBase *IncR = 8351 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8352 R->addOperand(IncR->getVPSingleValue()); 8353 } 8354 } 8355 8356 VPBasicBlock *VPRecipeBuilder::handleReplication( 8357 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8358 VPlanPtr &Plan) { 8359 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8360 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8361 Range); 8362 8363 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8364 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, 8365 Range); 8366 8367 // Even if the instruction is not marked as uniform, there are certain 8368 // intrinsic calls that can be effectively treated as such, so we check for 8369 // them here. Conservatively, we only do this for scalable vectors, since 8370 // for fixed-width VFs we can always fall back on full scalarization. 8371 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) { 8372 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) { 8373 case Intrinsic::assume: 8374 case Intrinsic::lifetime_start: 8375 case Intrinsic::lifetime_end: 8376 // For scalable vectors if one of the operands is variant then we still 8377 // want to mark as uniform, which will generate one instruction for just 8378 // the first lane of the vector. We can't scalarize the call in the same 8379 // way as for fixed-width vectors because we don't know how many lanes 8380 // there are. 
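      // E.g. (illustrative) a lifetime marker with a loop-variant pointer,
      //   call void @llvm.lifetime.end.p0i8(i64 %size, i8* %obj),
      // is then emitted just once, for the first lane, instead of once per
      // lane.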
      //
      // The reasons for doing it this way for scalable vectors are:
      //  1. For the assume intrinsic generating the instruction for the first
      //     lane is still better than not generating any at all. For
      //     example, the input may be a splat across all lanes.
      //  2. For the lifetime start/end intrinsics the pointer operand only
      //     does anything useful when the input comes from a stack object,
      //     which suggests it should always be uniform. For non-stack objects
      //     the effect is to poison the object, which still allows us to
      //     remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");

  VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
  assert(SingleSucc && "VPBB must have a single successor when handling "
                       "predicated replication.");
  VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
  return RegSucc;
}

VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ?
nullptr 8454 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8455 if (PHIRecipe) { 8456 Plan->removeVPValueFor(Instr); 8457 Plan->addVPValue(Instr, PHIRecipe); 8458 } 8459 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8460 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8461 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); 8462 8463 // Note: first set Entry as region entry and then connect successors starting 8464 // from it in order, to propagate the "parent" of each VPBasicBlock. 8465 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry); 8466 VPBlockUtils::connectBlocks(Pred, Exiting); 8467 8468 return Region; 8469 } 8470 8471 VPRecipeOrVPValueTy 8472 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8473 ArrayRef<VPValue *> Operands, 8474 VFRange &Range, VPlanPtr &Plan) { 8475 // First, check for specific widening recipes that deal with inductions, Phi 8476 // nodes, calls and memory operations. 8477 VPRecipeBase *Recipe; 8478 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8479 if (Phi->getParent() != OrigLoop->getHeader()) 8480 return tryToBlend(Phi, Operands, Plan); 8481 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) 8482 return toVPRecipeResult(Recipe); 8483 8484 VPHeaderPHIRecipe *PhiRecipe = nullptr; 8485 assert((Legal->isReductionVariable(Phi) || 8486 Legal->isFirstOrderRecurrence(Phi)) && 8487 "can only widen reductions and first-order recurrences here"); 8488 VPValue *StartV = Operands[0]; 8489 if (Legal->isReductionVariable(Phi)) { 8490 const RecurrenceDescriptor &RdxDesc = 8491 Legal->getReductionVars().find(Phi)->second; 8492 assert(RdxDesc.getRecurrenceStartValue() == 8493 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8494 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, 8495 CM.isInLoopReduction(Phi), 8496 CM.useOrderedReductions(RdxDesc)); 8497 } else { 8498 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); 8499 } 8500 8501 // Record the incoming value from the backedge, so we can add the incoming 8502 // value from the backedge after all recipes have been created. 8503 recordRecipeOf(cast<Instruction>( 8504 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8505 PhisToFix.push_back(PhiRecipe); 8506 return toVPRecipeResult(PhiRecipe); 8507 } 8508 8509 if (isa<TruncInst>(Instr) && 8510 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8511 Range, *Plan))) 8512 return toVPRecipeResult(Recipe); 8513 8514 // All widen recipes below deal only with VF > 1. 
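  // If the range starts at the scalar VF, the call below clamps the range to
  // the scalar-only sub-range and we bail out; the widening recipes below are
  // then considered for a subsequent sub-range starting at the first vector
  // VF.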
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
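  // For example (illustrative): if a recurrence asked to sink %x after the
  // now-dead induction update %iv.next, walk backwards from %iv.next to the
  // closest preceding live instruction and sink after that one instead.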
8569 for (auto &P : Legal->getSinkAfter()) { 8570 Instruction *SinkTarget = P.second; 8571 Instruction *FirstInst = &*SinkTarget->getParent()->begin(); 8572 (void)FirstInst; 8573 while (DeadInstructions.contains(SinkTarget)) { 8574 assert( 8575 SinkTarget != FirstInst && 8576 "Must find a live instruction (at least the one feeding the " 8577 "first-order recurrence PHI) before reaching beginning of the block"); 8578 SinkTarget = SinkTarget->getPrevNode(); 8579 assert(SinkTarget != P.first && 8580 "sink source equals target, no sinking required"); 8581 } 8582 P.second = SinkTarget; 8583 } 8584 8585 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8586 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8587 VFRange SubRange = {VF, MaxVFPlusOne}; 8588 VPlans.push_back( 8589 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8590 VF = SubRange.End; 8591 } 8592 } 8593 8594 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a 8595 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a 8596 // BranchOnCount VPInstruction to the latch. 8597 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, 8598 bool HasNUW) { 8599 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8600 auto *StartV = Plan.getOrAddVPValue(StartIdx); 8601 8602 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); 8603 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); 8604 VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); 8605 Header->insert(CanonicalIVPHI, Header->begin()); 8606 8607 auto *CanonicalIVIncrement = 8608 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW 8609 : VPInstruction::CanonicalIVIncrement, 8610 {CanonicalIVPHI}, DL); 8611 CanonicalIVPHI->addOperand(CanonicalIVIncrement); 8612 8613 VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); 8614 EB->appendRecipe(CanonicalIVIncrement); 8615 8616 auto *BranchOnCount = 8617 new VPInstruction(VPInstruction::BranchOnCount, 8618 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); 8619 EB->appendRecipe(BranchOnCount); 8620 } 8621 8622 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the 8623 // original exit block. 8624 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, 8625 VPBasicBlock *MiddleVPBB, Loop *OrigLoop, 8626 VPlan &Plan) { 8627 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); 8628 BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); 8629 // Only handle single-exit loops with unique exit blocks for now. 8630 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) 8631 return; 8632 8633 // Introduce VPUsers modeling the exit values. 8634 for (PHINode &ExitPhi : ExitBB->phis()) { 8635 Value *IncomingValue = 8636 ExitPhi.getIncomingValueForBlock(ExitingBB); 8637 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true); 8638 Plan.addLiveOut(&ExitPhi, V); 8639 } 8640 } 8641 8642 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8643 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8644 const MapVector<Instruction *, Instruction *> &SinkAfter) { 8645 8646 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8647 8648 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8649 8650 // --------------------------------------------------------------------------- 8651 // Pre-construction: record ingredients whose recipes we'll need to further 8652 // process after constructing the initial VPlan. 
8653 // --------------------------------------------------------------------------- 8654 8655 // Mark instructions we'll need to sink later and their targets as 8656 // ingredients whose recipe we'll need to record. 8657 for (auto &Entry : SinkAfter) { 8658 RecipeBuilder.recordRecipeOf(Entry.first); 8659 RecipeBuilder.recordRecipeOf(Entry.second); 8660 } 8661 for (auto &Reduction : CM.getInLoopReductionChains()) { 8662 PHINode *Phi = Reduction.first; 8663 RecurKind Kind = 8664 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); 8665 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8666 8667 RecipeBuilder.recordRecipeOf(Phi); 8668 for (auto &R : ReductionOperations) { 8669 RecipeBuilder.recordRecipeOf(R); 8670 // For min/max reductions, where we have a pair of icmp/select, we also 8671 // need to record the ICmp recipe, so it can be removed later. 8672 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) && 8673 "Only min/max recurrences allowed for inloop reductions"); 8674 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8675 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8676 } 8677 } 8678 8679 // For each interleave group which is relevant for this (possibly trimmed) 8680 // Range, add it to the set of groups to be later applied to the VPlan and add 8681 // placeholders for its members' Recipes which we'll be replacing with a 8682 // single VPInterleaveRecipe. 8683 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8684 auto applyIG = [IG, this](ElementCount VF) -> bool { 8685 return (VF.isVector() && // Query is illegal for VF == 1 8686 CM.getWideningDecision(IG->getInsertPos(), VF) == 8687 LoopVectorizationCostModel::CM_Interleave); 8688 }; 8689 if (!getDecisionAndClampRange(applyIG, Range)) 8690 continue; 8691 InterleaveGroups.insert(IG); 8692 for (unsigned i = 0; i < IG->getFactor(); i++) 8693 if (Instruction *Member = IG->getMember(i)) 8694 RecipeBuilder.recordRecipeOf(Member); 8695 }; 8696 8697 // --------------------------------------------------------------------------- 8698 // Build initial VPlan: Scan the body of the loop in a topological order to 8699 // visit each basic block after having visited its predecessor basic blocks. 8700 // --------------------------------------------------------------------------- 8701 8702 // Create initial VPlan skeleton, starting with a block for the pre-header, 8703 // followed by a region for the vector loop, followed by the middle block. The 8704 // skeleton vector loop region contains a header and latch block. 8705 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); 8706 auto Plan = std::make_unique<VPlan>(Preheader); 8707 8708 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8709 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8710 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8711 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); 8712 VPBlockUtils::insertBlockAfter(TopRegion, Preheader); 8713 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); 8714 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); 8715 8716 Instruction *DLInst = 8717 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8718 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), 8719 DLInst ? 
DLInst->getDebugLoc() : DebugLoc(), 8720 !CM.foldTailByMasking()); 8721 8722 // Scan the body of the loop in a topological order to visit each basic block 8723 // after having visited its predecessor basic blocks. 8724 LoopBlocksDFS DFS(OrigLoop); 8725 DFS.perform(LI); 8726 8727 VPBasicBlock *VPBB = HeaderVPBB; 8728 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; 8729 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8730 // Relevant instructions from basic block BB will be grouped into VPRecipe 8731 // ingredients and fill a new VPBasicBlock. 8732 unsigned VPBBsForBB = 0; 8733 if (VPBB != HeaderVPBB) 8734 VPBB->setName(BB->getName()); 8735 Builder.setInsertPoint(VPBB); 8736 8737 // Introduce each ingredient into VPlan. 8738 // TODO: Model and preserve debug intrinsics in VPlan. 8739 for (Instruction &I : BB->instructionsWithoutDebug()) { 8740 Instruction *Instr = &I; 8741 8742 // First filter out irrelevant instructions, to ensure no recipes are 8743 // built for them. 8744 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8745 continue; 8746 8747 SmallVector<VPValue *, 4> Operands; 8748 auto *Phi = dyn_cast<PHINode>(Instr); 8749 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8750 Operands.push_back(Plan->getOrAddVPValue( 8751 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8752 } else { 8753 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8754 Operands = {OpRange.begin(), OpRange.end()}; 8755 } 8756 8757 // Invariant stores inside loop will be deleted and a single store 8758 // with the final reduction value will be added to the exit block 8759 StoreInst *SI; 8760 if ((SI = dyn_cast<StoreInst>(&I)) && 8761 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8762 continue; 8763 8764 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8765 Instr, Operands, Range, Plan)) { 8766 // If Instr can be simplified to an existing VPValue, use it. 8767 if (RecipeOrValue.is<VPValue *>()) { 8768 auto *VPV = RecipeOrValue.get<VPValue *>(); 8769 Plan->addVPValue(Instr, VPV); 8770 // If the re-used value is a recipe, register the recipe for the 8771 // instruction, in case the recipe for Instr needs to be recorded. 8772 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 8773 RecipeBuilder.setRecipe(Instr, R); 8774 continue; 8775 } 8776 // Otherwise, add the new recipe. 8777 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8778 for (auto *Def : Recipe->definedValues()) { 8779 auto *UV = Def->getUnderlyingValue(); 8780 Plan->addVPValue(UV, Def); 8781 } 8782 8783 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && 8784 HeaderVPBB->getFirstNonPhi() != VPBB->end()) { 8785 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section 8786 // of the header block. That can happen for truncates of induction 8787 // variables. Those recipes are moved to the phi section of the header 8788 // block after applying SinkAfter, which relies on the original 8789 // position of the trunc. 8790 assert(isa<TruncInst>(Instr)); 8791 InductionsToMove.push_back( 8792 cast<VPWidenIntOrFpInductionRecipe>(Recipe)); 8793 } 8794 RecipeBuilder.setRecipe(Instr, Recipe); 8795 VPBB->appendRecipe(Recipe); 8796 continue; 8797 } 8798 8799 // Otherwise, if all widening options failed, Instruction is to be 8800 // replicated. This may create a successor for VPBB. 
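      // handleReplication emits a VPReplicateRecipe; if the instruction also
      // needs predication it is wrapped in a replicate region
      // (pred.<opcode>.entry/if/continue), in which case a fresh successor
      // block is returned and recipe emission continues there.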
8801 VPBasicBlock *NextVPBB = 8802 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8803 if (NextVPBB != VPBB) { 8804 VPBB = NextVPBB; 8805 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8806 : ""); 8807 } 8808 } 8809 8810 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8811 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8812 } 8813 8814 HeaderVPBB->setName("vector.body"); 8815 8816 // Fold the last, empty block into its predecessor. 8817 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); 8818 assert(VPBB && "expected to fold last (empty) block"); 8819 // After here, VPBB should not be used. 8820 VPBB = nullptr; 8821 8822 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan); 8823 8824 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8825 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8826 "entry block must be set to a VPRegionBlock having a non-empty entry " 8827 "VPBasicBlock"); 8828 RecipeBuilder.fixHeaderPhis(); 8829 8830 // --------------------------------------------------------------------------- 8831 // Transform initial VPlan: Apply previously taken decisions, in order, to 8832 // bring the VPlan to its final state. 8833 // --------------------------------------------------------------------------- 8834 8835 // Apply Sink-After legal constraints. 8836 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 8837 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 8838 if (Region && Region->isReplicator()) { 8839 assert(Region->getNumSuccessors() == 1 && 8840 Region->getNumPredecessors() == 1 && "Expected SESE region!"); 8841 assert(R->getParent()->size() == 1 && 8842 "A recipe in an original replicator region must be the only " 8843 "recipe in its block"); 8844 return Region; 8845 } 8846 return nullptr; 8847 }; 8848 for (auto &Entry : SinkAfter) { 8849 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8850 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8851 8852 auto *TargetRegion = GetReplicateRegion(Target); 8853 auto *SinkRegion = GetReplicateRegion(Sink); 8854 if (!SinkRegion) { 8855 // If the sink source is not a replicate region, sink the recipe directly. 8856 if (TargetRegion) { 8857 // The target is in a replication region, make sure to move Sink to 8858 // the block after it, not into the replication region itself. 8859 VPBasicBlock *NextBlock = 8860 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 8861 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8862 } else 8863 Sink->moveAfter(Target); 8864 continue; 8865 } 8866 8867 // The sink source is in a replicate region. Unhook the region from the CFG. 8868 auto *SinkPred = SinkRegion->getSinglePredecessor(); 8869 auto *SinkSucc = SinkRegion->getSingleSuccessor(); 8870 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); 8871 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); 8872 VPBlockUtils::connectBlocks(SinkPred, SinkSucc); 8873 8874 if (TargetRegion) { 8875 // The target recipe is also in a replicate region, move the sink region 8876 // after the target region. 
8877 auto *TargetSucc = TargetRegion->getSingleSuccessor(); 8878 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); 8879 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); 8880 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); 8881 } else { 8882 // The sink source is in a replicate region, we need to move the whole 8883 // replicate region, which should only contain a single recipe in the 8884 // main block. 8885 auto *SplitBlock = 8886 Target->getParent()->splitAt(std::next(Target->getIterator())); 8887 8888 auto *SplitPred = SplitBlock->getSinglePredecessor(); 8889 8890 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 8891 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 8892 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 8893 } 8894 } 8895 8896 VPlanTransforms::removeRedundantCanonicalIVs(*Plan); 8897 VPlanTransforms::removeRedundantInductionCasts(*Plan); 8898 8899 // Now that sink-after is done, move induction recipes for optimized truncates 8900 // to the phi section of the header block. 8901 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) 8902 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8903 8904 // Adjust the recipes for any inloop reductions. 8905 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan, 8906 RecipeBuilder, Range.Start); 8907 8908 // Introduce a recipe to combine the incoming and previous values of a 8909 // first-order recurrence. 8910 for (VPRecipeBase &R : 8911 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 8912 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R); 8913 if (!RecurPhi) 8914 continue; 8915 8916 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe(); 8917 VPBasicBlock *InsertBlock = PrevRecipe->getParent(); 8918 auto *Region = GetReplicateRegion(PrevRecipe); 8919 if (Region) 8920 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor()); 8921 if (!InsertBlock) { 8922 InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); 8923 VPBlockUtils::insertBlockAfter(InsertBlock, Region); 8924 } 8925 if (Region || PrevRecipe->isPhi()) 8926 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); 8927 else 8928 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); 8929 8930 auto *RecurSplice = cast<VPInstruction>( 8931 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, 8932 {RecurPhi, RecurPhi->getBackedgeValue()})); 8933 8934 RecurPhi->replaceAllUsesWith(RecurSplice); 8935 // Set the first operand of RecurSplice to RecurPhi again, after replacing 8936 // all users. 8937 RecurSplice->setOperand(0, RecurPhi); 8938 } 8939 8940 // Interleave memory: for each Interleave Group we marked earlier as relevant 8941 // for this VPlan, replace the Recipes widening its memory instructions with a 8942 // single VPInterleaveRecipe at its insertion point. 
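  // For example (illustrative), an interleave group covering two consecutive
  // loads A[2*i] and A[2*i+1] has both widened-load recipes replaced by one
  // VPInterleaveRecipe, which emits a single wide load followed by shuffles
  // that de-interleave the even and odd lanes.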
8943 for (auto IG : InterleaveGroups) { 8944 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8945 RecipeBuilder.getRecipe(IG->getInsertPos())); 8946 SmallVector<VPValue *, 4> StoredValues; 8947 for (unsigned i = 0; i < IG->getFactor(); ++i) 8948 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8949 auto *StoreR = 8950 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 8951 StoredValues.push_back(StoreR->getStoredValue()); 8952 } 8953 8954 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8955 Recipe->getMask()); 8956 VPIG->insertBefore(Recipe); 8957 unsigned J = 0; 8958 for (unsigned i = 0; i < IG->getFactor(); ++i) 8959 if (Instruction *Member = IG->getMember(i)) { 8960 if (!Member->getType()->isVoidTy()) { 8961 VPValue *OriginalV = Plan->getVPValue(Member); 8962 Plan->removeVPValueFor(Member); 8963 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8964 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8965 J++; 8966 } 8967 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8968 } 8969 } 8970 8971 std::string PlanName; 8972 raw_string_ostream RSO(PlanName); 8973 ElementCount VF = Range.Start; 8974 Plan->addVF(VF); 8975 RSO << "Initial VPlan for VF={" << VF; 8976 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8977 Plan->addVF(VF); 8978 RSO << "," << VF; 8979 } 8980 RSO << "},UF>=1"; 8981 RSO.flush(); 8982 Plan->setName(PlanName); 8983 8984 // From this point onwards, VPlan-to-VPlan transformations may change the plan 8985 // in ways that accessing values using original IR values is incorrect. 8986 Plan->disableValue2VPValue(); 8987 8988 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE()); 8989 VPlanTransforms::sinkScalarOperands(*Plan); 8990 VPlanTransforms::mergeReplicateRegions(*Plan); 8991 VPlanTransforms::removeDeadRecipes(*Plan); 8992 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan); 8993 8994 // Fold Exit block into its predecessor if possible. 8995 // TODO: Fold block earlier once all VPlan transforms properly maintain a 8996 // VPBasicBlock as exit. 8997 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting()); 8998 8999 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); 9000 return Plan; 9001 } 9002 9003 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9004 // Outer loop handling: They may require CFG and instruction level 9005 // transformations before even evaluating whether vectorization is profitable. 9006 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9007 // the vectorization pipeline. 9008 assert(!OrigLoop->isInnermost()); 9009 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9010 9011 // Create new empty VPlan 9012 auto Plan = std::make_unique<VPlan>(); 9013 9014 // Build hierarchical CFG 9015 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9016 HCFGBuilder.buildHierarchicalCFG(); 9017 9018 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9019 VF *= 2) 9020 Plan->addVF(VF); 9021 9022 SmallPtrSet<Instruction *, 1> DeadInstructions; 9023 VPlanTransforms::VPInstructionsToVPRecipes( 9024 OrigLoop, Plan, 9025 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 9026 DeadInstructions, *PSE.getSE()); 9027 9028 // Remove the existing terminator of the exiting block of the top-most region. 9029 // A BranchOnCount will be added instead when adding the canonical IV recipes. 
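  // (The canonical IV increment added below is created with NUW, since the
  // VPlan-native path does not fold the tail by masking here.)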
  auto *Term =
      Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  Term->eraseFromParent();

  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
                        true);
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc =
        Legal->getReductionVars().find(Phi)->second;
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
             "Expected instruction to be a call to the llvm.fmuladd intrinsic");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
                (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;

      if (IsFMulAdd) {
        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe to use as the vector operand for the
        // fadd reduction.
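        // E.g. (illustrative): for an in-loop chain acc = fmuladd(a, b, acc),
        // the widened fmul of the vectorized a and b becomes VecOp, and the
        // VPReductionRecipe created below keeps acc as its scalar chain
        // operand.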
9095 VPInstruction *FMulRecipe = new VPInstruction( 9096 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))}); 9097 FMulRecipe->setFastMathFlags(R->getFastMathFlags()); 9098 WidenRecipe->getParent()->insert(FMulRecipe, 9099 WidenRecipe->getIterator()); 9100 VecOp = FMulRecipe; 9101 } 9102 VPReductionRecipe *RedRecipe = 9103 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9104 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9105 Plan->removeVPValueFor(R); 9106 Plan->addVPValue(R, RedRecipe); 9107 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9108 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9109 WidenRecipe->eraseFromParent(); 9110 9111 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9112 VPRecipeBase *CompareRecipe = 9113 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9114 assert(isa<VPWidenRecipe>(CompareRecipe) && 9115 "Expected to replace a VPWidenSC"); 9116 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9117 "Expected no remaining users"); 9118 CompareRecipe->eraseFromParent(); 9119 } 9120 Chain = R; 9121 } 9122 } 9123 9124 // If tail is folded by masking, introduce selects between the phi 9125 // and the live-out instruction of each reduction, at the beginning of the 9126 // dedicated latch block. 9127 if (CM.foldTailByMasking()) { 9128 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); 9129 for (VPRecipeBase &R : 9130 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9131 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9132 if (!PhiR || PhiR->isInLoop()) 9133 continue; 9134 VPValue *Cond = 9135 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9136 VPValue *Red = PhiR->getBackedgeValue(); 9137 assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB && 9138 "reduction recipe must be defined before latch"); 9139 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR}); 9140 } 9141 } 9142 } 9143 9144 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9145 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9146 VPSlotTracker &SlotTracker) const { 9147 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9148 IG->getInsertPos()->printAsOperand(O, false); 9149 O << ", "; 9150 getAddr()->printAsOperand(O, SlotTracker); 9151 VPValue *Mask = getMask(); 9152 if (Mask) { 9153 O << ", "; 9154 Mask->printAsOperand(O, SlotTracker); 9155 } 9156 9157 unsigned OpIdx = 0; 9158 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9159 if (!IG->getMember(i)) 9160 continue; 9161 if (getNumStoreOperands() > 0) { 9162 O << "\n" << Indent << " store "; 9163 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9164 O << " to index " << i; 9165 } else { 9166 O << "\n" << Indent << " "; 9167 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9168 O << " = load from index " << i; 9169 } 9170 ++OpIdx; 9171 } 9172 } 9173 #endif 9174 9175 void VPWidenCallRecipe::execute(VPTransformState &State) { 9176 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9177 *this, State); 9178 } 9179 9180 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9181 auto &I = *cast<SelectInst>(getUnderlyingInstr()); 9182 State.ILV->setDebugLocFromInst(&I); 9183 9184 // The condition can be loop invariant but still defined inside the 9185 // loop. This means that we can't just use the original 'cond' value. 9186 // We have to take the 'vectorized' value and pick the first lane. 
9187 // Instcombine will make this a no-op. 9188 auto *InvarCond = 9189 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr; 9190 9191 for (unsigned Part = 0; Part < State.UF; ++Part) { 9192 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part); 9193 Value *Op0 = State.get(getOperand(1), Part); 9194 Value *Op1 = State.get(getOperand(2), Part); 9195 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1); 9196 State.set(this, Sel, Part); 9197 State.addMetadata(Sel, &I); 9198 } 9199 } 9200 9201 void VPWidenRecipe::execute(VPTransformState &State) { 9202 auto &I = *cast<Instruction>(getUnderlyingValue()); 9203 auto &Builder = State.Builder; 9204 switch (I.getOpcode()) { 9205 case Instruction::Call: 9206 case Instruction::Br: 9207 case Instruction::PHI: 9208 case Instruction::GetElementPtr: 9209 case Instruction::Select: 9210 llvm_unreachable("This instruction is handled by a different recipe."); 9211 case Instruction::UDiv: 9212 case Instruction::SDiv: 9213 case Instruction::SRem: 9214 case Instruction::URem: 9215 case Instruction::Add: 9216 case Instruction::FAdd: 9217 case Instruction::Sub: 9218 case Instruction::FSub: 9219 case Instruction::FNeg: 9220 case Instruction::Mul: 9221 case Instruction::FMul: 9222 case Instruction::FDiv: 9223 case Instruction::FRem: 9224 case Instruction::Shl: 9225 case Instruction::LShr: 9226 case Instruction::AShr: 9227 case Instruction::And: 9228 case Instruction::Or: 9229 case Instruction::Xor: { 9230 // Just widen unops and binops. 9231 State.ILV->setDebugLocFromInst(&I); 9232 9233 for (unsigned Part = 0; Part < State.UF; ++Part) { 9234 SmallVector<Value *, 2> Ops; 9235 for (VPValue *VPOp : operands()) 9236 Ops.push_back(State.get(VPOp, Part)); 9237 9238 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 9239 9240 if (auto *VecOp = dyn_cast<Instruction>(V)) { 9241 VecOp->copyIRFlags(&I); 9242 9243 // If the instruction is vectorized and was in a basic block that needed 9244 // predication, we can't propagate poison-generating flags (nuw/nsw, 9245 // exact, etc.). The control flow has been linearized and the 9246 // instruction is no longer guarded by the predicate, which could make 9247 // the flag properties to no longer hold. 9248 if (State.MayGeneratePoisonRecipes.contains(this)) 9249 VecOp->dropPoisonGeneratingFlags(); 9250 } 9251 9252 // Use this vector value for all users of the original instruction. 9253 State.set(this, V, Part); 9254 State.addMetadata(V, &I); 9255 } 9256 9257 break; 9258 } 9259 case Instruction::Freeze: { 9260 State.ILV->setDebugLocFromInst(&I); 9261 9262 for (unsigned Part = 0; Part < State.UF; ++Part) { 9263 Value *Op = State.get(getOperand(0), Part); 9264 9265 Value *Freeze = Builder.CreateFreeze(Op); 9266 State.set(this, Freeze, Part); 9267 } 9268 break; 9269 } 9270 case Instruction::ICmp: 9271 case Instruction::FCmp: { 9272 // Widen compares. Generate vector compares. 9273 bool FCmp = (I.getOpcode() == Instruction::FCmp); 9274 auto *Cmp = cast<CmpInst>(&I); 9275 State.ILV->setDebugLocFromInst(Cmp); 9276 for (unsigned Part = 0; Part < State.UF; ++Part) { 9277 Value *A = State.get(getOperand(0), Part); 9278 Value *B = State.get(getOperand(1), Part); 9279 Value *C = nullptr; 9280 if (FCmp) { 9281 // Propagate fast math flags. 
9282 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 9283 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 9284 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 9285 } else { 9286 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 9287 } 9288 State.set(this, C, Part); 9289 State.addMetadata(C, &I); 9290 } 9291 9292 break; 9293 } 9294 9295 case Instruction::ZExt: 9296 case Instruction::SExt: 9297 case Instruction::FPToUI: 9298 case Instruction::FPToSI: 9299 case Instruction::FPExt: 9300 case Instruction::PtrToInt: 9301 case Instruction::IntToPtr: 9302 case Instruction::SIToFP: 9303 case Instruction::UIToFP: 9304 case Instruction::Trunc: 9305 case Instruction::FPTrunc: 9306 case Instruction::BitCast: { 9307 auto *CI = cast<CastInst>(&I); 9308 State.ILV->setDebugLocFromInst(CI); 9309 9310 /// Vectorize casts. 9311 Type *DestTy = (State.VF.isScalar()) 9312 ? CI->getType() 9313 : VectorType::get(CI->getType(), State.VF); 9314 9315 for (unsigned Part = 0; Part < State.UF; ++Part) { 9316 Value *A = State.get(getOperand(0), Part); 9317 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 9318 State.set(this, Cast, Part); 9319 State.addMetadata(Cast, &I); 9320 } 9321 break; 9322 } 9323 default: 9324 // This instruction is not vectorized by simple widening. 9325 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 9326 llvm_unreachable("Unhandled instruction!"); 9327 } // end of switch. 9328 } 9329 9330 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9331 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr()); 9332 // Construct a vector GEP by widening the operands of the scalar GEP as 9333 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 9334 // results in a vector of pointers when at least one operand of the GEP 9335 // is vector-typed. Thus, to keep the representation compact, we only use 9336 // vector-typed operands for loop-varying values. 9337 9338 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 9339 // If we are vectorizing, but the GEP has only loop-invariant operands, 9340 // the GEP we build (by only using vector-typed operands for 9341 // loop-varying values) would be a scalar pointer. Thus, to ensure we 9342 // produce a vector of pointers, we need to either arbitrarily pick an 9343 // operand to broadcast, or broadcast a clone of the original GEP. 9344 // Here, we broadcast a clone of the original. 9345 // 9346 // TODO: If at some point we decide to scalarize instructions having 9347 // loop-invariant operands, this special case will no longer be 9348 // required. We would add the scalarization decision to 9349 // collectLoopScalars() and teach getVectorValue() to broadcast 9350 // the lane-zero scalar value. 9351 auto *Clone = State.Builder.Insert(GEP->clone()); 9352 for (unsigned Part = 0; Part < State.UF; ++Part) { 9353 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone); 9354 State.set(this, EntryPart, Part); 9355 State.addMetadata(EntryPart, GEP); 9356 } 9357 } else { 9358 // If the GEP has at least one loop-varying operand, we are sure to 9359 // produce a vector of pointers. But if we are only unrolling, we want 9360 // to produce a scalar GEP for each unroll part. Thus, the GEP we 9361 // produce with the code below will be scalar (if VF == 1) or vector 9362 // (otherwise). Note that for the unroll-only case, we still maintain 9363 // values in the vector mapping with initVector, as we do for other 9364 // instructions. 
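    // E.g. (illustrative) a GEP with one loop-varying index becomes, per part,
    //   getelementptr T, T* %base, <VF x i64> %wide.index
    // i.e. a vector of pointers that a gather/scatter or interleaved access
    // can consume.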
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // If the GEP instruction is vectorized and was in a basic block that
      // needed predication, we can't propagate the poison-generating
      // 'inbounds' flag. The control flow has been linearized and the GEP is
      // no longer guarded by the predicate, which could make the 'inbounds'
      // properties no longer hold.
      bool IsInBounds =
          GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", IsInBounds);
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
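  // Sketch of the expansion below: splat Start, add <0, 1, ..., VF-1> * Step
  // to form the initial vector IV in the preheader, then create a header phi
  // that each unroll part advances by another VF * Step (SplatVF); the last
  // stepped value feeds the phi's backedge.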
9423 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9424 9425 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 9426 "Expected either an induction phi-node or a truncate of it!"); 9427 9428 // Construct the initial value of the vector IV in the vector loop preheader 9429 auto CurrIP = Builder.saveIP(); 9430 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9431 Builder.SetInsertPoint(VectorPH->getTerminator()); 9432 if (isa<TruncInst>(EntryVal)) { 9433 assert(Start->getType()->isIntegerTy() && 9434 "Truncation requires an integer type"); 9435 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 9436 Step = Builder.CreateTrunc(Step, TruncType); 9437 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 9438 } 9439 9440 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); 9441 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); 9442 Value *SteppedStart = getStepVector( 9443 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder); 9444 9445 // We create vector phi nodes for both integer and floating-point induction 9446 // variables. Here, we determine the kind of arithmetic we will perform. 9447 Instruction::BinaryOps AddOp; 9448 Instruction::BinaryOps MulOp; 9449 if (Step->getType()->isIntegerTy()) { 9450 AddOp = Instruction::Add; 9451 MulOp = Instruction::Mul; 9452 } else { 9453 AddOp = ID.getInductionOpcode(); 9454 MulOp = Instruction::FMul; 9455 } 9456 9457 // Multiply the vectorization factor by the step using integer or 9458 // floating-point arithmetic as appropriate. 9459 Type *StepType = Step->getType(); 9460 Value *RuntimeVF; 9461 if (Step->getType()->isFloatingPointTy()) 9462 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); 9463 else 9464 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); 9465 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 9466 9467 // Create a vector splat to use in the induction update. 9468 // 9469 // FIXME: If the step is non-constant, we create the vector splat with 9470 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 9471 // handle a constant vector splat. 9472 Value *SplatVF = isa<Constant>(Mul) 9473 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) 9474 : Builder.CreateVectorSplat(State.VF, Mul); 9475 Builder.restoreIP(CurrIP); 9476 9477 // We may need to add the step a number of times, depending on the unroll 9478 // factor. The last of those goes into the PHI. 9479 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 9480 &*State.CFG.PrevBB->getFirstInsertionPt()); 9481 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 9482 Instruction *LastInduction = VecInd; 9483 for (unsigned Part = 0; Part < State.UF; ++Part) { 9484 State.set(this, LastInduction, Part); 9485 9486 if (isa<TruncInst>(EntryVal)) 9487 State.addMetadata(LastInduction, EntryVal); 9488 9489 LastInduction = cast<Instruction>( 9490 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 9491 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 9492 } 9493 9494 LastInduction->setName("vec.ind.next"); 9495 VecInd->addIncoming(SteppedStart, VectorPH); 9496 // Add induction update using an incorrect block temporarily. The phi node 9497 // will be fixed after VPlan execution. Note that at this point the latch 9498 // block cannot be used, as it does not exist yet. 9499 // TODO: Model increment value in VPlan, by turning the recipe into a 9500 // multi-def and a subclass of VPHeaderPHIRecipe. 
9501 VecInd->addIncoming(LastInduction, VectorPH); 9502 } 9503 9504 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9505 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9506 "Not a pointer induction according to InductionDescriptor!"); 9507 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9508 "Unexpected type."); 9509 9510 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9511 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9512 9513 if (onlyScalarsGenerated(State.VF)) { 9514 // This is the normalized GEP that starts counting at zero. 9515 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9516 CanonicalIV, IndDesc.getStep()->getType()); 9517 // Determine the number of scalars we need to generate for each unroll 9518 // iteration. If the instruction is uniform, we only need to generate the 9519 // first lane. Otherwise, we generate all VF values. 9520 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9521 assert((IsUniform || !State.VF.isScalable()) && 9522 "Cannot scalarize a scalable VF"); 9523 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9524 9525 for (unsigned Part = 0; Part < State.UF; ++Part) { 9526 Value *PartStart = 9527 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9528 9529 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9530 Value *Idx = State.Builder.CreateAdd( 9531 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9532 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9533 9534 Value *Step = CreateStepValue(IndDesc.getStep(), SE, 9535 State.CFG.PrevBB->getTerminator()); 9536 Value *SclrGep = emitTransformedIndex( 9537 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc); 9538 SclrGep->setName("next.gep"); 9539 State.set(this, SclrGep, VPIteration(Part, Lane)); 9540 } 9541 } 9542 return; 9543 } 9544 9545 assert(isa<SCEVConstant>(IndDesc.getStep()) && 9546 "Induction step not a SCEV constant!"); 9547 Type *PhiType = IndDesc.getStep()->getType(); 9548 9549 // Build a pointer phi 9550 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9551 Type *ScStValueType = ScalarStartValue->getType(); 9552 PHINode *NewPointerPhi = 9553 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9554 9555 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9556 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9557 9558 // A pointer induction, performed by using a gep 9559 const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout(); 9560 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9561 9562 const SCEV *ScalarStep = IndDesc.getStep(); 9563 SCEVExpander Exp(SE, DL, "induction"); 9564 Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 9565 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9566 Value *NumUnrolledElems = 9567 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9568 Value *InductionGEP = GetElementPtrInst::Create( 9569 IndDesc.getElementType(), NewPointerPhi, 9570 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9571 InductionLoc); 9572 // Add induction update using an incorrect block temporarily. The phi node 9573 // will be fixed after VPlan execution. Note that at this point the latch 9574 // block cannot be used, as it does not exist yet. 9575 // TODO: Model increment value in VPlan, by turning the recipe into a 9576 // multi-def and a subclass of VPHeaderPHIRecipe. 
9577 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9578 9579 // Create UF many actual address geps that use the pointer 9580 // phi as base and a vectorized version of the step value 9581 // (<step*0, ..., step*N>) as offset. 9582 for (unsigned Part = 0; Part < State.UF; ++Part) { 9583 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9584 Value *StartOffsetScalar = 9585 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9586 Value *StartOffset = 9587 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9588 // Create a vector of consecutive numbers from zero to VF. 9589 StartOffset = State.Builder.CreateAdd( 9590 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9591 9592 Value *GEP = State.Builder.CreateGEP( 9593 IndDesc.getElementType(), NewPointerPhi, 9594 State.Builder.CreateMul( 9595 StartOffset, 9596 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9597 "vector.gep")); 9598 State.set(this, GEP, Part); 9599 } 9600 } 9601 9602 void VPScalarIVStepsRecipe::execute(VPTransformState &State) { 9603 assert(!State.Instance && "VPScalarIVStepsRecipe being replicated."); 9604 9605 // Fast-math-flags propagate from the original induction instruction. 9606 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9607 if (IndDesc.getInductionBinOp() && 9608 isa<FPMathOperator>(IndDesc.getInductionBinOp())) 9609 State.Builder.setFastMathFlags( 9610 IndDesc.getInductionBinOp()->getFastMathFlags()); 9611 9612 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9613 auto CreateScalarIV = [&](Value *&Step) -> Value * { 9614 Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9615 auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0); 9616 if (!isCanonical() || CanonicalIV->getType() != Ty) { 9617 ScalarIV = 9618 Ty->isIntegerTy() 9619 ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty) 9620 : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty); 9621 ScalarIV = emitTransformedIndex(State.Builder, ScalarIV, 9622 getStartValue()->getLiveInIRValue(), Step, 9623 IndDesc); 9624 ScalarIV->setName("offset.idx"); 9625 } 9626 if (TruncToTy) { 9627 assert(Step->getType()->isIntegerTy() && 9628 "Truncation requires an integer step"); 9629 ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy); 9630 Step = State.Builder.CreateTrunc(Step, TruncToTy); 9631 } 9632 return ScalarIV; 9633 }; 9634 9635 Value *ScalarIV = CreateScalarIV(Step); 9636 if (State.VF.isVector()) { 9637 buildScalarSteps(ScalarIV, Step, IndDesc, this, State); 9638 return; 9639 } 9640 9641 for (unsigned Part = 0; Part < State.UF; ++Part) { 9642 assert(!State.VF.isScalable() && "scalable vectors not yet supported."); 9643 Value *EntryPart; 9644 if (Step->getType()->isFloatingPointTy()) { 9645 Value *StartIdx = 9646 getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part); 9647 // Floating-point operations inherit FMF via the builder's flags. 
9648 Value *MulOp = State.Builder.CreateFMul(StartIdx, Step); 9649 EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(), 9650 ScalarIV, MulOp); 9651 } else { 9652 Value *StartIdx = 9653 getRuntimeVF(State.Builder, Step->getType(), State.VF * Part); 9654 EntryPart = State.Builder.CreateAdd( 9655 ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction"); 9656 } 9657 State.set(this, EntryPart, Part); 9658 } 9659 } 9660 9661 void VPBlendRecipe::execute(VPTransformState &State) { 9662 State.ILV->setDebugLocFromInst(Phi); 9663 // We know that all PHIs in non-header blocks are converted into 9664 // selects, so we don't have to worry about the insertion order and we 9665 // can just use the builder. 9666 // At this point we generate the predication tree. There may be 9667 // duplications since this is a simple recursive scan, but future 9668 // optimizations will clean it up. 9669 9670 unsigned NumIncoming = getNumIncomingValues(); 9671 9672 // Generate a sequence of selects of the form: 9673 // SELECT(Mask3, In3, 9674 // SELECT(Mask2, In2, 9675 // SELECT(Mask1, In1, 9676 // In0))) 9677 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9678 // are essentially undef are taken from In0. 9679 InnerLoopVectorizer::VectorParts Entry(State.UF); 9680 for (unsigned In = 0; In < NumIncoming; ++In) { 9681 for (unsigned Part = 0; Part < State.UF; ++Part) { 9682 // We might have single edge PHIs (blocks) - use an identity 9683 // 'select' for the first PHI operand. 9684 Value *In0 = State.get(getIncomingValue(In), Part); 9685 if (In == 0) 9686 Entry[Part] = In0; // Initialize with the first incoming value. 9687 else { 9688 // Select between the current value and the previous incoming edge 9689 // based on the incoming mask. 9690 Value *Cond = State.get(getMask(In), Part); 9691 Entry[Part] = 9692 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9693 } 9694 } 9695 } 9696 for (unsigned Part = 0; Part < State.UF; ++Part) 9697 State.set(this, Entry[Part], Part); 9698 } 9699 9700 void VPInterleaveRecipe::execute(VPTransformState &State) { 9701 assert(!State.Instance && "Interleave group being replicated."); 9702 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9703 getStoredValues(), getMask()); 9704 } 9705 9706 void VPReductionRecipe::execute(VPTransformState &State) { 9707 assert(!State.Instance && "Reduction being replicated."); 9708 Value *PrevInChain = State.get(getChainOp(), 0); 9709 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9710 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); 9711 // Propagate the fast-math flags carried by the underlying instruction. 
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Value *Iden = RdxDesc->getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Value *IdenVec =
          State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
          PrevInChain);
    State.set(this, NextInChain, Part);
  }
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
                                    IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9778 assert((!State.VF.isScalable() || IsUniform) && 9779 "Can't scalarize a scalable vector"); 9780 for (unsigned Part = 0; Part < State.UF; ++Part) 9781 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9782 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, 9783 VPIteration(Part, Lane), IsPredicated, 9784 State); 9785 } 9786 9787 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9788 assert(State.Instance && "Branch on Mask works only on single instance."); 9789 9790 unsigned Part = State.Instance->Part; 9791 unsigned Lane = State.Instance->Lane.getKnownLane(); 9792 9793 Value *ConditionBit = nullptr; 9794 VPValue *BlockInMask = getMask(); 9795 if (BlockInMask) { 9796 ConditionBit = State.get(BlockInMask, Part); 9797 if (ConditionBit->getType()->isVectorTy()) 9798 ConditionBit = State.Builder.CreateExtractElement( 9799 ConditionBit, State.Builder.getInt32(Lane)); 9800 } else // Block in mask is all-one. 9801 ConditionBit = State.Builder.getTrue(); 9802 9803 // Replace the temporary unreachable terminator with a new conditional branch, 9804 // whose two destinations will be set later when they are created. 9805 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9806 assert(isa<UnreachableInst>(CurrentTerminator) && 9807 "Expected to replace unreachable terminator with conditional branch."); 9808 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9809 CondBr->setSuccessor(0, nullptr); 9810 ReplaceInstWithInst(CurrentTerminator, CondBr); 9811 } 9812 9813 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9814 assert(State.Instance && "Predicated instruction PHI works per instance."); 9815 Instruction *ScalarPredInst = 9816 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9817 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9818 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9819 assert(PredicatingBB && "Predicated block has no single predecessor."); 9820 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9821 "operand must be VPReplicateRecipe"); 9822 9823 // By current pack/unpack logic we need to generate only a single phi node: if 9824 // a vector value for the predicated instruction exists at this point it means 9825 // the instruction has vector users only, and a phi for the vector value is 9826 // needed. In this case the recipe of the predicated instruction is marked to 9827 // also do that packing, thereby "hoisting" the insert-element sequence. 9828 // Otherwise, a phi node for the scalar value is needed. 9829 unsigned Part = State.Instance->Part; 9830 if (State.hasVectorValue(getOperand(0), Part)) { 9831 Value *VectorValue = State.get(getOperand(0), Part); 9832 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9833 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9834 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9835 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9836 if (State.hasVectorValue(this, Part)) 9837 State.reset(this, VPhi, Part); 9838 else 9839 State.set(this, VPhi, Part); 9840 // NOTE: Currently we need to update the value of the operand, so the next 9841 // predicated iteration inserts its generated value in the correct vector. 
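// (Illustration: with VF=2, VPhi merges the unmodified vector coming from the
// predicating block with the vector that received lane 0's insertelement in
// the predicated block; resetting the operand to VPhi below makes lane 1's
// insertelement chain off the merged vector rather than the pre-merge one.)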
9842 State.reset(getOperand(0), VPhi, Part); 9843 } else { 9844 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9845 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9846 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9847 PredicatingBB); 9848 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9849 if (State.hasScalarValue(this, *State.Instance)) 9850 State.reset(this, Phi, *State.Instance); 9851 else 9852 State.set(this, Phi, *State.Instance); 9853 // NOTE: Currently we need to update the value of the operand, so the next 9854 // predicated iteration inserts its generated value in the correct vector. 9855 State.reset(getOperand(0), Phi, *State.Instance); 9856 } 9857 } 9858 9859 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9860 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9861 9862 // Attempt to issue a wide load. 9863 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9864 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9865 9866 assert((LI || SI) && "Invalid Load/Store instruction"); 9867 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9868 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9869 9870 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9871 9872 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9873 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9874 bool CreateGatherScatter = !Consecutive; 9875 9876 auto &Builder = State.Builder; 9877 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9878 bool isMaskRequired = getMask(); 9879 if (isMaskRequired) 9880 for (unsigned Part = 0; Part < State.UF; ++Part) 9881 BlockInMaskParts[Part] = State.get(getMask(), Part); 9882 9883 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 9884 // Calculate the pointer for the specific unroll-part. 9885 GetElementPtrInst *PartPtr = nullptr; 9886 9887 bool InBounds = false; 9888 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 9889 InBounds = gep->isInBounds(); 9890 if (Reverse) { 9891 // If the address is consecutive but reversed, then the 9892 // wide store needs to start at the last vector element. 9893 // RunTimeVF = VScale * VF.getKnownMinValue() 9894 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 9895 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF); 9896 // NumElt = -Part * RunTimeVF 9897 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 9898 // LastLane = 1 - RunTimeVF 9899 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 9900 PartPtr = 9901 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 9902 PartPtr->setIsInBounds(InBounds); 9903 PartPtr = cast<GetElementPtrInst>( 9904 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 9905 PartPtr->setIsInBounds(InBounds); 9906 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 
9907 BlockInMaskParts[Part] = 9908 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); 9909 } else { 9910 Value *Increment = 9911 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part); 9912 PartPtr = cast<GetElementPtrInst>( 9913 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 9914 PartPtr->setIsInBounds(InBounds); 9915 } 9916 9917 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 9918 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 9919 }; 9920 9921 // Handle Stores: 9922 if (SI) { 9923 State.ILV->setDebugLocFromInst(SI); 9924 9925 for (unsigned Part = 0; Part < State.UF; ++Part) { 9926 Instruction *NewSI = nullptr; 9927 Value *StoredVal = State.get(StoredValue, Part); 9928 if (CreateGatherScatter) { 9929 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9930 Value *VectorGep = State.get(getAddr(), Part); 9931 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9932 MaskPart); 9933 } else { 9934 if (Reverse) { 9935 // If we store to reverse consecutive memory locations, then we need 9936 // to reverse the order of elements in the stored value. 9937 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9938 // We don't want to update the value in the map as it might be used in 9939 // another expression. So don't call resetVectorValue(StoredVal). 9940 } 9941 auto *VecPtr = 9942 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9943 if (isMaskRequired) 9944 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9945 BlockInMaskParts[Part]); 9946 else 9947 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9948 } 9949 State.addMetadata(NewSI, SI); 9950 } 9951 return; 9952 } 9953 9954 // Handle loads. 9955 assert(LI && "Must have a load instruction"); 9956 State.ILV->setDebugLocFromInst(LI); 9957 for (unsigned Part = 0; Part < State.UF; ++Part) { 9958 Value *NewLI; 9959 if (CreateGatherScatter) { 9960 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9961 Value *VectorGep = State.get(getAddr(), Part); 9962 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9963 nullptr, "wide.masked.gather"); 9964 State.addMetadata(NewLI, LI); 9965 } else { 9966 auto *VecPtr = 9967 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); 9968 if (isMaskRequired) 9969 NewLI = Builder.CreateMaskedLoad( 9970 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9971 PoisonValue::get(DataTy), "wide.masked.load"); 9972 else 9973 NewLI = 9974 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9975 9976 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9977 State.addMetadata(NewLI, LI); 9978 if (Reverse) 9979 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9980 } 9981 9982 State.set(getVPSingleValue(), NewLI, Part); 9983 } 9984 } 9985 9986 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9987 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9988 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9989 // for predication. 
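// For example (illustrative): a function compiled with -Os takes case 1 and is
// denied a scalar epilogue outright, while an explicit
// PreferPredicateTy::PredicateOrDontVectorize setting takes case 2 and forces
// the loop to be either fully predicated (tail-folded) or not vectorized at
// all; the hints and the TTI hook are only consulted when the earlier cases do
// not decide.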
9990 static ScalarEpilogueLowering getScalarEpilogueLowering( 9991 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9992 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9993 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9994 LoopVectorizationLegality &LVL) { 9995 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9996 // don't look at hints or options, and don't request a scalar epilogue. 9997 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9998 // LoopAccessInfo (due to code dependency and not being able to reliably get 9999 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 10000 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 10001 // versioning when the vectorization is forced, unlike hasOptSize. So revert 10002 // back to the old way and vectorize with versioning when forced. See D81345.) 10003 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 10004 PGSOQueryType::IRPass) && 10005 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 10006 return CM_ScalarEpilogueNotAllowedOptSize; 10007 10008 // 2) If set, obey the directives 10009 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 10010 switch (PreferPredicateOverEpilogue) { 10011 case PreferPredicateTy::ScalarEpilogue: 10012 return CM_ScalarEpilogueAllowed; 10013 case PreferPredicateTy::PredicateElseScalarEpilogue: 10014 return CM_ScalarEpilogueNotNeededUsePredicate; 10015 case PreferPredicateTy::PredicateOrDontVectorize: 10016 return CM_ScalarEpilogueNotAllowedUsePredicate; 10017 }; 10018 } 10019 10020 // 3) If set, obey the hints 10021 switch (Hints.getPredicate()) { 10022 case LoopVectorizeHints::FK_Enabled: 10023 return CM_ScalarEpilogueNotNeededUsePredicate; 10024 case LoopVectorizeHints::FK_Disabled: 10025 return CM_ScalarEpilogueAllowed; 10026 }; 10027 10028 // 4) if the TTI hook indicates this is profitable, request predication. 10029 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 10030 LVL.getLAI())) 10031 return CM_ScalarEpilogueNotNeededUsePredicate; 10032 10033 return CM_ScalarEpilogueAllowed; 10034 } 10035 10036 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 10037 // If Values have been set for this Def return the one relevant for \p Part. 10038 if (hasVectorValue(Def, Part)) 10039 return Data.PerPartOutput[Def][Part]; 10040 10041 if (!hasScalarValue(Def, {Part, 0})) { 10042 Value *IRV = Def->getLiveInIRValue(); 10043 Value *B = ILV->getBroadcastInstrs(IRV); 10044 set(Def, B, Part); 10045 return B; 10046 } 10047 10048 Value *ScalarValue = get(Def, {Part, 0}); 10049 // If we aren't vectorizing, we can just copy the scalar map values over 10050 // to the vector map. 10051 if (VF.isScalar()) { 10052 set(Def, ScalarValue, Part); 10053 return ScalarValue; 10054 } 10055 10056 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 10057 bool IsUniform = RepR && RepR->isUniform(); 10058 10059 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 10060 // Check if there is a scalar value for the selected lane. 10061 if (!hasScalarValue(Def, {Part, LastLane})) { 10062 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 
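// (For example, a VPScalarIVStepsRecipe that is uniform defines only lane 0 of
// each part, so LastLane is reset to 0 here and the single scalar is simply
// broadcast further below.)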
10063 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
10064 isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
10065 "unexpected recipe found to be invariant");
10066 IsUniform = true;
10067 LastLane = 0;
10068 }
10069
10070 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10071 // Set the insert point after the last scalarized instruction or after the
10072 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10073 // will directly follow the scalar definitions.
10074 auto OldIP = Builder.saveIP();
10075 auto NewIP =
10076 isa<PHINode>(LastInst)
10077 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10078 : std::next(BasicBlock::iterator(LastInst));
10079 Builder.SetInsertPoint(&*NewIP);
10080
10081 // However, if we are vectorizing, we need to construct the vector values.
10082 // If the value is known to be uniform after vectorization, we can just
10083 // broadcast the scalar value corresponding to lane zero for each unroll
10084 // iteration. Otherwise, we construct the vector values using
10085 // insertelement instructions. Since the resulting vectors are stored in
10086 // State, we will only generate the insertelements once.
10087 Value *VectorValue = nullptr;
10088 if (IsUniform) {
10089 VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10090 set(Def, VectorValue, Part);
10091 } else {
10092 // Initialize packing with insertelements to start from poison.
10093 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10094 Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10095 set(Def, Poison, Part);
10096 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10097 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10098 VectorValue = get(Def, Part);
10099 }
10100 Builder.restoreIP(OldIP);
10101 return VectorValue;
10102 }
10103
10104 // Process the loop in the VPlan-native vectorization path. This path builds
10105 // VPlan upfront in the vectorization pipeline, which allows applying
10106 // VPlan-to-VPlan transformations from the very beginning without modifying the
10107 // input LLVM IR.
10108 static bool processLoopInVPlanNativePath(
10109 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10110 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10111 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10112 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10113 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10114 LoopVectorizationRequirements &Requirements) {
10115
10116 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10117 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10118 return false;
10119 }
10120 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10121 Function *F = L->getHeader()->getParent();
10122 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10123
10124 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10125 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10126
10127 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10128 &Hints, IAI);
10129 // Use the planner for outer loop vectorization.
10130 // TODO: CM is not used at this point inside the planner. Turn CM into an
10131 // optional argument if we don't need it in the future.
10132 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10133 Requirements, ORE);
10134
10135 // Get user vectorization factor.
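// For example (illustrative), '#pragma clang loop vectorize_width(4)' reaches
// this point as a fixed UserVF of 4; in the VPlan-native path this
// user-provided factor, when given, is what planInVPlanNativePath plans with.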
10136 ElementCount UserVF = Hints.getWidth();
10137
10138 CM.collectElementTypesForWidening();
10139
10140 // Plan how to best vectorize, return the best VF and its cost.
10141 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10142
10143 // If we are stress testing VPlan builds, do not attempt to generate vector
10144 // code. Masked vector code generation support will follow soon.
10145 // Also, do not attempt to vectorize if no vector code will be produced.
10146 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10147 return false;
10148
10149 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10150
10151 {
10152 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10153 F->getParent()->getDataLayout());
10154 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10155 &CM, BFI, PSI, Checks);
10156 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10157 << L->getHeader()->getParent()->getName() << "\"\n");
10158 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10159 }
10160
10161 // Mark the loop as already vectorized to avoid vectorizing again.
10162 Hints.setAlreadyVectorized();
10163 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10164 return true;
10165 }
10166
10167 // Emit a remark if there are stores to floats that required a floating point
10168 // extension. If the vectorized loop was generated at a wider floating point
10169 // precision there will be a performance penalty from the conversion overhead
10170 // and the change in the vector width.
10171 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10172 SmallVector<Instruction *, 4> Worklist;
10173 for (BasicBlock *BB : L->getBlocks()) {
10174 for (Instruction &Inst : *BB) {
10175 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10176 if (S->getValueOperand()->getType()->isFloatTy())
10177 Worklist.push_back(S);
10178 }
10179 }
10180 }
10181
10182 // Traverse the floating point stores upwards, searching for floating point
10183 // conversions.
10184 SmallPtrSet<const Instruction *, 4> Visited;
10185 SmallPtrSet<const Instruction *, 4> EmittedRemark;
10186 while (!Worklist.empty()) {
10187 auto *I = Worklist.pop_back_val();
10188 if (!L->contains(I))
10189 continue;
10190 if (!Visited.insert(I).second)
10191 continue;
10192
10193 // Emit a remark if the floating point store required a floating
10194 // point conversion.
10195 // TODO: More work could be done to identify the root cause such as a
10196 // constant or a function return type and point the user to it.
10197 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10198 ORE->emit([&]() {
10199 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10200 I->getDebugLoc(), L->getHeader())
10201 << "floating point conversion changes vector width. "
10202 << "Mixed floating point precision requires an up/down "
10203 << "cast that will negatively impact performance.";
10204 });
10205
10206 for (Use &Op : I->operands())
10207 if (auto *OpI = dyn_cast<Instruction>(Op))
10208 Worklist.push_back(OpI);
10209 }
10210 }
10211
10212 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10213 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10214 !EnableLoopInterleaving),
10215 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10216 !EnableLoopVectorization) {}
10217
10218 bool LoopVectorizePass::processLoop(Loop *L) {
10219 assert((EnableVPlanNativePath || L->isInnermost()) &&
10220 "VPlan-native path is not enabled. 
Only process inner loops."); 10221 10222 #ifndef NDEBUG 10223 const std::string DebugLocStr = getDebugLocString(L); 10224 #endif /* NDEBUG */ 10225 10226 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 10227 << L->getHeader()->getParent()->getName() << "' from " 10228 << DebugLocStr << "\n"); 10229 10230 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 10231 10232 LLVM_DEBUG( 10233 dbgs() << "LV: Loop hints:" 10234 << " force=" 10235 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 10236 ? "disabled" 10237 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 10238 ? "enabled" 10239 : "?")) 10240 << " width=" << Hints.getWidth() 10241 << " interleave=" << Hints.getInterleave() << "\n"); 10242 10243 // Function containing loop 10244 Function *F = L->getHeader()->getParent(); 10245 10246 // Looking at the diagnostic output is the only way to determine if a loop 10247 // was vectorized (other than looking at the IR or machine code), so it 10248 // is important to generate an optimization remark for each loop. Most of 10249 // these messages are generated as OptimizationRemarkAnalysis. Remarks 10250 // generated as OptimizationRemark and OptimizationRemarkMissed are 10251 // less verbose reporting vectorized loops and unvectorized loops that may 10252 // benefit from vectorization, respectively. 10253 10254 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 10255 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 10256 return false; 10257 } 10258 10259 PredicatedScalarEvolution PSE(*SE, *L); 10260 10261 // Check if it is legal to vectorize the loop. 10262 LoopVectorizationRequirements Requirements; 10263 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 10264 &Requirements, &Hints, DB, AC, BFI, PSI); 10265 if (!LVL.canVectorize(EnableVPlanNativePath)) { 10266 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 10267 Hints.emitRemarkWithHints(); 10268 return false; 10269 } 10270 10271 // Check the function attributes and profiles to find out if this function 10272 // should be optimized for size. 10273 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 10274 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 10275 10276 // Entrance to the VPlan-native vectorization path. Outer loops are processed 10277 // here. They may require CFG and instruction level transformations before 10278 // even evaluating whether vectorization is profitable. Since we cannot modify 10279 // the incoming IR, we need to build VPlan upfront in the vectorization 10280 // pipeline. 10281 if (!L->isInnermost()) 10282 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 10283 ORE, BFI, PSI, Hints, Requirements); 10284 10285 assert(L->isInnermost() && "Inner loop expected."); 10286 10287 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 10288 // count by optimizing for size, to minimize overheads. 10289 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 10290 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 10291 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 10292 << "This loop is worth vectorizing only if no scalar " 10293 << "iteration overheads are incurred."); 10294 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 10295 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 10296 else { 10297 LLVM_DEBUG(dbgs() << "\n"); 10298 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 10299 } 10300 } 10301 10302 // Check the function attributes to see if implicit floats are allowed. 10303 // FIXME: This check doesn't seem possibly correct -- what if the loop is 10304 // an integer loop and the vector instructions selected are purely integer 10305 // vector instructions? 10306 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 10307 reportVectorizationFailure( 10308 "Can't vectorize when the NoImplicitFloat attribute is used", 10309 "loop not vectorized due to NoImplicitFloat attribute", 10310 "NoImplicitFloat", ORE, L); 10311 Hints.emitRemarkWithHints(); 10312 return false; 10313 } 10314 10315 // Check if the target supports potentially unsafe FP vectorization. 10316 // FIXME: Add a check for the type of safety issue (denormal, signaling) 10317 // for the target we're vectorizing for, to make sure none of the 10318 // additional fp-math flags can help. 10319 if (Hints.isPotentiallyUnsafe() && 10320 TTI->isFPVectorizationPotentiallyUnsafe()) { 10321 reportVectorizationFailure( 10322 "Potentially unsafe FP op prevents vectorization", 10323 "loop not vectorized due to unsafe FP support.", 10324 "UnsafeFP", ORE, L); 10325 Hints.emitRemarkWithHints(); 10326 return false; 10327 } 10328 10329 bool AllowOrderedReductions; 10330 // If the flag is set, use that instead and override the TTI behaviour. 10331 if (ForceOrderedReductions.getNumOccurrences() > 0) 10332 AllowOrderedReductions = ForceOrderedReductions; 10333 else 10334 AllowOrderedReductions = TTI->enableOrderedReductions(); 10335 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 10336 ORE->emit([&]() { 10337 auto *ExactFPMathInst = Requirements.getExactFPInst(); 10338 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 10339 ExactFPMathInst->getDebugLoc(), 10340 ExactFPMathInst->getParent()) 10341 << "loop not vectorized: cannot prove it is safe to reorder " 10342 "floating-point operations"; 10343 }); 10344 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 10345 "reorder floating-point operations\n"); 10346 Hints.emitRemarkWithHints(); 10347 return false; 10348 } 10349 10350 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 10351 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 10352 10353 // If an override option has been passed in for interleaved accesses, use it. 10354 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 10355 UseInterleaved = EnableInterleavedMemAccesses; 10356 10357 // Analyze interleaved memory accesses. 10358 if (UseInterleaved) { 10359 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 10360 } 10361 10362 // Use the cost model. 10363 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 10364 F, &Hints, IAI); 10365 CM.collectValuesToIgnore(); 10366 CM.collectElementTypesForWidening(); 10367 10368 // Use the planner for vectorization. 10369 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 10370 Requirements, ORE); 10371 10372 // Get user vectorization factor and interleave count. 
10373 ElementCount UserVF = Hints.getWidth();
10374 unsigned UserIC = Hints.getInterleave();
10375
10376 // Plan how to best vectorize, return the best VF and its cost.
10377 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10378
10379 VectorizationFactor VF = VectorizationFactor::Disabled();
10380 unsigned IC = 1;
10381
10382 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10383 F->getParent()->getDataLayout());
10384 if (MaybeVF) {
10385 if (LVP.requiresTooManyRuntimeChecks()) {
10386 ORE->emit([&]() {
10387 return OptimizationRemarkAnalysisAliasing(
10388 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10389 L->getHeader())
10390 << "loop not vectorized: cannot prove it is safe to reorder "
10391 "memory operations";
10392 });
10393 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10394 Hints.emitRemarkWithHints();
10395 return false;
10396 }
10397 VF = *MaybeVF;
10398 // Select the interleave count.
10399 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10400
10401 unsigned SelectedIC = std::max(IC, UserIC);
10402 // Optimistically generate runtime checks if they are needed. Drop them if
10403 // they turn out to not be profitable.
10404 if (VF.Width.isVector() || SelectedIC > 1)
10405 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10406 }
10407
10408 // Identify the diagnostic messages that should be produced.
10409 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10410 bool VectorizeLoop = true, InterleaveLoop = true;
10411 if (VF.Width.isScalar()) {
10412 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10413 VecDiagMsg = std::make_pair(
10414 "VectorizationNotBeneficial",
10415 "the cost-model indicates that vectorization is not beneficial");
10416 VectorizeLoop = false;
10417 }
10418
10419 if (!MaybeVF && UserIC > 1) {
10420 // Tell the user interleaving was avoided up-front, despite being explicitly
10421 // requested.
10422 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10423 "interleaving should be avoided up front\n");
10424 IntDiagMsg = std::make_pair(
10425 "InterleavingAvoided",
10426 "Ignoring UserIC, because interleaving was avoided up front");
10427 InterleaveLoop = false;
10428 } else if (IC == 1 && UserIC <= 1) {
10429 // Tell the user interleaving is not beneficial.
10430 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10431 IntDiagMsg = std::make_pair(
10432 "InterleavingNotBeneficial",
10433 "the cost-model indicates that interleaving is not beneficial");
10434 InterleaveLoop = false;
10435 if (UserIC == 1) {
10436 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10437 IntDiagMsg.second +=
10438 " and is explicitly disabled or interleave count is set to 1";
10439 }
10440 } else if (IC > 1 && UserIC == 1) {
10441 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10442 LLVM_DEBUG(
10443 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.\n");
10444 IntDiagMsg = std::make_pair(
10445 "InterleavingBeneficialButDisabled",
10446 "the cost-model indicates that interleaving is beneficial "
10447 "but is explicitly disabled or interleave count is set to 1");
10448 InterleaveLoop = false;
10449 }
10450
10451 // Override IC if user provided an interleave count.
10452 IC = UserIC > 0 ? UserIC : IC;
10453
10454 // Emit diagnostic messages, if any.
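// For instance (illustrative): if the cost model picked IC = 4 but the user
// wrote 'interleave_count(1)', the code above selected the
// 'InterleavingBeneficialButDisabled' message and the code below emits it as
// an analysis remark while still vectorizing; if both VectorizeLoop and
// InterleaveLoop are false, two missed-optimization remarks are emitted and
// the loop is left untouched.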
10455 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10456 if (!VectorizeLoop && !InterleaveLoop) {
10457 // Do not vectorize or interleave the loop.
10458 ORE->emit([&]() {
10459 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10460 L->getStartLoc(), L->getHeader())
10461 << VecDiagMsg.second;
10462 });
10463 ORE->emit([&]() {
10464 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10465 L->getStartLoc(), L->getHeader())
10466 << IntDiagMsg.second;
10467 });
10468 return false;
10469 } else if (!VectorizeLoop && InterleaveLoop) {
10470 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10471 ORE->emit([&]() {
10472 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10473 L->getStartLoc(), L->getHeader())
10474 << VecDiagMsg.second;
10475 });
10476 } else if (VectorizeLoop && !InterleaveLoop) {
10477 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10478 << ") in " << DebugLocStr << '\n');
10479 ORE->emit([&]() {
10480 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10481 L->getStartLoc(), L->getHeader())
10482 << IntDiagMsg.second;
10483 });
10484 } else if (VectorizeLoop && InterleaveLoop) {
10485 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10486 << ") in " << DebugLocStr << '\n');
10487 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10488 }
10489
10490 bool DisableRuntimeUnroll = false;
10491 MDNode *OrigLoopID = L->getLoopID();
10492 {
10493 using namespace ore;
10494 if (!VectorizeLoop) {
10495 assert(IC > 1 && "interleave count should not be 1 or 0");
10496 // If we decided that it is not worthwhile to vectorize the loop, then
10497 // interleave it.
10498 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10499 &CM, BFI, PSI, Checks);
10500
10501 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10502 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10503
10504 ORE->emit([&]() {
10505 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10506 L->getHeader())
10507 << "interleaved loop (interleaved count: "
10508 << NV("InterleaveCount", IC) << ")";
10509 });
10510 } else {
10511 // If we decided that it is *worthwhile* to vectorize the loop, then do it.
10512
10513 // Consider vectorizing the epilogue too if it's profitable.
10514 VectorizationFactor EpilogueVF =
10515 CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10516 if (EpilogueVF.Width.isVector()) {
10517
10518 // The first pass vectorizes the main loop and creates a scalar epilogue
10519 // to be vectorized by executing the plan (potentially with a different
10520 // factor) again shortly afterwards.
10521 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10522 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10523 EPI, &LVL, &CM, BFI, PSI, Checks);
10524
10525 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10526 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10527 DT);
10528 ++LoopsVectorized;
10529
10530 // Second pass vectorizes the epilogue and adjusts the control flow
10531 // edges from the first pass.
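// (Illustrative example: if the main loop was vectorized with VF=8 and the
// epilogue below with VF=4, at most 3 scalar iterations remain after both
// vector loops, instead of up to 7 with a purely scalar remainder.)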
10532 EPI.MainLoopVF = EPI.EpilogueVF; 10533 EPI.MainLoopUF = EPI.EpilogueUF; 10534 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10535 ORE, EPI, &LVL, &CM, BFI, PSI, 10536 Checks); 10537 10538 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10539 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10540 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10541 Header->setName("vec.epilog.vector.body"); 10542 10543 // Ensure that the start values for any VPReductionPHIRecipes are 10544 // updated before vectorising the epilogue loop. 10545 for (VPRecipeBase &R : Header->phis()) { 10546 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10547 if (auto *Resume = MainILV.getReductionResumeValue( 10548 ReductionPhi->getRecurrenceDescriptor())) { 10549 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume); 10550 ReductionPhi->setOperand(0, StartVal); 10551 } 10552 } 10553 } 10554 10555 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10556 DT); 10557 ++LoopsEpilogueVectorized; 10558 10559 if (!MainILV.areSafetyChecksAdded()) 10560 DisableRuntimeUnroll = true; 10561 } else { 10562 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10563 &LVL, &CM, BFI, PSI, Checks); 10564 10565 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10566 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT); 10567 ++LoopsVectorized; 10568 10569 // Add metadata to disable runtime unrolling a scalar loop when there 10570 // are no runtime checks about strides and memory. A scalar loop that is 10571 // rarely used is not worth unrolling. 10572 if (!LB.areSafetyChecksAdded()) 10573 DisableRuntimeUnroll = true; 10574 } 10575 // Report the vectorization decision. 10576 ORE->emit([&]() { 10577 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10578 L->getHeader()) 10579 << "vectorized loop (vectorization width: " 10580 << NV("VectorizationFactor", VF.Width) 10581 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10582 }); 10583 } 10584 10585 if (ORE->allowExtraAnalysis(LV_NAME)) 10586 checkMixedPrecision(L, ORE); 10587 } 10588 10589 Optional<MDNode *> RemainderLoopID = 10590 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10591 LLVMLoopVectorizeFollowupEpilogue}); 10592 if (RemainderLoopID) { 10593 L->setLoopID(RemainderLoopID.getValue()); 10594 } else { 10595 if (DisableRuntimeUnroll) 10596 AddRuntimeUnrollDisableMetaData(L); 10597 10598 // Mark the loop as already vectorized to avoid vectorizing again. 10599 Hints.setAlreadyVectorized(); 10600 } 10601 10602 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10603 return true; 10604 } 10605 10606 LoopVectorizeResult LoopVectorizePass::runImpl( 10607 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10608 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 10609 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 10610 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 10611 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10612 SE = &SE_; 10613 LI = &LI_; 10614 TTI = &TTI_; 10615 DT = &DT_; 10616 BFI = &BFI_; 10617 TLI = TLI_; 10618 AA = &AA_; 10619 AC = &AC_; 10620 GetLAA = &GetLAA_; 10621 DB = &DB_; 10622 ORE = &ORE_; 10623 PSI = PSI_; 10624 10625 // Don't attempt if 10626 // 1. the target claims to have no vector registers, and 10627 // 2. interleaving won't help ILP. 
10628 //
10629 // The second condition is necessary because, even if the target has no
10630 // vector registers, loop vectorization may still enable scalar
10631 // interleaving.
10632 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10633 TTI->getMaxInterleaveFactor(1) < 2)
10634 return LoopVectorizeResult(false, false);
10635
10636 bool Changed = false, CFGChanged = false;
10637
10638 // The vectorizer requires loops to be in simplified form.
10639 // Since simplification may add new inner loops, it has to run before the
10640 // legality and profitability checks. This means running the loop vectorizer
10641 // will simplify all loops, regardless of whether anything ends up being
10642 // vectorized.
10643 for (auto &L : *LI)
10644 Changed |= CFGChanged |=
10645 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10646
10647 // Build up a worklist of inner-loops to vectorize. This is necessary as
10648 // the act of vectorizing or partially unrolling a loop creates new loops
10649 // and can invalidate iterators across the loops.
10650 SmallVector<Loop *, 8> Worklist;
10651
10652 for (Loop *L : *LI)
10653 collectSupportedLoops(*L, LI, ORE, Worklist);
10654
10655 LoopsAnalyzed += Worklist.size();
10656
10657 // Now walk the identified inner loops.
10658 while (!Worklist.empty()) {
10659 Loop *L = Worklist.pop_back_val();
10660
10661 // For the inner loops we actually process, form LCSSA to simplify the
10662 // transform.
10663 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10664
10665 Changed |= CFGChanged |= processLoop(L);
10666 }
10667
10668 // Process each loop nest in the function.
10669 return LoopVectorizeResult(Changed, CFGChanged);
10670 }
10671
10672 PreservedAnalyses LoopVectorizePass::run(Function &F,
10673 FunctionAnalysisManager &AM) {
10674 auto &LI = AM.getResult<LoopAnalysis>(F);
10675 // There are no loops in the function. Return before computing other expensive
10676 // analyses.
10677 if (LI.empty())
10678 return PreservedAnalyses::all();
10679 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10680 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10681 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10682 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10683 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10684 auto &AA = AM.getResult<AAManager>(F);
10685 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10686 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10687 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10688
10689 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10690 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10691 [&](Loop &L) -> const LoopAccessInfo & {
10692 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10693 TLI, TTI, nullptr, nullptr, nullptr};
10694 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10695 };
10696 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10697 ProfileSummaryInfo *PSI =
10698 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10699 LoopVectorizeResult Result =
10700 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10701 if (!Result.MadeAnyChange)
10702 return PreservedAnalyses::all();
10703 PreservedAnalyses PA;
10704
10705 // We currently do not preserve loopinfo/dominator analyses with outer loop
10706 // vectorization. Until this is addressed, mark these analyses as preserved
10707 // only for non-VPlan-native path.
10708 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10709 if (!EnableVPlanNativePath) {
10710 PA.preserve<LoopAnalysis>();
10711 PA.preserve<DominatorTreeAnalysis>();
10712 }
10713
10714 if (Result.MadeCFGChange) {
10715 // Making CFG changes likely means a loop got vectorized. Indicate that
10716 // extra simplification passes should be run.
10717 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10718 // be run if runtime checks have been added.
10719 AM.getResult<ShouldRunExtraVectorPasses>(F);
10720 PA.preserve<ShouldRunExtraVectorPasses>();
10721 } else {
10722 PA.preserveSet<CFGAnalyses>();
10723 }
10724 return PA;
10725 }
10726
10727 void LoopVectorizePass::printPipeline(
10728 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10729 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10730 OS, MapClassName2PassName);
10731
10732 OS << "<";
10733 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10734 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10735 OS << ">";
10736 }
10737
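// For reference (illustrative): with the default options this prints as
// 'loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>' in
// -print-pipeline-passes output.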