//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
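//
// As a rough, editorial sketch of the transformation described above (the
// loop and the vectorization factor VF = 4 below are hypothetical, chosen
// only for illustration), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten into a 'wide' loop whose induction variable is
// bumped by VF each iteration, plus a scalar remainder (epilogue) loop for
// the iterations that do not fill a whole vector:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4) {
//     // One wide iteration: wide loads of b[i..i+3] and c[i..i+3],
//     // a wide add, and a wide store to a[i..i+3].
//     a[i + 0] = b[i + 0] + c[i + 0];
//     a[i + 1] = b[i + 1] + c[i + 1];
//     a[i + 2] = b[i + 2] + c[i + 2];
//     a[i + 3] = b[i + 3] + c[i + 3];
//   }
//   for (; i < n; ++i) // scalar epilogue handles the leftover iterations
//     a[i] = b[i] + c[i];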
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include "llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/MemorySSA.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/Type.h" 120 #include "llvm/IR/Use.h" 121 #include "llvm/IR/User.h" 122 #include "llvm/IR/Value.h" 123 #include "llvm/IR/ValueHandle.h" 124 #include "llvm/IR/Verifier.h" 125 #include "llvm/InitializePasses.h" 126 #include "llvm/Pass.h" 127 #include "llvm/Support/Casting.h" 128 #include "llvm/Support/CommandLine.h" 129 #include "llvm/Support/Compiler.h" 130 #include "llvm/Support/Debug.h" 131 #include "llvm/Support/ErrorHandling.h" 132 #include "llvm/Support/InstructionCost.h" 133 #include "llvm/Support/MathExtras.h" 134 #include "llvm/Support/raw_ostream.h" 135 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 136 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 137 #include "llvm/Transforms/Utils/LoopSimplify.h" 138 #include "llvm/Transforms/Utils/LoopUtils.h" 139 #include "llvm/Transforms/Utils/LoopVersioning.h" 140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 141 #include "llvm/Transforms/Utils/SizeOpts.h" 142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 
143 #include <algorithm> 144 #include <cassert> 145 #include <cstdint> 146 #include <cstdlib> 147 #include <functional> 148 #include <iterator> 149 #include <limits> 150 #include <memory> 151 #include <string> 152 #include <tuple> 153 #include <utility> 154 155 using namespace llvm; 156 157 #define LV_NAME "loop-vectorize" 158 #define DEBUG_TYPE LV_NAME 159 160 #ifndef NDEBUG 161 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 162 #endif 163 164 /// @{ 165 /// Metadata attribute names 166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 167 const char LLVMLoopVectorizeFollowupVectorized[] = 168 "llvm.loop.vectorize.followup_vectorized"; 169 const char LLVMLoopVectorizeFollowupEpilogue[] = 170 "llvm.loop.vectorize.followup_epilogue"; 171 /// @} 172 173 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 176 177 static cl::opt<bool> EnableEpilogueVectorization( 178 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 179 cl::desc("Enable vectorization of epilogue loops.")); 180 181 static cl::opt<unsigned> EpilogueVectorizationForceVF( 182 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 183 cl::desc("When epilogue vectorization is enabled, and a value greater than " 184 "1 is specified, forces the given VF for all applicable epilogue " 185 "loops.")); 186 187 static cl::opt<unsigned> EpilogueVectorizationMinVF( 188 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 189 cl::desc("Only loops with vectorization factor equal to or larger than " 190 "the specified value are considered for epilogue vectorization.")); 191 192 /// Loops with a known constant trip count below this number are vectorized only 193 /// if no scalar iteration overheads are incurred. 194 static cl::opt<unsigned> TinyTripCountVectorThreshold( 195 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 196 cl::desc("Loops with a constant trip count that is smaller than this " 197 "value are vectorized only if no scalar iteration overheads " 198 "are incurred.")); 199 200 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 201 // that predication is preferred, and this lists all options. I.e., the 202 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 203 // and predicate the instructions accordingly. 
If tail-folding fails, there are 204 // different fallback strategies depending on these values: 205 namespace PreferPredicateTy { 206 enum Option { 207 ScalarEpilogue = 0, 208 PredicateElseScalarEpilogue, 209 PredicateOrDontVectorize 210 }; 211 } // namespace PreferPredicateTy 212 213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 214 "prefer-predicate-over-epilogue", 215 cl::init(PreferPredicateTy::ScalarEpilogue), 216 cl::Hidden, 217 cl::desc("Tail-folding and predication preferences over creating a scalar " 218 "epilogue loop."), 219 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 220 "scalar-epilogue", 221 "Don't tail-predicate loops, create scalar epilogue"), 222 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 223 "predicate-else-scalar-epilogue", 224 "prefer tail-folding, create scalar epilogue if tail " 225 "folding fails."), 226 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 227 "predicate-dont-vectorize", 228 "prefers tail-folding, don't attempt vectorization if " 229 "tail-folding fails."))); 230 231 static cl::opt<bool> MaximizeBandwidth( 232 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 233 cl::desc("Maximize bandwidth when selecting vectorization factor which " 234 "will be determined by the smallest type in loop.")); 235 236 static cl::opt<bool> EnableInterleavedMemAccesses( 237 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 238 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 239 240 /// An interleave-group may need masking if it resides in a block that needs 241 /// predication, or in order to mask away gaps. 242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 243 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 244 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 245 246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 247 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 248 cl::desc("We don't interleave loops with a estimated constant trip count " 249 "below this number")); 250 251 static cl::opt<unsigned> ForceTargetNumScalarRegs( 252 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 253 cl::desc("A flag that overrides the target's number of scalar registers.")); 254 255 static cl::opt<unsigned> ForceTargetNumVectorRegs( 256 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 257 cl::desc("A flag that overrides the target's number of vector registers.")); 258 259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 260 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 261 cl::desc("A flag that overrides the target's max interleave factor for " 262 "scalar loops.")); 263 264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 265 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 266 cl::desc("A flag that overrides the target's max interleave factor for " 267 "vectorized loops.")); 268 269 static cl::opt<unsigned> ForceTargetInstructionCost( 270 "force-target-instruction-cost", cl::init(0), cl::Hidden, 271 cl::desc("A flag that overrides the target's expected cost for " 272 "an instruction to a single constant value. 
Mostly " 273 "useful for getting consistent testing.")); 274 275 static cl::opt<bool> ForceTargetSupportsScalableVectors( 276 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 277 cl::desc( 278 "Pretend that scalable vectors are supported, even if the target does " 279 "not support them. This flag should only be used for testing.")); 280 281 static cl::opt<unsigned> SmallLoopCost( 282 "small-loop-cost", cl::init(20), cl::Hidden, 283 cl::desc( 284 "The cost of a loop that is considered 'small' by the interleaver.")); 285 286 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 287 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 288 cl::desc("Enable the use of the block frequency analysis to access PGO " 289 "heuristics minimizing code growth in cold regions and being more " 290 "aggressive in hot regions.")); 291 292 // Runtime interleave loops for load/store throughput. 293 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 294 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 295 cl::desc( 296 "Enable runtime interleaving until load/store ports are saturated")); 297 298 /// Interleave small loops with scalar reductions. 299 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 300 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 301 cl::desc("Enable interleaving for loops with small iteration counts that " 302 "contain scalar reductions to expose ILP.")); 303 304 /// The number of stores in a loop that are allowed to need predication. 305 static cl::opt<unsigned> NumberOfStoresToPredicate( 306 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 307 cl::desc("Max number of stores to be predicated behind an if.")); 308 309 static cl::opt<bool> EnableIndVarRegisterHeur( 310 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 311 cl::desc("Count the induction variable only once when interleaving")); 312 313 static cl::opt<bool> EnableCondStoresVectorization( 314 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 315 cl::desc("Enable if predication of stores during vectorization.")); 316 317 static cl::opt<unsigned> MaxNestedScalarReductionIC( 318 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 319 cl::desc("The maximum interleave count to use when interleaving a scalar " 320 "reduction in a nested loop.")); 321 322 static cl::opt<bool> 323 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 324 cl::Hidden, 325 cl::desc("Prefer in-loop vector reductions, " 326 "overriding the targets preference.")); 327 328 static cl::opt<bool> PreferPredicatedReductionSelect( 329 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 330 cl::desc( 331 "Prefer predicating a reduction operation over an after loop select.")); 332 333 cl::opt<bool> EnableVPlanNativePath( 334 "enable-vplan-native-path", cl::init(false), cl::Hidden, 335 cl::desc("Enable VPlan-native vectorization path with " 336 "support for outer loop vectorization.")); 337 338 // FIXME: Remove this switch once we have divergence analysis. Currently we 339 // assume divergent non-backedge branches when this switch is true. 340 cl::opt<bool> EnableVPlanPredication( 341 "enable-vplan-predication", cl::init(false), cl::Hidden, 342 cl::desc("Enable VPlan-native vectorization path predicator with " 343 "support for outer loop vectorization.")); 344 345 // This flag enables the stress testing of the VPlan H-CFG construction in the 346 // VPlan-native vectorization path. 
It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
418 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 419 return ExpectedTC; 420 421 return None; 422 } 423 424 // Forward declare GeneratedRTChecks. 425 class GeneratedRTChecks; 426 427 namespace llvm { 428 429 /// InnerLoopVectorizer vectorizes loops which contain only one basic 430 /// block to a specified vectorization factor (VF). 431 /// This class performs the widening of scalars into vectors, or multiple 432 /// scalars. This class also implements the following features: 433 /// * It inserts an epilogue loop for handling loops that don't have iteration 434 /// counts that are known to be a multiple of the vectorization factor. 435 /// * It handles the code generation for reduction variables. 436 /// * Scalarization (implementation using scalars) of un-vectorizable 437 /// instructions. 438 /// InnerLoopVectorizer does not perform any vectorization-legality 439 /// checks, and relies on the caller to check for the different legality 440 /// aspects. The InnerLoopVectorizer relies on the 441 /// LoopVectorizationLegality class to provide information about the induction 442 /// and reduction variables that were found to a given vectorization factor. 443 class InnerLoopVectorizer { 444 public: 445 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 446 LoopInfo *LI, DominatorTree *DT, 447 const TargetLibraryInfo *TLI, 448 const TargetTransformInfo *TTI, AssumptionCache *AC, 449 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 450 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 451 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 452 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 453 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 454 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 455 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 456 PSI(PSI), RTChecks(RTChecks) { 457 // Query this against the original loop and save it here because the profile 458 // of the original loop header may change as the transformation happens. 459 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 460 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 461 } 462 463 virtual ~InnerLoopVectorizer() = default; 464 465 /// Create a new empty loop that will contain vectorized instructions later 466 /// on, while the old loop will be used as the scalar remainder. Control flow 467 /// is generated around the vectorized (and scalar epilogue) loops consisting 468 /// of various checks and bypasses. Return the pre-header block of the new 469 /// loop. 470 /// In the case of epilogue vectorization, this function is overriden to 471 /// handle the more complex control flow around the loops. 472 virtual BasicBlock *createVectorizedLoopSkeleton(); 473 474 /// Widen a single instruction within the innermost loop. 475 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, 476 VPTransformState &State); 477 478 /// Widen a single call instruction within the innermost loop. 479 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, 480 VPTransformState &State); 481 482 /// Widen a single select instruction within the innermost loop. 483 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 484 bool InvariantCond, VPTransformState &State); 485 486 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 487 void fixVectorizedLoop(VPTransformState &State); 488 489 // Return true if any runtime check is added. 
490 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 491 492 /// A type for vectorized values in the new loop. Each value from the 493 /// original loop, when vectorized, is represented by UF vector values in the 494 /// new unrolled loop, where UF is the unroll factor. 495 using VectorParts = SmallVector<Value *, 2>; 496 497 /// Vectorize a single GetElementPtrInst based on information gathered and 498 /// decisions taken during planning. 499 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 500 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 501 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 502 503 /// Vectorize a single PHINode in a block. This method handles the induction 504 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 505 /// arbitrary length vectors. 506 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, 507 VPValue *StartV, VPValue *Def, 508 VPTransformState &State); 509 510 /// A helper function to scalarize a single Instruction in the innermost loop. 511 /// Generates a sequence of scalar instances for each lane between \p MinLane 512 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 513 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 514 /// Instr's operands. 515 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, 516 const VPIteration &Instance, bool IfPredicateInstr, 517 VPTransformState &State); 518 519 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 520 /// is provided, the integer induction variable will first be truncated to 521 /// the corresponding type. 522 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, 523 VPValue *Def, VPValue *CastDef, 524 VPTransformState &State); 525 526 /// Construct the vector value of a scalarized value \p V one lane at a time. 527 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 528 VPTransformState &State); 529 530 /// Try to vectorize interleaved access group \p Group with the base address 531 /// given in \p Addr, optionally masking the vector operations if \p 532 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 533 /// values in the vectorized loop. 534 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 535 ArrayRef<VPValue *> VPDefs, 536 VPTransformState &State, VPValue *Addr, 537 ArrayRef<VPValue *> StoredValues, 538 VPValue *BlockInMask = nullptr); 539 540 /// Vectorize Load and Store instructions with the base address given in \p 541 /// Addr, optionally masking the vector operations if \p BlockInMask is 542 /// non-null. Use \p State to translate given VPValues to IR values in the 543 /// vectorized loop. 544 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 545 VPValue *Def, VPValue *Addr, 546 VPValue *StoredValue, VPValue *BlockInMask); 547 548 /// Set the debug location in the builder using the debug location in 549 /// the instruction. 550 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 551 552 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 553 void fixNonInductionPHIs(VPTransformState &State); 554 555 /// Create a broadcast instruction. This method generates a broadcast 556 /// instruction (shuffle) for loop invariant values and for the induction 557 /// value. If this is the induction variable then we extend it to N, N+1, ... 
558 /// this is needed because each iteration in the loop corresponds to a SIMD 559 /// element. 560 virtual Value *getBroadcastInstrs(Value *V); 561 562 protected: 563 friend class LoopVectorizationPlanner; 564 565 /// A small list of PHINodes. 566 using PhiVector = SmallVector<PHINode *, 4>; 567 568 /// A type for scalarized values in the new loop. Each value from the 569 /// original loop, when scalarized, is represented by UF x VF scalar values 570 /// in the new unrolled loop, where UF is the unroll factor and VF is the 571 /// vectorization factor. 572 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 573 574 /// Set up the values of the IVs correctly when exiting the vector loop. 575 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 576 Value *CountRoundDown, Value *EndValue, 577 BasicBlock *MiddleBlock); 578 579 /// Create a new induction variable inside L. 580 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 581 Value *Step, Instruction *DL); 582 583 /// Handle all cross-iteration phis in the header. 584 void fixCrossIterationPHIs(VPTransformState &State); 585 586 /// Fix a first-order recurrence. This is the second phase of vectorizing 587 /// this phi node. 588 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State); 589 590 /// Fix a reduction cross-iteration phi. This is the second phase of 591 /// vectorizing this phi node. 592 void fixReduction(PHINode *Phi, VPTransformState &State); 593 594 /// Clear NSW/NUW flags from reduction instructions if necessary. 595 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 596 VPTransformState &State); 597 598 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 599 /// means we need to add the appropriate incoming value from the middle 600 /// block as exiting edges from the scalar epilogue loop (if present) are 601 /// already in place, and we exit the vector loop exclusively to the middle 602 /// block. 603 void fixLCSSAPHIs(VPTransformState &State); 604 605 /// Iteratively sink the scalarized operands of a predicated instruction into 606 /// the block that was created for it. 607 void sinkScalarOperands(Instruction *PredInst); 608 609 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 610 /// represented as. 611 void truncateToMinimalBitwidths(VPTransformState &State); 612 613 /// This function adds 614 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 615 /// to each vector element of Val. The sequence starts at StartIndex. 616 /// \p Opcode is relevant for FP induction variable. 617 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 618 Instruction::BinaryOps Opcode = 619 Instruction::BinaryOpsEnd); 620 621 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 622 /// variable on which to base the steps, \p Step is the size of the step, and 623 /// \p EntryVal is the value from the original loop that maps to the steps. 624 /// Note that \p EntryVal doesn't have to be an induction variable - it 625 /// can also be a truncate instruction. 626 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 627 const InductionDescriptor &ID, VPValue *Def, 628 VPValue *CastDef, VPTransformState &State); 629 630 /// Create a vector induction phi node based on an existing scalar one. \p 631 /// EntryVal is the value from the original loop that maps to the vector phi 632 /// node, and \p Step is the loop-invariant step. 
If \p EntryVal is a 633 /// truncate instruction, instead of widening the original IV, we widen a 634 /// version of the IV truncated to \p EntryVal's type. 635 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 636 Value *Step, Value *Start, 637 Instruction *EntryVal, VPValue *Def, 638 VPValue *CastDef, 639 VPTransformState &State); 640 641 /// Returns true if an instruction \p I should be scalarized instead of 642 /// vectorized for the chosen vectorization factor. 643 bool shouldScalarizeInstruction(Instruction *I) const; 644 645 /// Returns true if we should generate a scalar version of \p IV. 646 bool needsScalarInduction(Instruction *IV) const; 647 648 /// If there is a cast involved in the induction variable \p ID, which should 649 /// be ignored in the vectorized loop body, this function records the 650 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 651 /// cast. We had already proved that the casted Phi is equal to the uncasted 652 /// Phi in the vectorized loop (under a runtime guard), and therefore 653 /// there is no need to vectorize the cast - the same value can be used in the 654 /// vector loop for both the Phi and the cast. 655 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 656 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 657 /// 658 /// \p EntryVal is the value from the original loop that maps to the vector 659 /// phi node and is used to distinguish what is the IV currently being 660 /// processed - original one (if \p EntryVal is a phi corresponding to the 661 /// original IV) or the "newly-created" one based on the proof mentioned above 662 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 663 /// latter case \p EntryVal is a TruncInst and we must not record anything for 664 /// that IV, but it's error-prone to expect callers of this routine to care 665 /// about that, hence this explicit parameter. 666 void recordVectorLoopValueForInductionCast( 667 const InductionDescriptor &ID, const Instruction *EntryVal, 668 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, 669 unsigned Part, unsigned Lane = UINT_MAX); 670 671 /// Generate a shuffle sequence that will reverse the vector Vec. 672 virtual Value *reverseVector(Value *Vec); 673 674 /// Returns (and creates if needed) the original loop trip count. 675 Value *getOrCreateTripCount(Loop *NewLoop); 676 677 /// Returns (and creates if needed) the trip count of the widened loop. 678 Value *getOrCreateVectorTripCount(Loop *NewLoop); 679 680 /// Returns a bitcasted value to the requested vector type. 681 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 682 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 683 const DataLayout &DL); 684 685 /// Emit a bypass check to see if the vector trip count is zero, including if 686 /// it overflows. 687 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 688 689 /// Emit a bypass check to see if all of the SCEV assumptions we've 690 /// had to make are correct. Returns the block containing the checks or 691 /// nullptr if no checks have been added. 692 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); 693 694 /// Emit bypass checks to check any memory assumptions we may have made. 695 /// Returns the block containing the checks or nullptr if no checks have been 696 /// added. 
697 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 698 699 /// Compute the transformed value of Index at offset StartValue using step 700 /// StepValue. 701 /// For integer induction, returns StartValue + Index * StepValue. 702 /// For pointer induction, returns StartValue[Index * StepValue]. 703 /// FIXME: The newly created binary instructions should contain nsw/nuw 704 /// flags, which can be found from the original scalar operations. 705 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 706 const DataLayout &DL, 707 const InductionDescriptor &ID) const; 708 709 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 710 /// vector loop preheader, middle block and scalar preheader. Also 711 /// allocate a loop object for the new vector loop and return it. 712 Loop *createVectorLoopSkeleton(StringRef Prefix); 713 714 /// Create new phi nodes for the induction variables to resume iteration count 715 /// in the scalar epilogue, from where the vectorized loop left off (given by 716 /// \p VectorTripCount). 717 /// In cases where the loop skeleton is more complicated (eg. epilogue 718 /// vectorization) and the resume values can come from an additional bypass 719 /// block, the \p AdditionalBypass pair provides information about the bypass 720 /// block and the end value on the edge from bypass to this loop. 721 void createInductionResumeValues( 722 Loop *L, Value *VectorTripCount, 723 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 724 725 /// Complete the loop skeleton by adding debug MDs, creating appropriate 726 /// conditional branches in the middle block, preparing the builder and 727 /// running the verifier. Take in the vector loop \p L as argument, and return 728 /// the preheader of the completed vector loop. 729 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 730 731 /// Add additional metadata to \p To that was not present on \p Orig. 732 /// 733 /// Currently this is used to add the noalias annotations based on the 734 /// inserted memchecks. Use this for instructions that are *cloned* into the 735 /// vector loop. 736 void addNewMetadata(Instruction *To, const Instruction *Orig); 737 738 /// Add metadata from one instruction to another. 739 /// 740 /// This includes both the original MDs from \p From and additional ones (\see 741 /// addNewMetadata). Use this for *newly created* instructions in the vector 742 /// loop. 743 void addMetadata(Instruction *To, Instruction *From); 744 745 /// Similar to the previous function but it adds the metadata to a 746 /// vector of instructions. 747 void addMetadata(ArrayRef<Value *> To, Instruction *From); 748 749 /// Allow subclasses to override and print debug traces before/after vplan 750 /// execution, when trace information is requested. 751 virtual void printDebugTracesAtStart(){}; 752 virtual void printDebugTracesAtEnd(){}; 753 754 /// The original loop. 755 Loop *OrigLoop; 756 757 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 758 /// dynamic knowledge to simplify SCEV expressions and converts them to a 759 /// more usable form. 760 PredicatedScalarEvolution &PSE; 761 762 /// Loop Info. 763 LoopInfo *LI; 764 765 /// Dominator Tree. 766 DominatorTree *DT; 767 768 /// Alias Analysis. 769 AAResults *AA; 770 771 /// Target Library Info. 772 const TargetLibraryInfo *TLI; 773 774 /// Target Transform Info. 775 const TargetTransformInfo *TTI; 776 777 /// Assumption Cache. 
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
867 GeneratedRTChecks &RTChecks; 868 }; 869 870 class InnerLoopUnroller : public InnerLoopVectorizer { 871 public: 872 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 873 LoopInfo *LI, DominatorTree *DT, 874 const TargetLibraryInfo *TLI, 875 const TargetTransformInfo *TTI, AssumptionCache *AC, 876 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 877 LoopVectorizationLegality *LVL, 878 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 879 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 880 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 881 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 882 BFI, PSI, Check) {} 883 884 private: 885 Value *getBroadcastInstrs(Value *V) override; 886 Value *getStepVector(Value *Val, int StartIdx, Value *Step, 887 Instruction::BinaryOps Opcode = 888 Instruction::BinaryOpsEnd) override; 889 Value *reverseVector(Value *Vec) override; 890 }; 891 892 /// Encapsulate information regarding vectorization of a loop and its epilogue. 893 /// This information is meant to be updated and used across two stages of 894 /// epilogue vectorization. 895 struct EpilogueLoopVectorizationInfo { 896 ElementCount MainLoopVF = ElementCount::getFixed(0); 897 unsigned MainLoopUF = 0; 898 ElementCount EpilogueVF = ElementCount::getFixed(0); 899 unsigned EpilogueUF = 0; 900 BasicBlock *MainLoopIterationCountCheck = nullptr; 901 BasicBlock *EpilogueIterationCountCheck = nullptr; 902 BasicBlock *SCEVSafetyCheck = nullptr; 903 BasicBlock *MemSafetyCheck = nullptr; 904 Value *TripCount = nullptr; 905 Value *VectorTripCount = nullptr; 906 907 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, 908 unsigned EUF) 909 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), 910 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { 911 assert(EUF == 1 && 912 "A high UF for the epilogue loop is likely not beneficial."); 913 } 914 }; 915 916 /// An extension of the inner loop vectorizer that creates a skeleton for a 917 /// vectorized loop that has its epilogue (residual) also vectorized. 918 /// The idea is to run the vplan on a given loop twice, firstly to setup the 919 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 920 /// from the first step and vectorize the epilogue. This is achieved by 921 /// deriving two concrete strategy classes from this base class and invoking 922 /// them in succession from the loop vectorizer planner. 923 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 924 public: 925 InnerLoopAndEpilogueVectorizer( 926 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 927 DominatorTree *DT, const TargetLibraryInfo *TLI, 928 const TargetTransformInfo *TTI, AssumptionCache *AC, 929 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 930 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 931 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 932 GeneratedRTChecks &Checks) 933 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 934 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 935 Checks), 936 EPI(EPI) {} 937 938 // Override this function to handle the more complex control flow around the 939 // three loops. 
940 BasicBlock *createVectorizedLoopSkeleton() final override { 941 return createEpilogueVectorizedLoopSkeleton(); 942 } 943 944 /// The interface for creating a vectorized skeleton using one of two 945 /// different strategies, each corresponding to one execution of the vplan 946 /// as described above. 947 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; 948 949 /// Holds and updates state information required to vectorize the main loop 950 /// and its epilogue in two separate passes. This setup helps us avoid 951 /// regenerating and recomputing runtime safety checks. It also helps us to 952 /// shorten the iteration-count-check path length for the cases where the 953 /// iteration count of the loop is so small that the main vector loop is 954 /// completely skipped. 955 EpilogueLoopVectorizationInfo &EPI; 956 }; 957 958 /// A specialized derived class of inner loop vectorizer that performs 959 /// vectorization of *main* loops in the process of vectorizing loops and their 960 /// epilogues. 961 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 962 public: 963 EpilogueVectorizerMainLoop( 964 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 965 DominatorTree *DT, const TargetLibraryInfo *TLI, 966 const TargetTransformInfo *TTI, AssumptionCache *AC, 967 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 968 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 969 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 970 GeneratedRTChecks &Check) 971 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 972 EPI, LVL, CM, BFI, PSI, Check) {} 973 /// Implements the interface for creating a vectorized skeleton using the 974 /// *main loop* strategy (ie the first pass of vplan execution). 975 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 976 977 protected: 978 /// Emits an iteration count bypass check once for the main loop (when \p 979 /// ForEpilogue is false) and once for the epilogue loop (when \p 980 /// ForEpilogue is true). 981 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, 982 bool ForEpilogue); 983 void printDebugTracesAtStart() override; 984 void printDebugTracesAtEnd() override; 985 }; 986 987 // A specialized derived class of inner loop vectorizer that performs 988 // vectorization of *epilogue* loops in the process of vectorizing loops and 989 // their epilogues. 990 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 991 public: 992 EpilogueVectorizerEpilogueLoop( 993 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 994 DominatorTree *DT, const TargetLibraryInfo *TLI, 995 const TargetTransformInfo *TTI, AssumptionCache *AC, 996 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 997 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 998 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 999 GeneratedRTChecks &Checks) 1000 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1001 EPI, LVL, CM, BFI, PSI, Checks) {} 1002 /// Implements the interface for creating a vectorized skeleton using the 1003 /// *epilogue loop* strategy (ie the second pass of vplan execution). 
1004 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 1005 1006 protected: 1007 /// Emits an iteration count bypass check after the main vector loop has 1008 /// finished to see if there are any iterations left to execute by either 1009 /// the vector epilogue or the scalar epilogue. 1010 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, 1011 BasicBlock *Bypass, 1012 BasicBlock *Insert); 1013 void printDebugTracesAtStart() override; 1014 void printDebugTracesAtEnd() override; 1015 }; 1016 } // end namespace llvm 1017 1018 /// Look for a meaningful debug location on the instruction or it's 1019 /// operands. 1020 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 1021 if (!I) 1022 return I; 1023 1024 DebugLoc Empty; 1025 if (I->getDebugLoc() != Empty) 1026 return I; 1027 1028 for (Use &Op : I->operands()) { 1029 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 1030 if (OpInst->getDebugLoc() != Empty) 1031 return OpInst; 1032 } 1033 1034 return I; 1035 } 1036 1037 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 1038 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 1039 const DILocation *DIL = Inst->getDebugLoc(); 1040 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 1041 !isa<DbgInfoIntrinsic>(Inst)) { 1042 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1043 auto NewDIL = 1044 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 1045 if (NewDIL) 1046 B.SetCurrentDebugLocation(NewDIL.getValue()); 1047 else 1048 LLVM_DEBUG(dbgs() 1049 << "Failed to create new discriminator: " 1050 << DIL->getFilename() << " Line: " << DIL->getLine()); 1051 } 1052 else 1053 B.SetCurrentDebugLocation(DIL); 1054 } else 1055 B.SetCurrentDebugLocation(DebugLoc()); 1056 } 1057 1058 /// Write a record \p DebugMsg about vectorization failure to the debug 1059 /// output stream. If \p I is passed, it is an instruction that prevents 1060 /// vectorization. 1061 #ifndef NDEBUG 1062 static void debugVectorizationFailure(const StringRef DebugMsg, 1063 Instruction *I) { 1064 dbgs() << "LV: Not vectorizing: " << DebugMsg; 1065 if (I != nullptr) 1066 dbgs() << " " << *I; 1067 else 1068 dbgs() << '.'; 1069 dbgs() << '\n'; 1070 } 1071 #endif 1072 1073 /// Create an analysis remark that explains why vectorization failed 1074 /// 1075 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1076 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1077 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 1078 /// the location of the remark. \return the remark object that can be 1079 /// streamed to. 1080 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1081 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1082 Value *CodeRegion = TheLoop->getHeader(); 1083 DebugLoc DL = TheLoop->getStartLoc(); 1084 1085 if (I) { 1086 CodeRegion = I->getParent(); 1087 // If there is no debug location attached to the instruction, revert back to 1088 // using the loop's. 1089 if (I->getDebugLoc()) 1090 DL = I->getDebugLoc(); 1091 } 1092 1093 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); 1094 R << "loop not vectorized: "; 1095 return R; 1096 } 1097 1098 /// Return a value for Step multiplied by VF. 
1099 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1100 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1101 Constant *StepVal = ConstantInt::get( 1102 Step->getType(), 1103 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1104 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1105 } 1106 1107 namespace llvm { 1108 1109 /// Return the runtime value for VF. 1110 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { 1111 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1112 return VF.isScalable() ? B.CreateVScale(EC) : EC; 1113 } 1114 1115 void reportVectorizationFailure(const StringRef DebugMsg, 1116 const StringRef OREMsg, const StringRef ORETag, 1117 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { 1118 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); 1119 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1120 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), 1121 ORETag, TheLoop, I) << OREMsg); 1122 } 1123 1124 } // end namespace llvm 1125 1126 #ifndef NDEBUG 1127 /// \return string containing a file name and a line # for the given loop. 1128 static std::string getDebugLocString(const Loop *L) { 1129 std::string Result; 1130 if (L) { 1131 raw_string_ostream OS(Result); 1132 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1133 LoopDbgLoc.print(OS); 1134 else 1135 // Just print the module name. 1136 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1137 OS.flush(); 1138 } 1139 return Result; 1140 } 1141 #endif 1142 1143 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1144 const Instruction *Orig) { 1145 // If the loop was versioned with memchecks, add the corresponding no-alias 1146 // metadata. 1147 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1148 LVer->annotateInstWithNoAlias(To, Orig); 1149 } 1150 1151 void InnerLoopVectorizer::addMetadata(Instruction *To, 1152 Instruction *From) { 1153 propagateMetadata(To, From); 1154 addNewMetadata(To, From); 1155 } 1156 1157 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1158 Instruction *From) { 1159 for (Value *V : To) { 1160 if (Instruction *I = dyn_cast<Instruction>(V)) 1161 addMetadata(I, From); 1162 } 1163 } 1164 1165 namespace llvm { 1166 1167 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1168 // lowered. 1169 enum ScalarEpilogueLowering { 1170 1171 // The default: allowing scalar epilogues. 1172 CM_ScalarEpilogueAllowed, 1173 1174 // Vectorization with OptForSize: don't allow epilogues. 1175 CM_ScalarEpilogueNotAllowedOptSize, 1176 1177 // A special case of vectorisation with OptForSize: loops with a very small 1178 // trip count are considered for vectorization under OptForSize, thereby 1179 // making sure the cost of their loop body is dominant, free of runtime 1180 // guards and scalar iteration overheads. 1181 CM_ScalarEpilogueNotAllowedLowTripLoop, 1182 1183 // Loop hint predicate indicating an epilogue is undesired. 1184 CM_ScalarEpilogueNotNeededUsePredicate, 1185 1186 // Directive indicating we must either tail fold or not vectorize 1187 CM_ScalarEpilogueNotAllowedUsePredicate 1188 }; 1189 1190 /// LoopVectorizationCostModel - estimates the expected speedups due to 1191 /// vectorization. 1192 /// In many cases vectorization is not profitable. This can happen because of 1193 /// a number of reasons. 
In this class we mainly attempt to predict the 1194 /// expected speedup/slowdowns due to the supported instruction set. We use the 1195 /// TargetTransformInfo to query the different backends for the cost of 1196 /// different operations. 1197 class LoopVectorizationCostModel { 1198 public: 1199 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1200 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1201 LoopVectorizationLegality *Legal, 1202 const TargetTransformInfo &TTI, 1203 const TargetLibraryInfo *TLI, DemandedBits *DB, 1204 AssumptionCache *AC, 1205 OptimizationRemarkEmitter *ORE, const Function *F, 1206 const LoopVectorizeHints *Hints, 1207 InterleavedAccessInfo &IAI) 1208 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1209 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1210 Hints(Hints), InterleaveInfo(IAI) {} 1211 1212 /// \return An upper bound for the vectorization factor, or None if 1213 /// vectorization and interleaving should be avoided up front. 1214 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); 1215 1216 /// \return True if runtime checks are required for vectorization, and false 1217 /// otherwise. 1218 bool runtimeChecksRequired(); 1219 1220 /// \return The most profitable vectorization factor and the cost of that VF. 1221 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO 1222 /// then this vectorization factor will be selected if vectorization is 1223 /// possible. 1224 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); 1225 VectorizationFactor 1226 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1227 const LoopVectorizationPlanner &LVP); 1228 1229 /// Setup cost-based decisions for user vectorization factor. 1230 void selectUserVectorizationFactor(ElementCount UserVF) { 1231 collectUniformsAndScalars(UserVF); 1232 collectInstsToScalarize(UserVF); 1233 } 1234 1235 /// \return The size (in bits) of the smallest and widest types in the code 1236 /// that needs to be vectorized. We ignore values that remain scalar such as 1237 /// 64 bit loop indices. 1238 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1239 1240 /// \return The desired interleave count. 1241 /// If interleave count has been specified by metadata it will be returned. 1242 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1243 /// are the selected vectorization factor and the cost of the selected VF. 1244 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1245 1246 /// Memory access instruction may be vectorized in more than one way. 1247 /// Form of instruction after vectorization depends on cost. 1248 /// This function takes cost-based decisions for Load/Store instructions 1249 /// and collects them in a map. This decisions map is used for building 1250 /// the lists of loop-uniform and loop-scalar instructions. 1251 /// The calculated cost is saved with widening decision in order to 1252 /// avoid redundant calculations. 1253 void setCostBasedWideningDecision(ElementCount VF); 1254 1255 /// A struct that represents some properties of the register usage 1256 /// of a loop. 1257 struct RegisterUsage { 1258 /// Holds the number of loop invariant values that are used in the loop. 1259 /// The key is ClassID of target-provided register class. 1260 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1261 /// Holds the maximum number of concurrent live intervals in the loop. 
1262 /// The key is ClassID of target-provided register class. 1263 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1264 }; 1265 1266 /// \return Returns information about the register usages of the loop for the 1267 /// given vectorization factors. 1268 SmallVector<RegisterUsage, 8> 1269 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1270 1271 /// Collect values we want to ignore in the cost model. 1272 void collectValuesToIgnore(); 1273 1274 /// Split reductions into those that happen in the loop, and those that happen 1275 /// outside. In loop reductions are collected into InLoopReductionChains. 1276 void collectInLoopReductions(); 1277 1278 /// \returns The smallest bitwidth each instruction can be represented with. 1279 /// The vector equivalents of these instructions should be truncated to this 1280 /// type. 1281 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1282 return MinBWs; 1283 } 1284 1285 /// \returns True if it is more profitable to scalarize instruction \p I for 1286 /// vectorization factor \p VF. 1287 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1288 assert(VF.isVector() && 1289 "Profitable to scalarize relevant only for VF > 1."); 1290 1291 // Cost model is not run in the VPlan-native path - return conservative 1292 // result until this changes. 1293 if (EnableVPlanNativePath) 1294 return false; 1295 1296 auto Scalars = InstsToScalarize.find(VF); 1297 assert(Scalars != InstsToScalarize.end() && 1298 "VF not yet analyzed for scalarization profitability"); 1299 return Scalars->second.find(I) != Scalars->second.end(); 1300 } 1301 1302 /// Returns true if \p I is known to be uniform after vectorization. 1303 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1304 if (VF.isScalar()) 1305 return true; 1306 1307 // Cost model is not run in the VPlan-native path - return conservative 1308 // result until this changes. 1309 if (EnableVPlanNativePath) 1310 return false; 1311 1312 auto UniformsPerVF = Uniforms.find(VF); 1313 assert(UniformsPerVF != Uniforms.end() && 1314 "VF not yet analyzed for uniformity"); 1315 return UniformsPerVF->second.count(I); 1316 } 1317 1318 /// Returns true if \p I is known to be scalar after vectorization. 1319 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1320 if (VF.isScalar()) 1321 return true; 1322 1323 // Cost model is not run in the VPlan-native path - return conservative 1324 // result until this changes. 1325 if (EnableVPlanNativePath) 1326 return false; 1327 1328 auto ScalarsPerVF = Scalars.find(VF); 1329 assert(ScalarsPerVF != Scalars.end() && 1330 "Scalar values are not calculated for VF"); 1331 return ScalarsPerVF->second.count(I); 1332 } 1333 1334 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1335 /// for vectorization factor \p VF. 1336 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1337 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1338 !isProfitableToScalarize(I, VF) && 1339 !isScalarAfterVectorization(I, VF); 1340 } 1341 1342 /// Decision that was taken during cost calculation for memory instruction. 1343 enum InstWidening { 1344 CM_Unknown, 1345 CM_Widen, // For consecutive accesses with stride +1. 1346 CM_Widen_Reverse, // For consecutive accesses with stride -1. 
1347 CM_Interleave,
1348 CM_GatherScatter,
1349 CM_Scalarize
1350 };
1351
1352 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1353 /// instruction \p I and vector width \p VF.
1354 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1355 InstructionCost Cost) {
1356 assert(VF.isVector() && "Expected VF >=2");
1357 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1358 }
1359
1360 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1361 /// interleaving group \p Grp and vector width \p VF.
1362 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1363 ElementCount VF, InstWidening W,
1364 InstructionCost Cost) {
1365 assert(VF.isVector() && "Expected VF >=2");
1366 // Broadcast this decision to all instructions inside the group.
1367 // But the cost will be assigned to one instruction only.
1368 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1369 if (auto *I = Grp->getMember(i)) {
1370 if (Grp->getInsertPos() == I)
1371 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1372 else
1373 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1374 }
1375 }
1376 }
1377
1378 /// Return the cost model decision for the given instruction \p I and vector
1379 /// width \p VF. Return CM_Unknown if this instruction did not pass
1380 /// through the cost modeling.
1381 InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1382 assert(VF.isVector() && "Expected VF to be a vector VF");
1383 // Cost model is not run in the VPlan-native path - return conservative
1384 // result until this changes.
1385 if (EnableVPlanNativePath)
1386 return CM_GatherScatter;
1387
1388 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1389 auto Itr = WideningDecisions.find(InstOnVF);
1390 if (Itr == WideningDecisions.end())
1391 return CM_Unknown;
1392 return Itr->second.first;
1393 }
1394
1395 /// Return the vectorization cost for the given instruction \p I and vector
1396 /// width \p VF.
1397 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1398 assert(VF.isVector() && "Expected VF >=2");
1399 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1400 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1401 "The cost is not calculated");
1402 return WideningDecisions[InstOnVF].second;
1403 }
1404
1405 /// Return True if instruction \p I is an optimizable truncate whose operand
1406 /// is an induction variable. Such a truncate will be removed by adding a new
1407 /// induction variable with the destination type.
1408 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1409 // If the instruction is not a truncate, return false.
1410 auto *Trunc = dyn_cast<TruncInst>(I);
1411 if (!Trunc)
1412 return false;
1413
1414 // Get the source and destination types of the truncate.
1415 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1416 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1417
1418 // If the truncate is free for the given types, return false. Replacing a
1419 // free truncate with an induction variable would add an induction variable
1420 // update instruction to each iteration of the loop. We exclude from this
1421 // check the primary induction variable since it will need an update
1422 // instruction regardless.
1423 Value *Op = Trunc->getOperand(0);
1424 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1425 return false;
1426
1427 // If the truncated value is not an induction variable, return false.
1428 return Legal->isInductionPhi(Op);
1429 }
1430
1431 /// Collects the instructions to scalarize for each predicated instruction in
1432 /// the loop.
1433 void collectInstsToScalarize(ElementCount VF);
1434
1435 /// Collect Uniform and Scalar values for the given \p VF.
1436 /// The sets depend on CM decision for Load/Store instructions
1437 /// that may be vectorized as interleave, gather-scatter or scalarized.
1438 void collectUniformsAndScalars(ElementCount VF) {
1439 // Do the analysis once.
1440 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1441 return;
1442 setCostBasedWideningDecision(VF);
1443 collectLoopUniforms(VF);
1444 collectLoopScalars(VF);
1445 }
1446
1447 /// Returns true if the target machine supports masked store operation
1448 /// for the given \p DataType and kind of access to \p Ptr.
1449 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1450 return Legal->isConsecutivePtr(Ptr) &&
1451 TTI.isLegalMaskedStore(DataType, Alignment);
1452 }
1453
1454 /// Returns true if the target machine supports masked load operation
1455 /// for the given \p DataType and kind of access to \p Ptr.
1456 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1457 return Legal->isConsecutivePtr(Ptr) &&
1458 TTI.isLegalMaskedLoad(DataType, Alignment);
1459 }
1460
1461 /// Returns true if the target machine supports masked scatter operation
1462 /// for the given \p DataType.
1463 bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1464 return TTI.isLegalMaskedScatter(DataType, Alignment);
1465 }
1466
1467 /// Returns true if the target machine supports masked gather operation
1468 /// for the given \p DataType.
1469 bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1470 return TTI.isLegalMaskedGather(DataType, Alignment);
1471 }
1472
1473 /// Returns true if the target machine can represent \p V as a masked gather
1474 /// or scatter operation.
1475 bool isLegalGatherOrScatter(Value *V) {
1476 bool LI = isa<LoadInst>(V);
1477 bool SI = isa<StoreInst>(V);
1478 if (!LI && !SI)
1479 return false;
1480 auto *Ty = getMemInstValueType(V);
1481 Align Align = getLoadStoreAlignment(V);
1482 return (LI && isLegalMaskedGather(Ty, Align)) ||
1483 (SI && isLegalMaskedScatter(Ty, Align));
1484 }
1485
1486 /// Returns true if the target machine supports all of the reduction
1487 /// variables found for the given VF.
1488 bool canVectorizeReductions(ElementCount VF) {
1489 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1490 RecurrenceDescriptor RdxDesc = Reduction.second;
1491 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1492 }));
1493 }
1494
1495 /// Returns true if \p I is an instruction that will be scalarized with
1496 /// predication. Such instructions include conditional stores and
1497 /// instructions that may divide by zero.
1498 /// If a non-zero VF has been calculated, we check if I will be scalarized
1499 /// with predication for that VF.
1500 bool isScalarWithPredication(Instruction *I,
1501 ElementCount VF = ElementCount::getFixed(1));
1502
1503 // Returns true if \p I is an instruction that will be predicated either
1504 // through scalar predication or masked load/store or masked gather/scatter.
1505 // Superset of instructions that return true for isScalarWithPredication.
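// For example (an illustrative note, not an exhaustive list): a store that only
// executes under a condition inside the loop body, or a division whose divisor
// may be zero, is scalarized and predicated; a load or store that needs a mask,
// e.g. when the tail is folded by masking, is also considered predicated here.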
1506 bool isPredicatedInst(Instruction *I) { 1507 if (!blockNeedsPredication(I->getParent())) 1508 return false; 1509 // Loads and stores that need some form of masked operation are predicated 1510 // instructions. 1511 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1512 return Legal->isMaskRequired(I); 1513 return isScalarWithPredication(I); 1514 } 1515 1516 /// Returns true if \p I is a memory instruction with consecutive memory 1517 /// access that can be widened. 1518 bool 1519 memoryInstructionCanBeWidened(Instruction *I, 1520 ElementCount VF = ElementCount::getFixed(1)); 1521 1522 /// Returns true if \p I is a memory instruction in an interleaved-group 1523 /// of memory accesses that can be vectorized with wide vector loads/stores 1524 /// and shuffles. 1525 bool 1526 interleavedAccessCanBeWidened(Instruction *I, 1527 ElementCount VF = ElementCount::getFixed(1)); 1528 1529 /// Check if \p Instr belongs to any interleaved access group. 1530 bool isAccessInterleaved(Instruction *Instr) { 1531 return InterleaveInfo.isInterleaved(Instr); 1532 } 1533 1534 /// Get the interleaved access group that \p Instr belongs to. 1535 const InterleaveGroup<Instruction> * 1536 getInterleavedAccessGroup(Instruction *Instr) { 1537 return InterleaveInfo.getInterleaveGroup(Instr); 1538 } 1539 1540 /// Returns true if we're required to use a scalar epilogue for at least 1541 /// the final iteration of the original loop. 1542 bool requiresScalarEpilogue() const { 1543 if (!isScalarEpilogueAllowed()) 1544 return false; 1545 // If we might exit from anywhere but the latch, must run the exiting 1546 // iteration in scalar form. 1547 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1548 return true; 1549 return InterleaveInfo.requiresScalarEpilogue(); 1550 } 1551 1552 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1553 /// loop hint annotation. 1554 bool isScalarEpilogueAllowed() const { 1555 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1556 } 1557 1558 /// Returns true if all loop blocks should be masked to fold tail loop. 1559 bool foldTailByMasking() const { return FoldTailByMasking; } 1560 1561 bool blockNeedsPredication(BasicBlock *BB) { 1562 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1563 } 1564 1565 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1566 /// nodes to the chain of instructions representing the reductions. Uses a 1567 /// MapVector to ensure deterministic iteration order. 1568 using ReductionChainMap = 1569 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1570 1571 /// Return the chain of instructions representing an inloop reduction. 1572 const ReductionChainMap &getInLoopReductionChains() const { 1573 return InLoopReductionChains; 1574 } 1575 1576 /// Returns true if the Phi is part of an inloop reduction. 1577 bool isInLoopReduction(PHINode *Phi) const { 1578 return InLoopReductionChains.count(Phi); 1579 } 1580 1581 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1582 /// with factor VF. Return the cost of the instruction, including 1583 /// scalarization overhead if it's needed. 1584 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1585 1586 /// Estimate cost of a call instruction CI if it were vectorized with factor 1587 /// VF. Return the cost of the instruction, including scalarization overhead 1588 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1589 /// scalarized - 1590 /// i.e. 
either vector version isn't available, or is too expensive.
1591 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1592 bool &NeedToScalarize);
1593
1594 /// Invalidates decisions already taken by the cost model.
1595 void invalidateCostModelingDecisions() {
1596 WideningDecisions.clear();
1597 Uniforms.clear();
1598 Scalars.clear();
1599 }
1600
1601 private:
1602 unsigned NumPredStores = 0;
1603
1604 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1605 /// than zero. One is returned if vectorization should best be avoided due
1606 /// to cost.
1607 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1608 ElementCount UserVF);
1609
1610 /// The vectorization cost is a combination of the cost itself and a boolean
1611 /// indicating whether any of the contributing operations will actually
1612 /// operate on vector values after type legalization in the backend. If this
1613 /// latter value is false, then all operations will be scalarized (i.e. no
1614 /// vectorization has actually taken place).
1615
1616
1617 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1618
1619 /// Returns the expected execution cost. The unit of the cost does
1620 /// not matter because we use the 'cost' units to compare different
1621 /// vector widths. The cost that is returned is *not* normalized by
1622 /// the factor width.
1623 VectorizationCostTy expectedCost(ElementCount VF);
1624
1625 /// Returns the execution time cost of an instruction for a given vector
1626 /// width. Vector width of one means scalar.
1627 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1628
1629 /// The cost-computation logic from getInstructionCost which provides
1630 /// the vector type as an output parameter.
1631 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1632 Type *&VectorTy);
1633
1634 /// Return the cost of instructions in an inloop reduction pattern, if I is
1635 /// part of that pattern.
1636 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1637 Type *VectorTy,
1638 TTI::TargetCostKind CostKind);
1639
1640 /// Calculate vectorization cost of memory instruction \p I.
1641 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1642
1643 /// The cost computation for scalarized memory instruction.
1644 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1645
1646 /// The cost computation for interleaving group of memory instructions.
1647 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1648
1649 /// The cost computation for Gather/Scatter instruction.
1650 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1651
1652 /// The cost computation for widening instruction \p I with consecutive
1653 /// memory access.
1654 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1655
1656 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1657 /// Load: scalar load + broadcast.
1658 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1659 /// element)
1660 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1661
1662 /// Estimate the overhead of scalarizing an instruction. This is a
1663 /// convenience wrapper for the type-based getScalarizationOverhead API.
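/// (Roughly speaking, this covers the insert/extract element traffic needed to
/// move operand and result values between vector and scalar form when \p I is
/// replicated per lane; see the type-based TTI hook for the exact definition.)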
1664 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
1665
1666 /// Returns whether the instruction is a load or store and will be emitted
1667 /// as a vector operation.
1668 bool isConsecutiveLoadOrStore(Instruction *I);
1669
1670 /// Returns true if an artificially high cost for emulated masked memrefs
1671 /// should be used.
1672 bool useEmulatedMaskMemRefHack(Instruction *I);
1673
1674 /// Map of scalar integer values to the smallest bitwidth they can be legally
1675 /// represented as. The vector equivalents of these values should be truncated
1676 /// to this type.
1677 MapVector<Instruction *, uint64_t> MinBWs;
1678
1679 /// A type representing the costs for instructions if they were to be
1680 /// scalarized rather than vectorized. The entries are Instruction-Cost
1681 /// pairs.
1682 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1683
1684 /// A set containing all BasicBlocks that are known to be present after
1685 /// vectorization as predicated blocks.
1686 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1687
1688 /// Records whether it is allowed to have the original scalar loop execute at
1689 /// least once. This may be needed as a fallback loop in case runtime
1690 /// aliasing/dependence checks fail, or to handle the tail/remainder
1691 /// iterations when the trip count is unknown or doesn't divide by the VF,
1692 /// or as a peel-loop to handle gaps in interleave-groups.
1693 /// Under optsize and when the trip count is very small we don't allow any
1694 /// iterations to execute in the scalar loop.
1695 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1696
1697 /// All blocks of the loop are to be masked to fold the tail of the scalar
1698 /// iterations.
1699 bool FoldTailByMasking = false;
1700 /// A map holding scalar costs for different vectorization factors. The
1701 /// presence of a cost for an instruction in the mapping indicates that the
1702 /// instruction will be scalarized when vectorizing with the associated
1703 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1704 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1705
1706 /// Holds the instructions known to be uniform after vectorization.
1707 /// The data is collected per VF.
1708 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1709
1710 /// Holds the instructions known to be scalar after vectorization.
1711 /// The data is collected per VF.
1712 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1713
1714 /// Holds the instructions (address computations) that are forced to be
1715 /// scalarized.
1716 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1717
1718 /// PHINodes of the reductions that should be expanded in-loop along with
1719 /// their associated chains of reduction operations, in program order from top
1720 /// (PHI) to bottom.
1721 ReductionChainMap InLoopReductionChains;
1722
1723 /// A Map of inloop reduction operations and their immediate chain operand.
1724 /// FIXME: This can be removed once reductions can be costed correctly in
1725 /// vplan. This was added to allow quick lookup to the inloop operations,
1726 /// without having to loop through InLoopReductionChains.
1727 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1728
1729 /// Returns the expected difference in cost from scalarizing the expression
1730 /// feeding a predicated instruction \p PredInst.
The instructions to 1731 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1732 /// non-negative return value implies the expression will be scalarized. 1733 /// Currently, only single-use chains are considered for scalarization. 1734 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1735 ElementCount VF); 1736 1737 /// Collect the instructions that are uniform after vectorization. An 1738 /// instruction is uniform if we represent it with a single scalar value in 1739 /// the vectorized loop corresponding to each vector iteration. Examples of 1740 /// uniform instructions include pointer operands of consecutive or 1741 /// interleaved memory accesses. Note that although uniformity implies an 1742 /// instruction will be scalar, the reverse is not true. In general, a 1743 /// scalarized instruction will be represented by VF scalar values in the 1744 /// vectorized loop, each corresponding to an iteration of the original 1745 /// scalar loop. 1746 void collectLoopUniforms(ElementCount VF); 1747 1748 /// Collect the instructions that are scalar after vectorization. An 1749 /// instruction is scalar if it is known to be uniform or will be scalarized 1750 /// during vectorization. Non-uniform scalarized instructions will be 1751 /// represented by VF values in the vectorized loop, each corresponding to an 1752 /// iteration of the original scalar loop. 1753 void collectLoopScalars(ElementCount VF); 1754 1755 /// Keeps cost model vectorization decision and cost for instructions. 1756 /// Right now it is used for memory instructions only. 1757 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1758 std::pair<InstWidening, InstructionCost>>; 1759 1760 DecisionList WideningDecisions; 1761 1762 /// Returns true if \p V is expected to be vectorized and it needs to be 1763 /// extracted. 1764 bool needsExtract(Value *V, ElementCount VF) const { 1765 Instruction *I = dyn_cast<Instruction>(V); 1766 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1767 TheLoop->isLoopInvariant(I)) 1768 return false; 1769 1770 // Assume we can vectorize V (and hence we need extraction) if the 1771 // scalars are not computed yet. This can happen, because it is called 1772 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1773 // the scalars are collected. That should be a safe assumption in most 1774 // cases, because we check if the operands have vectorizable types 1775 // beforehand in LoopVectorizationLegality. 1776 return Scalars.find(VF) == Scalars.end() || 1777 !isScalarAfterVectorization(I, VF); 1778 }; 1779 1780 /// Returns a range containing only operands needing to be extracted. 1781 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1782 ElementCount VF) { 1783 return SmallVector<Value *, 4>(make_filter_range( 1784 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1785 } 1786 1787 /// Determines if we have the infrastructure to vectorize loop \p L and its 1788 /// epilogue, assuming the main loop is vectorized by \p VF. 1789 bool isCandidateForEpilogueVectorization(const Loop &L, 1790 const ElementCount VF) const; 1791 1792 /// Returns true if epilogue vectorization is considered profitable, and 1793 /// false otherwise. 1794 /// \p VF is the vectorization factor chosen for the original loop. 1795 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1796 1797 public: 1798 /// The loop that we evaluate. 
1799 Loop *TheLoop;
1800
1801 /// Predicated scalar evolution analysis.
1802 PredicatedScalarEvolution &PSE;
1803
1804 /// Loop Info analysis.
1805 LoopInfo *LI;
1806
1807 /// Vectorization legality.
1808 LoopVectorizationLegality *Legal;
1809
1810 /// Vector target information.
1811 const TargetTransformInfo &TTI;
1812
1813 /// Target Library Info.
1814 const TargetLibraryInfo *TLI;
1815
1816 /// Demanded bits analysis.
1817 DemandedBits *DB;
1818
1819 /// Assumption cache.
1820 AssumptionCache *AC;
1821
1822 /// Interface to emit optimization remarks.
1823 OptimizationRemarkEmitter *ORE;
1824
1825 const Function *TheFunction;
1826
1827 /// Loop Vectorize Hint.
1828 const LoopVectorizeHints *Hints;
1829
1830 /// The interleave access information contains groups of interleaved accesses
1831 /// with the same stride and close to each other.
1832 InterleavedAccessInfo &InterleaveInfo;
1833
1834 /// Values to ignore in the cost model.
1835 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1836
1837 /// Values to ignore in the cost model when VF > 1.
1838 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1839
1840 /// Profitable vector factors.
1841 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1842 };
1843 } // end namespace llvm
1844
1845 /// Helper struct to manage generating runtime checks for vectorization.
1846 ///
1847 /// The runtime checks are created up-front in temporary blocks, un-linked from
1848 /// the existing IR, to allow better estimation of their cost. After deciding to
1849 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1850 /// temporary blocks are completely removed.
1851 class GeneratedRTChecks {
1852 /// Basic block which contains the generated SCEV checks, if any.
1853 BasicBlock *SCEVCheckBlock = nullptr;
1854
1855 /// The value representing the result of the generated SCEV checks. If it is
1856 /// nullptr, either no SCEV checks have been generated or they have been used.
1857 Value *SCEVCheckCond = nullptr;
1858
1859 /// Basic block which contains the generated memory runtime checks, if any.
1860 BasicBlock *MemCheckBlock = nullptr;
1861
1862 /// The value representing the result of the generated memory runtime checks.
1863 /// If it is nullptr, either no memory runtime checks have been generated or
1864 /// they have been used.
1865 Instruction *MemRuntimeCheckCond = nullptr;
1866
1867 DominatorTree *DT;
1868 LoopInfo *LI;
1869
1870 SCEVExpander SCEVExp;
1871 SCEVExpander MemCheckExp;
1872
1873 public:
1874 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1875 const DataLayout &DL)
1876 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1877 MemCheckExp(SE, DL, "scev.check") {}
1878
1879 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1880 /// accurately estimate the cost of the runtime checks. The blocks are
1881 /// un-linked from the IR and are added back during vector code generation. If
1882 /// there is no vector code generation, the check blocks are removed
1883 /// completely.
1884 void Create(Loop *L, const LoopAccessInfo &LAI,
1885 const SCEVUnionPredicate &UnionPred) {
1886
1887 BasicBlock *LoopHeader = L->getHeader();
1888 BasicBlock *Preheader = L->getLoopPreheader();
1889
1890 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1891 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1892 // may be used by SCEVExpander. The blocks will be un-linked from their
1893 // predecessors and removed from LI & DT at the end of the function.
1894 if (!UnionPred.isAlwaysTrue()) { 1895 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1896 nullptr, "vector.scevcheck"); 1897 1898 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1899 &UnionPred, SCEVCheckBlock->getTerminator()); 1900 } 1901 1902 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1903 if (RtPtrChecking.Need) { 1904 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1905 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1906 "vector.memcheck"); 1907 1908 std::tie(std::ignore, MemRuntimeCheckCond) = 1909 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1910 RtPtrChecking.getChecks(), MemCheckExp); 1911 assert(MemRuntimeCheckCond && 1912 "no RT checks generated although RtPtrChecking " 1913 "claimed checks are required"); 1914 } 1915 1916 if (!MemCheckBlock && !SCEVCheckBlock) 1917 return; 1918 1919 // Unhook the temporary block with the checks, update various places 1920 // accordingly. 1921 if (SCEVCheckBlock) 1922 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1923 if (MemCheckBlock) 1924 MemCheckBlock->replaceAllUsesWith(Preheader); 1925 1926 if (SCEVCheckBlock) { 1927 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1928 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1929 Preheader->getTerminator()->eraseFromParent(); 1930 } 1931 if (MemCheckBlock) { 1932 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1933 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1934 Preheader->getTerminator()->eraseFromParent(); 1935 } 1936 1937 DT->changeImmediateDominator(LoopHeader, Preheader); 1938 if (MemCheckBlock) { 1939 DT->eraseNode(MemCheckBlock); 1940 LI->removeBlock(MemCheckBlock); 1941 } 1942 if (SCEVCheckBlock) { 1943 DT->eraseNode(SCEVCheckBlock); 1944 LI->removeBlock(SCEVCheckBlock); 1945 } 1946 } 1947 1948 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1949 /// unused. 1950 ~GeneratedRTChecks() { 1951 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1952 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1953 if (!SCEVCheckCond) 1954 SCEVCleaner.markResultUsed(); 1955 1956 if (!MemRuntimeCheckCond) 1957 MemCheckCleaner.markResultUsed(); 1958 1959 if (MemRuntimeCheckCond) { 1960 auto &SE = *MemCheckExp.getSE(); 1961 // Memory runtime check generation creates compares that use expanded 1962 // values. Remove them before running the SCEVExpanderCleaners. 1963 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1964 if (MemCheckExp.isInsertedInstruction(&I)) 1965 continue; 1966 SE.forgetValue(&I); 1967 SE.eraseValueFromMap(&I); 1968 I.eraseFromParent(); 1969 } 1970 } 1971 MemCheckCleaner.cleanup(); 1972 SCEVCleaner.cleanup(); 1973 1974 if (SCEVCheckCond) 1975 SCEVCheckBlock->eraseFromParent(); 1976 if (MemRuntimeCheckCond) 1977 MemCheckBlock->eraseFromParent(); 1978 } 1979 1980 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1981 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1982 /// depending on the generated condition. 
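/// Returns the SCEV check block that was inserted, or nullptr when no SCEV
/// checks were generated or the check condition is known to never trigger the
/// bypass.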
1983 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 1984 BasicBlock *LoopVectorPreHeader, 1985 BasicBlock *LoopExitBlock) { 1986 if (!SCEVCheckCond) 1987 return nullptr; 1988 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 1989 if (C->isZero()) 1990 return nullptr; 1991 1992 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 1993 1994 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 1995 // Create new preheader for vector loop. 1996 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 1997 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 1998 1999 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2000 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2001 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2002 SCEVCheckBlock); 2003 2004 DT->addNewBlock(SCEVCheckBlock, Pred); 2005 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2006 2007 ReplaceInstWithInst( 2008 SCEVCheckBlock->getTerminator(), 2009 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2010 // Mark the check as used, to prevent it from being removed during cleanup. 2011 SCEVCheckCond = nullptr; 2012 return SCEVCheckBlock; 2013 } 2014 2015 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2016 /// the branches to branch to the vector preheader or \p Bypass, depending on 2017 /// the generated condition. 2018 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2019 BasicBlock *LoopVectorPreHeader) { 2020 // Check if we generated code that checks in runtime if arrays overlap. 2021 if (!MemRuntimeCheckCond) 2022 return nullptr; 2023 2024 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2025 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2026 MemCheckBlock); 2027 2028 DT->addNewBlock(MemCheckBlock, Pred); 2029 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2030 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2031 2032 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2033 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2034 2035 ReplaceInstWithInst( 2036 MemCheckBlock->getTerminator(), 2037 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2038 MemCheckBlock->getTerminator()->setDebugLoc( 2039 Pred->getTerminator()->getDebugLoc()); 2040 2041 // Mark the check as used, to prevent it from being removed during cleanup. 2042 MemRuntimeCheckCond = nullptr; 2043 return MemCheckBlock; 2044 } 2045 }; 2046 2047 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2048 // vectorization. The loop needs to be annotated with #pragma omp simd 2049 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2050 // vector length information is not provided, vectorization is not considered 2051 // explicit. Interleave hints are not allowed either. These limitations will be 2052 // relaxed in the future. 2053 // Please, note that we are currently forced to abuse the pragma 'clang 2054 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2055 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2056 // provides *explicit vectorization hints* (LV can bypass legal checks and 2057 // assume that vectorization is legal). However, both hints are implemented 2058 // using the same metadata (llvm.loop.vectorize, processed by 2059 // LoopVectorizeHints). This will be fixed in the future when the native IR 2060 // representation for pragma 'omp simd' is introduced. 
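// For example (illustrative only), an outer loop annotated with
//   #pragma omp simd simdlen(4)
// or
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// carries the required vector length and is treated as explicitly vectorized.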
2061 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2062 OptimizationRemarkEmitter *ORE) { 2063 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2064 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2065 2066 // Only outer loops with an explicit vectorization hint are supported. 2067 // Unannotated outer loops are ignored. 2068 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2069 return false; 2070 2071 Function *Fn = OuterLp->getHeader()->getParent(); 2072 if (!Hints.allowVectorization(Fn, OuterLp, 2073 true /*VectorizeOnlyWhenForced*/)) { 2074 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2075 return false; 2076 } 2077 2078 if (Hints.getInterleave() > 1) { 2079 // TODO: Interleave support is future work. 2080 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2081 "outer loops.\n"); 2082 Hints.emitRemarkWithHints(); 2083 return false; 2084 } 2085 2086 return true; 2087 } 2088 2089 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2090 OptimizationRemarkEmitter *ORE, 2091 SmallVectorImpl<Loop *> &V) { 2092 // Collect inner loops and outer loops without irreducible control flow. For 2093 // now, only collect outer loops that have explicit vectorization hints. If we 2094 // are stress testing the VPlan H-CFG construction, we collect the outermost 2095 // loop of every loop nest. 2096 if (L.isInnermost() || VPlanBuildStressTest || 2097 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2098 LoopBlocksRPO RPOT(&L); 2099 RPOT.perform(LI); 2100 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2101 V.push_back(&L); 2102 // TODO: Collect inner loops inside marked outer loops in case 2103 // vectorization fails for the outer loop. Do not invoke 2104 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2105 // already known to be reducible. We can use an inherited attribute for 2106 // that. 2107 return; 2108 } 2109 } 2110 for (Loop *InnerL : L) 2111 collectSupportedLoops(*InnerL, LI, ORE, V); 2112 } 2113 2114 namespace { 2115 2116 /// The LoopVectorize Pass. 2117 struct LoopVectorize : public FunctionPass { 2118 /// Pass identification, replacement for typeid 2119 static char ID; 2120 2121 LoopVectorizePass Impl; 2122 2123 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2124 bool VectorizeOnlyWhenForced = false) 2125 : FunctionPass(ID), 2126 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2127 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2128 } 2129 2130 bool runOnFunction(Function &F) override { 2131 if (skipFunction(F)) 2132 return false; 2133 2134 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2135 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2136 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2137 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2138 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2139 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2140 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2141 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2142 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2143 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2144 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2145 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2146 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2147 2148 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2149 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2150 2151 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2152 GetLAA, *ORE, PSI).MadeAnyChange; 2153 } 2154 2155 void getAnalysisUsage(AnalysisUsage &AU) const override { 2156 AU.addRequired<AssumptionCacheTracker>(); 2157 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2158 AU.addRequired<DominatorTreeWrapperPass>(); 2159 AU.addRequired<LoopInfoWrapperPass>(); 2160 AU.addRequired<ScalarEvolutionWrapperPass>(); 2161 AU.addRequired<TargetTransformInfoWrapperPass>(); 2162 AU.addRequired<AAResultsWrapperPass>(); 2163 AU.addRequired<LoopAccessLegacyAnalysis>(); 2164 AU.addRequired<DemandedBitsWrapperPass>(); 2165 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2166 AU.addRequired<InjectTLIMappingsLegacy>(); 2167 2168 // We currently do not preserve loopinfo/dominator analyses with outer loop 2169 // vectorization. Until this is addressed, mark these analyses as preserved 2170 // only for non-VPlan-native path. 2171 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2172 if (!EnableVPlanNativePath) { 2173 AU.addPreserved<LoopInfoWrapperPass>(); 2174 AU.addPreserved<DominatorTreeWrapperPass>(); 2175 } 2176 2177 AU.addPreserved<BasicAAWrapperPass>(); 2178 AU.addPreserved<GlobalsAAWrapperPass>(); 2179 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2180 } 2181 }; 2182 2183 } // end anonymous namespace 2184 2185 //===----------------------------------------------------------------------===// 2186 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2187 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2188 //===----------------------------------------------------------------------===// 2189 2190 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2191 // We need to place the broadcast of invariant variables outside the loop, 2192 // but only if it's proven safe to do so. Else, broadcast will be inside 2193 // vector loop body. 2194 Instruction *Instr = dyn_cast<Instruction>(V); 2195 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2196 (!Instr || 2197 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2198 // Place the code for broadcasting invariant variables in the new preheader. 2199 IRBuilder<>::InsertPointGuard Guard(Builder); 2200 if (SafeToHoist) 2201 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2202 2203 // Broadcast the scalar into all locations in the vector. 
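// For example (illustrative), with VF = 4 a scalar %x becomes the vector
// <%x, %x, %x, %x>.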
2204 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2205 2206 return Shuf; 2207 } 2208 2209 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2210 const InductionDescriptor &II, Value *Step, Value *Start, 2211 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2212 VPTransformState &State) { 2213 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2214 "Expected either an induction phi-node or a truncate of it!"); 2215 2216 // Construct the initial value of the vector IV in the vector loop preheader 2217 auto CurrIP = Builder.saveIP(); 2218 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2219 if (isa<TruncInst>(EntryVal)) { 2220 assert(Start->getType()->isIntegerTy() && 2221 "Truncation requires an integer type"); 2222 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2223 Step = Builder.CreateTrunc(Step, TruncType); 2224 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2225 } 2226 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2227 Value *SteppedStart = 2228 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2229 2230 // We create vector phi nodes for both integer and floating-point induction 2231 // variables. Here, we determine the kind of arithmetic we will perform. 2232 Instruction::BinaryOps AddOp; 2233 Instruction::BinaryOps MulOp; 2234 if (Step->getType()->isIntegerTy()) { 2235 AddOp = Instruction::Add; 2236 MulOp = Instruction::Mul; 2237 } else { 2238 AddOp = II.getInductionOpcode(); 2239 MulOp = Instruction::FMul; 2240 } 2241 2242 // Multiply the vectorization factor by the step using integer or 2243 // floating-point arithmetic as appropriate. 2244 Value *ConstVF = 2245 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2246 Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF); 2247 2248 // Create a vector splat to use in the induction update. 2249 // 2250 // FIXME: If the step is non-constant, we create the vector splat with 2251 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2252 // handle a constant vector splat. 2253 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2254 Value *SplatVF = isa<Constant>(Mul) 2255 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2256 : Builder.CreateVectorSplat(VF, Mul); 2257 Builder.restoreIP(CurrIP); 2258 2259 // We may need to add the step a number of times, depending on the unroll 2260 // factor. The last of those goes into the PHI. 2261 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2262 &*LoopVectorBody->getFirstInsertionPt()); 2263 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2264 Instruction *LastInduction = VecInd; 2265 for (unsigned Part = 0; Part < UF; ++Part) { 2266 State.set(Def, LastInduction, Part); 2267 2268 if (isa<TruncInst>(EntryVal)) 2269 addMetadata(LastInduction, EntryVal); 2270 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2271 State, Part); 2272 2273 LastInduction = cast<Instruction>( 2274 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2275 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2276 } 2277 2278 // Move the last step to the end of the latch block. This ensures consistent 2279 // placement of all induction updates. 
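// Concretely (a note, not from the original source): the final "step.add" is
// moved to sit just before the compare that controls the latch branch.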
2280 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2281 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2282 auto *ICmp = cast<Instruction>(Br->getCondition()); 2283 LastInduction->moveBefore(ICmp); 2284 LastInduction->setName("vec.ind.next"); 2285 2286 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2287 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2288 } 2289 2290 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2291 return Cost->isScalarAfterVectorization(I, VF) || 2292 Cost->isProfitableToScalarize(I, VF); 2293 } 2294 2295 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2296 if (shouldScalarizeInstruction(IV)) 2297 return true; 2298 auto isScalarInst = [&](User *U) -> bool { 2299 auto *I = cast<Instruction>(U); 2300 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2301 }; 2302 return llvm::any_of(IV->users(), isScalarInst); 2303 } 2304 2305 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2306 const InductionDescriptor &ID, const Instruction *EntryVal, 2307 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2308 unsigned Part, unsigned Lane) { 2309 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2310 "Expected either an induction phi-node or a truncate of it!"); 2311 2312 // This induction variable is not the phi from the original loop but the 2313 // newly-created IV based on the proof that casted Phi is equal to the 2314 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2315 // re-uses the same InductionDescriptor that original IV uses but we don't 2316 // have to do any recording in this case - that is done when original IV is 2317 // processed. 2318 if (isa<TruncInst>(EntryVal)) 2319 return; 2320 2321 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2322 if (Casts.empty()) 2323 return; 2324 // Only the first Cast instruction in the Casts vector is of interest. 2325 // The rest of the Casts (if exist) have no uses outside the 2326 // induction update chain itself. 2327 if (Lane < UINT_MAX) 2328 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2329 else 2330 State.set(CastDef, VectorLoopVal, Part); 2331 } 2332 2333 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2334 TruncInst *Trunc, VPValue *Def, 2335 VPValue *CastDef, 2336 VPTransformState &State) { 2337 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2338 "Primary induction variable must have an integer type"); 2339 2340 auto II = Legal->getInductionVars().find(IV); 2341 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2342 2343 auto ID = II->second; 2344 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2345 2346 // The value from the original loop to which we are mapping the new induction 2347 // variable. 2348 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2349 2350 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2351 2352 // Generate code for the induction step. 
Note that induction steps are 2353 // required to be loop-invariant 2354 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2355 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2356 "Induction step should be loop invariant"); 2357 if (PSE.getSE()->isSCEVable(IV->getType())) { 2358 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2359 return Exp.expandCodeFor(Step, Step->getType(), 2360 LoopVectorPreHeader->getTerminator()); 2361 } 2362 return cast<SCEVUnknown>(Step)->getValue(); 2363 }; 2364 2365 // The scalar value to broadcast. This is derived from the canonical 2366 // induction variable. If a truncation type is given, truncate the canonical 2367 // induction variable and step. Otherwise, derive these values from the 2368 // induction descriptor. 2369 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2370 Value *ScalarIV = Induction; 2371 if (IV != OldInduction) { 2372 ScalarIV = IV->getType()->isIntegerTy() 2373 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2374 : Builder.CreateCast(Instruction::SIToFP, Induction, 2375 IV->getType()); 2376 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2377 ScalarIV->setName("offset.idx"); 2378 } 2379 if (Trunc) { 2380 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2381 assert(Step->getType()->isIntegerTy() && 2382 "Truncation requires an integer step"); 2383 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2384 Step = Builder.CreateTrunc(Step, TruncType); 2385 } 2386 return ScalarIV; 2387 }; 2388 2389 // Create the vector values from the scalar IV, in the absence of creating a 2390 // vector IV. 2391 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2392 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2393 for (unsigned Part = 0; Part < UF; ++Part) { 2394 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2395 Value *EntryPart = 2396 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2397 ID.getInductionOpcode()); 2398 State.set(Def, EntryPart, Part); 2399 if (Trunc) 2400 addMetadata(EntryPart, Trunc); 2401 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2402 State, Part); 2403 } 2404 }; 2405 2406 // Fast-math-flags propagate from the original induction instruction. 2407 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2408 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2409 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2410 2411 // Now do the actual transformations, and start with creating the step value. 2412 Value *Step = CreateStepValue(ID.getStep()); 2413 if (VF.isZero() || VF.isScalar()) { 2414 Value *ScalarIV = CreateScalarIV(Step); 2415 CreateSplatIV(ScalarIV, Step); 2416 return; 2417 } 2418 2419 // Determine if we want a scalar version of the induction variable. This is 2420 // true if the induction variable itself is not widened, or if it has at 2421 // least one user in the loop that is not widened. 2422 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2423 if (!NeedsScalarIV) { 2424 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2425 State); 2426 return; 2427 } 2428 2429 // Try to create a new independent vector induction variable. If we can't 2430 // create the phi node, we will splat the scalar induction variable in each 2431 // loop iteration. 
2432 if (!shouldScalarizeInstruction(EntryVal)) { 2433 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2434 State); 2435 Value *ScalarIV = CreateScalarIV(Step); 2436 // Create scalar steps that can be used by instructions we will later 2437 // scalarize. Note that the addition of the scalar steps will not increase 2438 // the number of instructions in the loop in the common case prior to 2439 // InstCombine. We will be trading one vector extract for each scalar step. 2440 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2441 return; 2442 } 2443 2444 // All IV users are scalar instructions, so only emit a scalar IV, not a 2445 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2446 // predicate used by the masked loads/stores. 2447 Value *ScalarIV = CreateScalarIV(Step); 2448 if (!Cost->isScalarEpilogueAllowed()) 2449 CreateSplatIV(ScalarIV, Step); 2450 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2451 } 2452 2453 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2454 Instruction::BinaryOps BinOp) { 2455 // Create and check the types. 2456 assert(isa<FixedVectorType>(Val->getType()) && 2457 "Creation of scalable step vector not yet supported"); 2458 auto *ValVTy = cast<VectorType>(Val->getType()); 2459 ElementCount VLen = ValVTy->getElementCount(); 2460 2461 Type *STy = Val->getType()->getScalarType(); 2462 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2463 "Induction Step must be an integer or FP"); 2464 assert(Step->getType() == STy && "Step has wrong type"); 2465 2466 SmallVector<Constant *, 8> Indices; 2467 2468 // Create a vector of consecutive numbers from zero to VF. 2469 VectorType *InitVecValVTy = ValVTy; 2470 Type *InitVecValSTy = STy; 2471 if (STy->isFloatingPointTy()) { 2472 InitVecValSTy = 2473 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2474 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2475 } 2476 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2477 2478 // Add on StartIdx 2479 Value *StartIdxSplat = Builder.CreateVectorSplat( 2480 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2481 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2482 2483 if (STy->isIntegerTy()) { 2484 Step = Builder.CreateVectorSplat(VLen, Step); 2485 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2486 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2487 // which can be found from the original scalar operations. 2488 Step = Builder.CreateMul(InitVec, Step); 2489 return Builder.CreateAdd(Val, Step, "induction"); 2490 } 2491 2492 // Floating point induction. 2493 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2494 "Binary Opcode should be specified for FP induction"); 2495 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2496 Step = Builder.CreateVectorSplat(VLen, Step); 2497 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2498 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2499 } 2500 2501 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2502 Instruction *EntryVal, 2503 const InductionDescriptor &ID, 2504 VPValue *Def, VPValue *CastDef, 2505 VPTransformState &State) { 2506 // We shouldn't have to build scalar steps if we aren't vectorizing. 2507 assert(VF.isVector() && "VF should be greater than one"); 2508 // Get the value type and ensure it and the step have the same integer type. 
2509 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2510 assert(ScalarIVTy == Step->getType() && 2511 "Val and Step should have the same type"); 2512 2513 // We build scalar steps for both integer and floating-point induction 2514 // variables. Here, we determine the kind of arithmetic we will perform. 2515 Instruction::BinaryOps AddOp; 2516 Instruction::BinaryOps MulOp; 2517 if (ScalarIVTy->isIntegerTy()) { 2518 AddOp = Instruction::Add; 2519 MulOp = Instruction::Mul; 2520 } else { 2521 AddOp = ID.getInductionOpcode(); 2522 MulOp = Instruction::FMul; 2523 } 2524 2525 // Determine the number of scalars we need to generate for each unroll 2526 // iteration. If EntryVal is uniform, we only need to generate the first 2527 // lane. Otherwise, we generate all VF values. 2528 unsigned Lanes = 2529 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2530 ? 1 2531 : VF.getKnownMinValue(); 2532 assert((!VF.isScalable() || Lanes == 1) && 2533 "Should never scalarize a scalable vector"); 2534 // Compute the scalar steps and save the results in State. 2535 for (unsigned Part = 0; Part < UF; ++Part) { 2536 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2537 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2538 ScalarIVTy->getScalarSizeInBits()); 2539 Value *StartIdx = 2540 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2541 if (ScalarIVTy->isFloatingPointTy()) 2542 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2543 StartIdx = Builder.CreateBinOp( 2544 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2545 // The step returned by `createStepForVF` is a runtime-evaluated value 2546 // when VF is scalable. Otherwise, it should be folded into a Constant. 2547 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2548 "Expected StartIdx to be folded to a constant when VF is not " 2549 "scalable"); 2550 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2551 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2552 State.set(Def, Add, VPIteration(Part, Lane)); 2553 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2554 Part, Lane); 2555 } 2556 } 2557 } 2558 2559 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2560 const VPIteration &Instance, 2561 VPTransformState &State) { 2562 Value *ScalarInst = State.get(Def, Instance); 2563 Value *VectorValue = State.get(Def, Instance.Part); 2564 VectorValue = Builder.CreateInsertElement( 2565 VectorValue, ScalarInst, 2566 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2567 State.set(Def, VectorValue, Instance.Part); 2568 } 2569 2570 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2571 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2572 return Builder.CreateVectorReverse(Vec, "reverse"); 2573 } 2574 2575 // Return whether we allow using masked interleave-groups (for dealing with 2576 // strided loads/stores that reside in predicated blocks, or for dealing 2577 // with gaps). 2578 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2579 // If an override option has been passed in for interleaved accesses, use it. 2580 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2581 return EnableMaskedInterleavedMemAccesses; 2582 2583 return TTI.enableMaskedInterleavedAccessVectorization(); 2584 } 2585 2586 // Try to vectorize the interleave group that \p Instr belongs to. 2587 // 2588 // E.g. 
Translate following interleaved load group (factor = 3): 2589 // for (i = 0; i < N; i+=3) { 2590 // R = Pic[i]; // Member of index 0 2591 // G = Pic[i+1]; // Member of index 1 2592 // B = Pic[i+2]; // Member of index 2 2593 // ... // do something to R, G, B 2594 // } 2595 // To: 2596 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2597 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2598 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2599 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2600 // 2601 // Or translate following interleaved store group (factor = 3): 2602 // for (i = 0; i < N; i+=3) { 2603 // ... do something to R, G, B 2604 // Pic[i] = R; // Member of index 0 2605 // Pic[i+1] = G; // Member of index 1 2606 // Pic[i+2] = B; // Member of index 2 2607 // } 2608 // To: 2609 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2610 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2611 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2612 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2613 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2614 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2615 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2616 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2617 VPValue *BlockInMask) { 2618 Instruction *Instr = Group->getInsertPos(); 2619 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2620 2621 // Prepare for the vector type of the interleaved load/store. 2622 Type *ScalarTy = getMemInstValueType(Instr); 2623 unsigned InterleaveFactor = Group->getFactor(); 2624 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2625 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2626 2627 // Prepare for the new pointers. 2628 SmallVector<Value *, 2> AddrParts; 2629 unsigned Index = Group->getIndex(Instr); 2630 2631 // TODO: extend the masked interleaved-group support to reversed access. 2632 assert((!BlockInMask || !Group->isReverse()) && 2633 "Reversed masked interleave-group not supported."); 2634 2635 // If the group is reverse, adjust the index to refer to the last vector lane 2636 // instead of the first. We adjust the index from the first vector lane, 2637 // rather than directly getting the pointer for lane VF - 1, because the 2638 // pointer operand of the interleaved access is supposed to be uniform. For 2639 // uniform instructions, we're only required to generate a value for the 2640 // first vector lane in each unroll iteration. 2641 assert(!VF.isScalable() && 2642 "scalable vector reverse operation is not implemented"); 2643 if (Group->isReverse()) 2644 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2645 2646 for (unsigned Part = 0; Part < UF; Part++) { 2647 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2648 setDebugLocFromInst(Builder, AddrPart); 2649 2650 // Notice current instruction could be any index. Need to adjust the address 2651 // to the member of index 0. 2652 // 2653 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2654 // b = A[i]; // Member of index 0 2655 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2656 // 2657 // E.g. A[i+1] = a; // Member of index 1 2658 // A[i] = b; // Member of index 0 2659 // A[i+2] = c; // Member of index 2 (Current instruction) 2660 // Current pointer is pointed to A[i+2], adjust it to A[i]. 
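// For example (illustrative), when the insert position is the member at
// index 2, the GEP below applies an offset of -2 elements so the address
// points at the member of index 0.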
2661 2662 bool InBounds = false; 2663 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2664 InBounds = gep->isInBounds(); 2665 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2666 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2667 2668 // Cast to the vector pointer type. 2669 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2670 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2671 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2672 } 2673 2674 setDebugLocFromInst(Builder, Instr); 2675 Value *PoisonVec = PoisonValue::get(VecTy); 2676 2677 Value *MaskForGaps = nullptr; 2678 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2679 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2680 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2681 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2682 } 2683 2684 // Vectorize the interleaved load group. 2685 if (isa<LoadInst>(Instr)) { 2686 // For each unroll part, create a wide load for the group. 2687 SmallVector<Value *, 2> NewLoads; 2688 for (unsigned Part = 0; Part < UF; Part++) { 2689 Instruction *NewLoad; 2690 if (BlockInMask || MaskForGaps) { 2691 assert(useMaskedInterleavedAccesses(*TTI) && 2692 "masked interleaved groups are not allowed."); 2693 Value *GroupMask = MaskForGaps; 2694 if (BlockInMask) { 2695 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2696 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2697 Value *ShuffledMask = Builder.CreateShuffleVector( 2698 BlockInMaskPart, 2699 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2700 "interleaved.mask"); 2701 GroupMask = MaskForGaps 2702 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2703 MaskForGaps) 2704 : ShuffledMask; 2705 } 2706 NewLoad = 2707 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2708 GroupMask, PoisonVec, "wide.masked.vec"); 2709 } 2710 else 2711 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2712 Group->getAlign(), "wide.vec"); 2713 Group->addMetadata(NewLoad); 2714 NewLoads.push_back(NewLoad); 2715 } 2716 2717 // For each member in the group, shuffle out the appropriate data from the 2718 // wide loads. 2719 unsigned J = 0; 2720 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2721 Instruction *Member = Group->getMember(I); 2722 2723 // Skip the gaps in the group. 2724 if (!Member) 2725 continue; 2726 2727 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2728 auto StrideMask = 2729 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2730 for (unsigned Part = 0; Part < UF; Part++) { 2731 Value *StridedVec = Builder.CreateShuffleVector( 2732 NewLoads[Part], StrideMask, "strided.vec"); 2733 2734 // If this member has different type, cast the result type. 2735 if (Member->getType() != ScalarTy) { 2736 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2737 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2738 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2739 } 2740 2741 if (Group->isReverse()) 2742 StridedVec = reverseVector(StridedVec); 2743 2744 State.set(VPDefs[J], StridedVec, Part); 2745 } 2746 ++J; 2747 } 2748 return; 2749 } 2750 2751 // The sub vector type for current instruction. 
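  // (Illustrative example, assuming VF = 4, factor 3 and i32 elements: SubVT
  //  below is <4 x i32>, while the wide interleaved type VecTy created above
  //  is <12 x i32>.)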
2752 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2753 auto *SubVT = VectorType::get(ScalarTy, VF); 2754 2755 // Vectorize the interleaved store group. 2756 for (unsigned Part = 0; Part < UF; Part++) { 2757 // Collect the stored vector from each member. 2758 SmallVector<Value *, 4> StoredVecs; 2759 for (unsigned i = 0; i < InterleaveFactor; i++) { 2760 // Interleaved store group doesn't allow a gap, so each index has a member 2761 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2762 2763 Value *StoredVec = State.get(StoredValues[i], Part); 2764 2765 if (Group->isReverse()) 2766 StoredVec = reverseVector(StoredVec); 2767 2768 // If this member has different type, cast it to a unified type. 2769 2770 if (StoredVec->getType() != SubVT) 2771 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2772 2773 StoredVecs.push_back(StoredVec); 2774 } 2775 2776 // Concatenate all vectors into a wide vector. 2777 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2778 2779 // Interleave the elements in the wide vector. 2780 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2781 Value *IVec = Builder.CreateShuffleVector( 2782 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2783 "interleaved.vec"); 2784 2785 Instruction *NewStoreInstr; 2786 if (BlockInMask) { 2787 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2788 Value *ShuffledMask = Builder.CreateShuffleVector( 2789 BlockInMaskPart, 2790 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2791 "interleaved.mask"); 2792 NewStoreInstr = Builder.CreateMaskedStore( 2793 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2794 } 2795 else 2796 NewStoreInstr = 2797 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2798 2799 Group->addMetadata(NewStoreInstr); 2800 } 2801 } 2802 2803 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2804 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2805 VPValue *StoredValue, VPValue *BlockInMask) { 2806 // Attempt to issue a wide load. 2807 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2808 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2809 2810 assert((LI || SI) && "Invalid Load/Store instruction"); 2811 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2812 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2813 2814 LoopVectorizationCostModel::InstWidening Decision = 2815 Cost->getWideningDecision(Instr, VF); 2816 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2817 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2818 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2819 "CM decision is not to widen the memory instruction"); 2820 2821 Type *ScalarDataTy = getMemInstValueType(Instr); 2822 2823 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2824 const Align Alignment = getLoadStoreAlignment(Instr); 2825 2826 // Determine if the pointer operand of the access is either consecutive or 2827 // reverse consecutive. 2828 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2829 bool ConsecutiveStride = 2830 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2831 bool CreateGatherScatter = 2832 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2833 2834 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2835 // gather/scatter. Otherwise Decision should have been to Scalarize. 
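  // As a hypothetical source-level illustration of the three decisions handled
  // here: a unit-stride access such as A[i] maps to CM_Widen, a reversed
  // unit-stride access such as A[N - i] maps to CM_Widen_Reverse, and an
  // indexed access such as A[B[i]] maps to CM_GatherScatter.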
2836 assert((ConsecutiveStride || CreateGatherScatter) && 2837 "The instruction should be scalarized"); 2838 (void)ConsecutiveStride; 2839 2840 VectorParts BlockInMaskParts(UF); 2841 bool isMaskRequired = BlockInMask; 2842 if (isMaskRequired) 2843 for (unsigned Part = 0; Part < UF; ++Part) 2844 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2845 2846 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2847 // Calculate the pointer for the specific unroll-part. 2848 GetElementPtrInst *PartPtr = nullptr; 2849 2850 bool InBounds = false; 2851 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2852 InBounds = gep->isInBounds(); 2853 if (Reverse) { 2854 // If the address is consecutive but reversed, then the 2855 // wide store needs to start at the last vector element. 2856 // RunTimeVF = VScale * VF.getKnownMinValue() 2857 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2858 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2859 // NumElt = -Part * RunTimeVF 2860 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2861 // LastLane = 1 - RunTimeVF 2862 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2863 PartPtr = 2864 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2865 PartPtr->setIsInBounds(InBounds); 2866 PartPtr = cast<GetElementPtrInst>( 2867 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2868 PartPtr->setIsInBounds(InBounds); 2869 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2870 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2871 } else { 2872 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2873 PartPtr = cast<GetElementPtrInst>( 2874 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2875 PartPtr->setIsInBounds(InBounds); 2876 } 2877 2878 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2879 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2880 }; 2881 2882 // Handle Stores: 2883 if (SI) { 2884 setDebugLocFromInst(Builder, SI); 2885 2886 for (unsigned Part = 0; Part < UF; ++Part) { 2887 Instruction *NewSI = nullptr; 2888 Value *StoredVal = State.get(StoredValue, Part); 2889 if (CreateGatherScatter) { 2890 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2891 Value *VectorGep = State.get(Addr, Part); 2892 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2893 MaskPart); 2894 } else { 2895 if (Reverse) { 2896 // If we store to reverse consecutive memory locations, then we need 2897 // to reverse the order of elements in the stored value. 2898 StoredVal = reverseVector(StoredVal); 2899 // We don't want to update the value in the map as it might be used in 2900 // another expression. So don't call resetVectorValue(StoredVal). 2901 } 2902 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2903 if (isMaskRequired) 2904 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2905 BlockInMaskParts[Part]); 2906 else 2907 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2908 } 2909 addMetadata(NewSI, SI); 2910 } 2911 return; 2912 } 2913 2914 // Handle loads. 2915 assert(LI && "Must have a load instruction"); 2916 setDebugLocFromInst(Builder, LI); 2917 for (unsigned Part = 0; Part < UF; ++Part) { 2918 Value *NewLI; 2919 if (CreateGatherScatter) { 2920 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2921 Value *VectorGep = State.get(Addr, Part); 2922 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2923 nullptr, "wide.masked.gather"); 2924 addMetadata(NewLI, LI); 2925 } else { 2926 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2927 if (isMaskRequired) 2928 NewLI = Builder.CreateMaskedLoad( 2929 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2930 "wide.masked.load"); 2931 else 2932 NewLI = 2933 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2934 2935 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2936 addMetadata(NewLI, LI); 2937 if (Reverse) 2938 NewLI = reverseVector(NewLI); 2939 } 2940 2941 State.set(Def, NewLI, Part); 2942 } 2943 } 2944 2945 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 2946 VPUser &User, 2947 const VPIteration &Instance, 2948 bool IfPredicateInstr, 2949 VPTransformState &State) { 2950 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2951 2952 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2953 // the first lane and part. 2954 if (isa<NoAliasScopeDeclInst>(Instr)) 2955 if (!Instance.isFirstIteration()) 2956 return; 2957 2958 setDebugLocFromInst(Builder, Instr); 2959 2960 // Does this instruction return a value ? 2961 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2962 2963 Instruction *Cloned = Instr->clone(); 2964 if (!IsVoidRetTy) 2965 Cloned->setName(Instr->getName() + ".cloned"); 2966 2967 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2968 Builder.GetInsertPoint()); 2969 // Replace the operands of the cloned instructions with their scalar 2970 // equivalents in the new loop. 2971 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2972 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2973 auto InputInstance = Instance; 2974 if (!Operand || !OrigLoop->contains(Operand) || 2975 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2976 InputInstance.Lane = VPLane::getFirstLane(); 2977 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2978 Cloned->setOperand(op, NewOp); 2979 } 2980 addNewMetadata(Cloned, Instr); 2981 2982 // Place the cloned scalar in the new loop. 2983 Builder.Insert(Cloned); 2984 2985 State.set(Def, Cloned, Instance); 2986 2987 // If we just cloned a new assumption, add it the assumption cache. 2988 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2989 if (II->getIntrinsicID() == Intrinsic::assume) 2990 AC->registerAssumption(II); 2991 2992 // End if-block. 2993 if (IfPredicateInstr) 2994 PredicatedInstructions.push_back(Cloned); 2995 } 2996 2997 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2998 Value *End, Value *Step, 2999 Instruction *DL) { 3000 BasicBlock *Header = L->getHeader(); 3001 BasicBlock *Latch = L->getLoopLatch(); 3002 // As we're just creating this loop, it's possible no latch exists 3003 // yet. If so, use the header as this will be a single block loop. 3004 if (!Latch) 3005 Latch = Header; 3006 3007 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3008 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3009 setDebugLocFromInst(Builder, OldInst); 3010 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3011 3012 Builder.SetInsertPoint(Latch->getTerminator()); 3013 setDebugLocFromInst(Builder, OldInst); 3014 3015 // Create i+1 and fill the PHINode. 
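  // (Illustrative shape of the result, assuming an i64 IV with VF = 4 and
  //  UF = 2, i.e. Step = 8:
  //    %index      = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
  //    %index.next = add i64 %index, 8
  //    %cmp        = icmp eq i64 %index.next, %end
  //    br i1 %cmp, label %exit, label %header)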
3016 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3017 Induction->addIncoming(Start, L->getLoopPreheader()); 3018 Induction->addIncoming(Next, Latch); 3019 // Create the compare. 3020 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3021 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3022 3023 // Now we have two terminators. Remove the old one from the block. 3024 Latch->getTerminator()->eraseFromParent(); 3025 3026 return Induction; 3027 } 3028 3029 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3030 if (TripCount) 3031 return TripCount; 3032 3033 assert(L && "Create Trip Count for null loop."); 3034 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3035 // Find the loop boundaries. 3036 ScalarEvolution *SE = PSE.getSE(); 3037 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3038 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3039 "Invalid loop count"); 3040 3041 Type *IdxTy = Legal->getWidestInductionType(); 3042 assert(IdxTy && "No type for induction"); 3043 3044 // The exit count might have the type of i64 while the phi is i32. This can 3045 // happen if we have an induction variable that is sign extended before the 3046 // compare. The only way that we get a backedge taken count is that the 3047 // induction variable was signed and as such will not overflow. In such a case 3048 // truncation is legal. 3049 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3050 IdxTy->getPrimitiveSizeInBits()) 3051 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3052 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3053 3054 // Get the total trip count from the count by adding 1. 3055 const SCEV *ExitCount = SE->getAddExpr( 3056 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3057 3058 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3059 3060 // Expand the trip count and place the new instructions in the preheader. 3061 // Notice that the pre-header does not change, only the loop body. 3062 SCEVExpander Exp(*SE, DL, "induction"); 3063 3064 // Count holds the overall loop count (N). 3065 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3066 L->getLoopPreheader()->getTerminator()); 3067 3068 if (TripCount->getType()->isPointerTy()) 3069 TripCount = 3070 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3071 L->getLoopPreheader()->getTerminator()); 3072 3073 return TripCount; 3074 } 3075 3076 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3077 if (VectorTripCount) 3078 return VectorTripCount; 3079 3080 Value *TC = getOrCreateTripCount(L); 3081 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3082 3083 Type *Ty = TC->getType(); 3084 // This is where we can make the step a runtime constant. 3085 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3086 3087 // If the tail is to be folded by masking, round the number of iterations N 3088 // up to a multiple of Step instead of rounding down. This is done by first 3089 // adding Step-1 and then rounding down. Note that it's ok if this addition 3090 // overflows: the vector induction variable will eventually wrap to zero given 3091 // that it starts at zero and its Step is a power of two; the loop will then 3092 // exit, with the last early-exit vector comparison also producing all-true. 
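  // Illustrative numbers (not derived from the code): with VF = 4, UF = 2
  // (Step = 8) and N = 13, folding the tail rounds N up to 13 + 7 = 20, the
  // remainder computed below becomes 20 % 8 = 4, and the vector trip count is
  // 20 - 4 = 16, i.e. two masked vector iterations covering all 13 scalar
  // iterations.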
3093 if (Cost->foldTailByMasking()) { 3094 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3095 "VF*UF must be a power of 2 when folding tail by masking"); 3096 assert(!VF.isScalable() && 3097 "Tail folding not yet supported for scalable vectors"); 3098 TC = Builder.CreateAdd( 3099 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3100 } 3101 3102 // Now we need to generate the expression for the part of the loop that the 3103 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3104 // iterations are not required for correctness, or N - Step, otherwise. Step 3105 // is equal to the vectorization factor (number of SIMD elements) times the 3106 // unroll factor (number of SIMD instructions). 3107 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3108 3109 // There are two cases where we need to ensure (at least) the last iteration 3110 // runs in the scalar remainder loop. Thus, if the step evenly divides 3111 // the trip count, we set the remainder to be equal to the step. If the step 3112 // does not evenly divide the trip count, no adjustment is necessary since 3113 // there will already be scalar iterations. Note that the minimum iterations 3114 // check ensures that N >= Step. The cases are: 3115 // 1) If there is a non-reversed interleaved group that may speculatively 3116 // access memory out-of-bounds. 3117 // 2) If any instruction may follow a conditionally taken exit. That is, if 3118 // the loop contains multiple exiting blocks, or a single exiting block 3119 // which is not the latch. 3120 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3121 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3122 R = Builder.CreateSelect(IsZero, Step, R); 3123 } 3124 3125 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3126 3127 return VectorTripCount; 3128 } 3129 3130 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3131 const DataLayout &DL) { 3132 // Verify that V is a vector type with same number of elements as DstVTy. 3133 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3134 unsigned VF = DstFVTy->getNumElements(); 3135 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3136 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3137 Type *SrcElemTy = SrcVecTy->getElementType(); 3138 Type *DstElemTy = DstFVTy->getElementType(); 3139 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3140 "Vector elements must have same size"); 3141 3142 // Do a direct cast if element types are castable. 3143 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3144 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3145 } 3146 // V cannot be directly casted to desired vector type. 3147 // May happen when V is a floating point vector but DstVTy is a vector of 3148 // pointers or vice-versa. Handle this using a two-step bitcast using an 3149 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
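  // (Illustrative example, assuming 64-bit pointers and VF = 4: casting
  //  <4 x double> to <4 x i8*> becomes a two-step sequence,
  //  <4 x double> to <4 x i64> via bitcast, then <4 x i64> to <4 x i8*> via
  //  inttoptr.)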
3150 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3151 "Only one type should be a pointer type"); 3152 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3153 "Only one type should be a floating point type"); 3154 Type *IntTy = 3155 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3156 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3157 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3158 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3159 } 3160 3161 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3162 BasicBlock *Bypass) { 3163 Value *Count = getOrCreateTripCount(L); 3164 // Reuse existing vector loop preheader for TC checks. 3165 // Note that new preheader block is generated for vector loop. 3166 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3167 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3168 3169 // Generate code to check if the loop's trip count is less than VF * UF, or 3170 // equal to it in case a scalar epilogue is required; this implies that the 3171 // vector trip count is zero. This check also covers the case where adding one 3172 // to the backedge-taken count overflowed leading to an incorrect trip count 3173 // of zero. In this case we will also jump to the scalar loop. 3174 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3175 : ICmpInst::ICMP_ULT; 3176 3177 // If tail is to be folded, vector loop takes care of all iterations. 3178 Value *CheckMinIters = Builder.getFalse(); 3179 if (!Cost->foldTailByMasking()) { 3180 Value *Step = 3181 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3182 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3183 } 3184 // Create new preheader for vector loop. 3185 LoopVectorPreHeader = 3186 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3187 "vector.ph"); 3188 3189 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3190 DT->getNode(Bypass)->getIDom()) && 3191 "TC check is expected to dominate Bypass"); 3192 3193 // Update dominator for Bypass & LoopExit. 3194 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3195 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3196 3197 ReplaceInstWithInst( 3198 TCCheckBlock->getTerminator(), 3199 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3200 LoopBypassBlocks.push_back(TCCheckBlock); 3201 } 3202 3203 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3204 3205 BasicBlock *const SCEVCheckBlock = 3206 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3207 if (!SCEVCheckBlock) 3208 return nullptr; 3209 3210 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3211 (OptForSizeBasedOnProfile && 3212 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3213 "Cannot SCEV check stride or overflow when optimizing for size"); 3214 3215 3216 // Update dominator only if this is first RT check. 3217 if (LoopBypassBlocks.empty()) { 3218 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3219 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3220 } 3221 3222 LoopBypassBlocks.push_back(SCEVCheckBlock); 3223 AddedSafetyChecks = true; 3224 return SCEVCheckBlock; 3225 } 3226 3227 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3228 BasicBlock *Bypass) { 3229 // VPlan-native path does not do any analysis for runtime checks currently. 
3230 if (EnableVPlanNativePath) 3231 return nullptr; 3232 3233 BasicBlock *const MemCheckBlock = 3234 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3235 3236 // Check if we generated code that checks in runtime if arrays overlap. We put 3237 // the checks into a separate block to make the more common case of few 3238 // elements faster. 3239 if (!MemCheckBlock) 3240 return nullptr; 3241 3242 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3243 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3244 "Cannot emit memory checks when optimizing for size, unless forced " 3245 "to vectorize."); 3246 ORE->emit([&]() { 3247 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3248 L->getStartLoc(), L->getHeader()) 3249 << "Code-size may be reduced by not forcing " 3250 "vectorization, or by source-code modifications " 3251 "eliminating the need for runtime checks " 3252 "(e.g., adding 'restrict')."; 3253 }); 3254 } 3255 3256 LoopBypassBlocks.push_back(MemCheckBlock); 3257 3258 AddedSafetyChecks = true; 3259 3260 // We currently don't use LoopVersioning for the actual loop cloning but we 3261 // still use it to add the noalias metadata. 3262 LVer = std::make_unique<LoopVersioning>( 3263 *Legal->getLAI(), 3264 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3265 DT, PSE.getSE()); 3266 LVer->prepareNoAliasMetadata(); 3267 return MemCheckBlock; 3268 } 3269 3270 Value *InnerLoopVectorizer::emitTransformedIndex( 3271 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3272 const InductionDescriptor &ID) const { 3273 3274 SCEVExpander Exp(*SE, DL, "induction"); 3275 auto Step = ID.getStep(); 3276 auto StartValue = ID.getStartValue(); 3277 assert(Index->getType() == Step->getType() && 3278 "Index type does not match StepValue type"); 3279 3280 // Note: the IR at this point is broken. We cannot use SE to create any new 3281 // SCEV and then expand it, hoping that SCEV's simplification will give us 3282 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3283 // lead to various SCEV crashes. So all we can do is to use builder and rely 3284 // on InstCombine for future simplifications. Here we handle some trivial 3285 // cases only. 3286 auto CreateAdd = [&B](Value *X, Value *Y) { 3287 assert(X->getType() == Y->getType() && "Types don't match!"); 3288 if (auto *CX = dyn_cast<ConstantInt>(X)) 3289 if (CX->isZero()) 3290 return Y; 3291 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3292 if (CY->isZero()) 3293 return X; 3294 return B.CreateAdd(X, Y); 3295 }; 3296 3297 auto CreateMul = [&B](Value *X, Value *Y) { 3298 assert(X->getType() == Y->getType() && "Types don't match!"); 3299 if (auto *CX = dyn_cast<ConstantInt>(X)) 3300 if (CX->isOne()) 3301 return Y; 3302 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3303 if (CY->isOne()) 3304 return X; 3305 return B.CreateMul(X, Y); 3306 }; 3307 3308 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3309 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3310 // the DomTree is not kept up-to-date for additional blocks generated in the 3311 // vector loop. By using the header as insertion point, we guarantee that the 3312 // expanded instructions dominate all their uses. 
3313 auto GetInsertPoint = [this, &B]() { 3314 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3315 if (InsertBB != LoopVectorBody && 3316 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3317 return LoopVectorBody->getTerminator(); 3318 return &*B.GetInsertPoint(); 3319 }; 3320 3321 switch (ID.getKind()) { 3322 case InductionDescriptor::IK_IntInduction: { 3323 assert(Index->getType() == StartValue->getType() && 3324 "Index type does not match StartValue type"); 3325 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3326 return B.CreateSub(StartValue, Index); 3327 auto *Offset = CreateMul( 3328 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3329 return CreateAdd(StartValue, Offset); 3330 } 3331 case InductionDescriptor::IK_PtrInduction: { 3332 assert(isa<SCEVConstant>(Step) && 3333 "Expected constant step for pointer induction"); 3334 return B.CreateGEP( 3335 StartValue->getType()->getPointerElementType(), StartValue, 3336 CreateMul(Index, 3337 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3338 } 3339 case InductionDescriptor::IK_FpInduction: { 3340 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3341 auto InductionBinOp = ID.getInductionBinOp(); 3342 assert(InductionBinOp && 3343 (InductionBinOp->getOpcode() == Instruction::FAdd || 3344 InductionBinOp->getOpcode() == Instruction::FSub) && 3345 "Original bin op should be defined for FP induction"); 3346 3347 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3348 Value *MulExp = B.CreateFMul(StepValue, Index); 3349 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3350 "induction"); 3351 } 3352 case InductionDescriptor::IK_NoInduction: 3353 return nullptr; 3354 } 3355 llvm_unreachable("invalid enum"); 3356 } 3357 3358 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3359 LoopScalarBody = OrigLoop->getHeader(); 3360 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3361 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3362 assert(LoopExitBlock && "Must have an exit block"); 3363 assert(LoopVectorPreHeader && "Invalid loop structure"); 3364 3365 LoopMiddleBlock = 3366 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3367 LI, nullptr, Twine(Prefix) + "middle.block"); 3368 LoopScalarPreHeader = 3369 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3370 nullptr, Twine(Prefix) + "scalar.ph"); 3371 3372 // Set up branch from middle block to the exit and scalar preheader blocks. 3373 // completeLoopSkeleton will update the condition to use an iteration check, 3374 // if required to decide whether to execute the remainder. 3375 BranchInst *BrInst = 3376 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3377 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3378 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3379 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3380 3381 // We intentionally don't let SplitBlock to update LoopInfo since 3382 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3383 // LoopVectorBody is explicitly added to the correct place few lines later. 3384 LoopVectorBody = 3385 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3386 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3387 3388 // Update dominator for loop exit. 
3389 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3390 3391 // Create and register the new vector loop. 3392 Loop *Lp = LI->AllocateLoop(); 3393 Loop *ParentLoop = OrigLoop->getParentLoop(); 3394 3395 // Insert the new loop into the loop nest and register the new basic blocks 3396 // before calling any utilities such as SCEV that require valid LoopInfo. 3397 if (ParentLoop) { 3398 ParentLoop->addChildLoop(Lp); 3399 } else { 3400 LI->addTopLevelLoop(Lp); 3401 } 3402 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3403 return Lp; 3404 } 3405 3406 void InnerLoopVectorizer::createInductionResumeValues( 3407 Loop *L, Value *VectorTripCount, 3408 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3409 assert(VectorTripCount && L && "Expected valid arguments"); 3410 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3411 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3412 "Inconsistent information about additional bypass."); 3413 // We are going to resume the execution of the scalar loop. 3414 // Go over all of the induction variables that we found and fix the 3415 // PHIs that are left in the scalar version of the loop. 3416 // The starting values of PHI nodes depend on the counter of the last 3417 // iteration in the vectorized loop. 3418 // If we come from a bypass edge then we need to start from the original 3419 // start value. 3420 for (auto &InductionEntry : Legal->getInductionVars()) { 3421 PHINode *OrigPhi = InductionEntry.first; 3422 InductionDescriptor II = InductionEntry.second; 3423 3424 // Create phi nodes to merge from the backedge-taken check block. 3425 PHINode *BCResumeVal = 3426 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3427 LoopScalarPreHeader->getTerminator()); 3428 // Copy original phi DL over to the new one. 3429 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3430 Value *&EndValue = IVEndValues[OrigPhi]; 3431 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3432 if (OrigPhi == OldInduction) { 3433 // We know what the end value is. 3434 EndValue = VectorTripCount; 3435 } else { 3436 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3437 3438 // Fast-math-flags propagate from the original induction instruction. 3439 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3440 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3441 3442 Type *StepType = II.getStep()->getType(); 3443 Instruction::CastOps CastOp = 3444 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3445 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3446 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3447 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3448 EndValue->setName("ind.end"); 3449 3450 // Compute the end value for the additional bypass (if applicable). 3451 if (AdditionalBypass.first) { 3452 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3453 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3454 StepType, true); 3455 CRD = 3456 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3457 EndValueFromAdditionalBypass = 3458 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3459 EndValueFromAdditionalBypass->setName("ind.end"); 3460 } 3461 } 3462 // The new PHI merges the original incoming value, in case of a bypass, 3463 // or the value at the end of the vectorized loop. 
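    // (Illustrative values: for the primary induction starting at 0, with a
    //  trip count of 13 and VF * UF = 8, the vector loop covers 8 iterations,
    //  so bc.resume.val is 8 when reached from the middle block and 0, the
    //  original start value, when reached from one of the bypass blocks.)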
3464 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3465 3466 // Fix the scalar body counter (PHI node). 3467 // The old induction's phi node in the scalar body needs the truncated 3468 // value. 3469 for (BasicBlock *BB : LoopBypassBlocks) 3470 BCResumeVal->addIncoming(II.getStartValue(), BB); 3471 3472 if (AdditionalBypass.first) 3473 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3474 EndValueFromAdditionalBypass); 3475 3476 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3477 } 3478 } 3479 3480 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3481 MDNode *OrigLoopID) { 3482 assert(L && "Expected valid loop."); 3483 3484 // The trip counts should be cached by now. 3485 Value *Count = getOrCreateTripCount(L); 3486 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3487 3488 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3489 3490 // Add a check in the middle block to see if we have completed 3491 // all of the iterations in the first vector loop. 3492 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3493 // If tail is to be folded, we know we don't need to run the remainder. 3494 if (!Cost->foldTailByMasking()) { 3495 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3496 Count, VectorTripCount, "cmp.n", 3497 LoopMiddleBlock->getTerminator()); 3498 3499 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3500 // of the corresponding compare because they may have ended up with 3501 // different line numbers and we want to avoid awkward line stepping while 3502 // debugging. Eg. if the compare has got a line number inside the loop. 3503 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3504 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3505 } 3506 3507 // Get ready to start creating new instructions into the vectorized body. 3508 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3509 "Inconsistent vector loop preheader"); 3510 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3511 3512 Optional<MDNode *> VectorizedLoopID = 3513 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3514 LLVMLoopVectorizeFollowupVectorized}); 3515 if (VectorizedLoopID.hasValue()) { 3516 L->setLoopID(VectorizedLoopID.getValue()); 3517 3518 // Do not setAlreadyVectorized if loop attributes have been defined 3519 // explicitly. 3520 return LoopVectorPreHeader; 3521 } 3522 3523 // Keep all loop hints from the original loop on the vector loop (we'll 3524 // replace the vectorizer-specific hints below). 3525 if (MDNode *LID = OrigLoop->getLoopID()) 3526 L->setLoopID(LID); 3527 3528 LoopVectorizeHints Hints(L, true, *ORE); 3529 Hints.setAlreadyVectorized(); 3530 3531 #ifdef EXPENSIVE_CHECKS 3532 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3533 LI->verify(*DT); 3534 #endif 3535 3536 return LoopVectorPreHeader; 3537 } 3538 3539 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3540 /* 3541 In this function we generate a new loop. The new loop will contain 3542 the vectorized instructions while the old loop will continue to run the 3543 scalar remainder. 3544 3545 [ ] <-- loop iteration number check. 3546 / | 3547 / v 3548 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3549 | / | 3550 | / v 3551 || [ ] <-- vector pre header. 3552 |/ | 3553 | v 3554 | [ ] \ 3555 | [ ]_| <-- vector loop. 3556 | | 3557 | v 3558 | -[ ] <--- middle-block. 
3559 | / | 3560 | / v 3561 -|- >[ ] <--- new preheader. 3562 | | 3563 | v 3564 | [ ] \ 3565 | [ ]_| <-- old scalar loop to handle remainder. 3566 \ | 3567 \ v 3568 >[ ] <-- exit block. 3569 ... 3570 */ 3571 3572 // Get the metadata of the original loop before it gets modified. 3573 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3574 3575 // Create an empty vector loop, and prepare basic blocks for the runtime 3576 // checks. 3577 Loop *Lp = createVectorLoopSkeleton(""); 3578 3579 // Now, compare the new count to zero. If it is zero skip the vector loop and 3580 // jump to the scalar loop. This check also covers the case where the 3581 // backedge-taken count is uint##_max: adding one to it will overflow leading 3582 // to an incorrect trip count of zero. In this (rare) case we will also jump 3583 // to the scalar loop. 3584 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3585 3586 // Generate the code to check any assumptions that we've made for SCEV 3587 // expressions. 3588 emitSCEVChecks(Lp, LoopScalarPreHeader); 3589 3590 // Generate the code that checks in runtime if arrays overlap. We put the 3591 // checks into a separate block to make the more common case of few elements 3592 // faster. 3593 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3594 3595 // Some loops have a single integer induction variable, while other loops 3596 // don't. One example is c++ iterators that often have multiple pointer 3597 // induction variables. In the code below we also support a case where we 3598 // don't have a single induction variable. 3599 // 3600 // We try to obtain an induction variable from the original loop as hard 3601 // as possible. However if we don't find one that: 3602 // - is an integer 3603 // - counts from zero, stepping by one 3604 // - is the size of the widest induction variable type 3605 // then we create a new one. 3606 OldInduction = Legal->getPrimaryInduction(); 3607 Type *IdxTy = Legal->getWidestInductionType(); 3608 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3609 // The loop step is equal to the vectorization factor (num of SIMD elements) 3610 // times the unroll factor (num of SIMD instructions). 3611 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3612 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3613 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3614 Induction = 3615 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3616 getDebugLocFromInstOrOperands(OldInduction)); 3617 3618 // Emit phis for the new starting index of the scalar loop. 3619 createInductionResumeValues(Lp, CountRoundDown); 3620 3621 return completeLoopSkeleton(Lp, OrigLoopID); 3622 } 3623 3624 // Fix up external users of the induction variable. At this point, we are 3625 // in LCSSA form, with all external PHIs that use the IV having one input value, 3626 // coming from the remainder loop. We need those PHIs to also have a correct 3627 // value for the IV when arriving directly from the middle block. 3628 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3629 const InductionDescriptor &II, 3630 Value *CountRoundDown, Value *EndValue, 3631 BasicBlock *MiddleBlock) { 3632 // There are two kinds of external IV usages - those that use the value 3633 // computed in the last iteration (the PHI) and those that use the penultimate 3634 // value (the value that feeds into the phi from the loop latch). 3635 // We allow both, but they, obviously, have different values. 
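  //
  // For illustration only (a hypothetical loop, not taken from the code):
  // given
  //   for (i = 0; i < n; ++i) { ... }
  //   use(i);
  // an LCSSA phi that uses the post-increment value i.next receives EndValue
  // (the vector trip count), whereas a phi that uses the phi i itself receives
  // EndValue - Step, recomputed below as Start + Step * (CRD - 1).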
3636
3637   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3638
3639   DenseMap<Value *, Value *> MissingVals;
3640
3641   // An external user of the last iteration's value should see the value that
3642   // the remainder loop uses to initialize its own IV.
3643   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3644   for (User *U : PostInc->users()) {
3645     Instruction *UI = cast<Instruction>(U);
3646     if (!OrigLoop->contains(UI)) {
3647       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3648       MissingVals[UI] = EndValue;
3649     }
3650   }
3651
3652   // An external user of the penultimate value needs to see EndValue - Step.
3653   // The simplest way to get this is to recompute it from the constituent SCEVs,
3654   // that is Start + (Step * (CRD - 1)).
3655   for (User *U : OrigPhi->users()) {
3656     auto *UI = cast<Instruction>(U);
3657     if (!OrigLoop->contains(UI)) {
3658       const DataLayout &DL =
3659           OrigLoop->getHeader()->getModule()->getDataLayout();
3660       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3661
3662       IRBuilder<> B(MiddleBlock->getTerminator());
3663
3664       // Fast-math-flags propagate from the original induction instruction.
3665       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3666         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3667
3668       Value *CountMinusOne = B.CreateSub(
3669           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3670       Value *CMO =
3671           !II.getStep()->getType()->isIntegerTy()
3672               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3673                              II.getStep()->getType())
3674               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3675       CMO->setName("cast.cmo");
3676       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3677       Escape->setName("ind.escape");
3678       MissingVals[UI] = Escape;
3679     }
3680   }
3681
3682   for (auto &I : MissingVals) {
3683     PHINode *PHI = cast<PHINode>(I.first);
3684     // One corner case we have to handle is two IVs "chasing" each other,
3685     // that is %IV2 = phi [...], [ %IV1, %latch ].
3686     // In this case, if IV1 has an external use, we need to avoid adding both
3687     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3688     // don't already have an incoming value for the middle block.
3689     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3690       PHI->addIncoming(I.second, MiddleBlock);
3691   }
3692 }
3693
3694 namespace {
3695
3696 struct CSEDenseMapInfo {
3697   static bool canHandle(const Instruction *I) {
3698     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3699            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3700   }
3701
3702   static inline Instruction *getEmptyKey() {
3703     return DenseMapInfo<Instruction *>::getEmptyKey();
3704   }
3705
3706   static inline Instruction *getTombstoneKey() {
3707     return DenseMapInfo<Instruction *>::getTombstoneKey();
3708   }
3709
3710   static unsigned getHashValue(const Instruction *I) {
3711     assert(canHandle(I) && "Unknown instruction!");
3712     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3713                                                            I->value_op_end()));
3714   }
3715
3716   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3717     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3718         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3719       return LHS == RHS;
3720     return LHS->isIdenticalTo(RHS);
3721   }
3722 };
3723
3724 } // end anonymous namespace
3725
3726 /// Perform CSE of induction variable instructions.
3727 static void cse(BasicBlock *BB) { 3728 // Perform simple cse. 3729 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3730 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3731 Instruction *In = &*I++; 3732 3733 if (!CSEDenseMapInfo::canHandle(In)) 3734 continue; 3735 3736 // Check if we can replace this instruction with any of the 3737 // visited instructions. 3738 if (Instruction *V = CSEMap.lookup(In)) { 3739 In->replaceAllUsesWith(V); 3740 In->eraseFromParent(); 3741 continue; 3742 } 3743 3744 CSEMap[In] = In; 3745 } 3746 } 3747 3748 InstructionCost 3749 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3750 bool &NeedToScalarize) { 3751 Function *F = CI->getCalledFunction(); 3752 Type *ScalarRetTy = CI->getType(); 3753 SmallVector<Type *, 4> Tys, ScalarTys; 3754 for (auto &ArgOp : CI->arg_operands()) 3755 ScalarTys.push_back(ArgOp->getType()); 3756 3757 // Estimate cost of scalarized vector call. The source operands are assumed 3758 // to be vectors, so we need to extract individual elements from there, 3759 // execute VF scalar calls, and then gather the result into the vector return 3760 // value. 3761 InstructionCost ScalarCallCost = 3762 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3763 if (VF.isScalar()) 3764 return ScalarCallCost; 3765 3766 // Compute corresponding vector type for return value and arguments. 3767 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3768 for (Type *ScalarTy : ScalarTys) 3769 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3770 3771 // Compute costs of unpacking argument values for the scalar calls and 3772 // packing the return values to a vector. 3773 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3774 3775 InstructionCost Cost = 3776 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3777 3778 // If we can't emit a vector call for this function, then the currently found 3779 // cost is the cost we need to return. 3780 NeedToScalarize = true; 3781 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3782 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3783 3784 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3785 return Cost; 3786 3787 // If the corresponding vector cost is cheaper, return its cost. 
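  // Hypothetical numbers for illustration: with VF = 4, a scalar call cost of
  // 10 and a scalarization overhead of 6, the scalarized cost computed above
  // is 4 * 10 + 6 = 46; if a vector variant exists and costs 20,
  // NeedToScalarize is cleared below and 20 is returned instead.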
3788 InstructionCost VectorCallCost = 3789 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3790 if (VectorCallCost < Cost) { 3791 NeedToScalarize = false; 3792 Cost = VectorCallCost; 3793 } 3794 return Cost; 3795 } 3796 3797 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3798 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3799 return Elt; 3800 return VectorType::get(Elt, VF); 3801 } 3802 3803 InstructionCost 3804 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3805 ElementCount VF) { 3806 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3807 assert(ID && "Expected intrinsic call!"); 3808 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3809 FastMathFlags FMF; 3810 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3811 FMF = FPMO->getFastMathFlags(); 3812 3813 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3814 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3815 SmallVector<Type *> ParamTys; 3816 std::transform(FTy->param_begin(), FTy->param_end(), 3817 std::back_inserter(ParamTys), 3818 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3819 3820 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3821 dyn_cast<IntrinsicInst>(CI)); 3822 return TTI.getIntrinsicInstrCost(CostAttrs, 3823 TargetTransformInfo::TCK_RecipThroughput); 3824 } 3825 3826 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3827 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3828 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3829 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3830 } 3831 3832 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3833 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3834 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3835 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3836 } 3837 3838 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3839 // For every instruction `I` in MinBWs, truncate the operands, create a 3840 // truncated version of `I` and reextend its result. InstCombine runs 3841 // later and will remove any ext/trunc pairs. 3842 SmallPtrSet<Value *, 4> Erased; 3843 for (const auto &KV : Cost->getMinimalBitwidths()) { 3844 // If the value wasn't vectorized, we must maintain the original scalar 3845 // type. The absence of the value from State indicates that it 3846 // wasn't vectorized. 3847 VPValue *Def = State.Plan->getVPValue(KV.first); 3848 if (!State.hasAnyVectorValue(Def)) 3849 continue; 3850 for (unsigned Part = 0; Part < UF; ++Part) { 3851 Value *I = State.get(Def, Part); 3852 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3853 continue; 3854 Type *OriginalTy = I->getType(); 3855 Type *ScalarTruncatedTy = 3856 IntegerType::get(OriginalTy->getContext(), KV.second); 3857 auto *TruncatedTy = FixedVectorType::get( 3858 ScalarTruncatedTy, 3859 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3860 if (TruncatedTy == OriginalTy) 3861 continue; 3862 3863 IRBuilder<> B(cast<Instruction>(I)); 3864 auto ShrinkOperand = [&](Value *V) -> Value * { 3865 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3866 if (ZI->getSrcTy() == TruncatedTy) 3867 return ZI->getOperand(0); 3868 return B.CreateZExtOrTrunc(V, TruncatedTy); 3869 }; 3870 3871 // The actual instruction modification depends on the instruction type, 3872 // unfortunately. 
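      // (Illustrative example: if MinBWs records that an i32 add only needs 8
      //  bits and VF = 4, the <4 x i32> add is rebuilt roughly as
      //    %a.tr = trunc <4 x i32> %a to <4 x i8>
      //    %b.tr = trunc <4 x i32> %b to <4 x i8>
      //    %add  = add <4 x i8> %a.tr, %b.tr
      //    %res  = zext <4 x i8> %add to <4 x i32>
      //  and InstCombine is expected to clean up redundant ext/trunc pairs.)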
3873 Value *NewI = nullptr; 3874 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3875 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3876 ShrinkOperand(BO->getOperand(1))); 3877 3878 // Any wrapping introduced by shrinking this operation shouldn't be 3879 // considered undefined behavior. So, we can't unconditionally copy 3880 // arithmetic wrapping flags to NewI. 3881 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3882 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3883 NewI = 3884 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3885 ShrinkOperand(CI->getOperand(1))); 3886 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3887 NewI = B.CreateSelect(SI->getCondition(), 3888 ShrinkOperand(SI->getTrueValue()), 3889 ShrinkOperand(SI->getFalseValue())); 3890 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3891 switch (CI->getOpcode()) { 3892 default: 3893 llvm_unreachable("Unhandled cast!"); 3894 case Instruction::Trunc: 3895 NewI = ShrinkOperand(CI->getOperand(0)); 3896 break; 3897 case Instruction::SExt: 3898 NewI = B.CreateSExtOrTrunc( 3899 CI->getOperand(0), 3900 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3901 break; 3902 case Instruction::ZExt: 3903 NewI = B.CreateZExtOrTrunc( 3904 CI->getOperand(0), 3905 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3906 break; 3907 } 3908 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3909 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3910 ->getNumElements(); 3911 auto *O0 = B.CreateZExtOrTrunc( 3912 SI->getOperand(0), 3913 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3914 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3915 ->getNumElements(); 3916 auto *O1 = B.CreateZExtOrTrunc( 3917 SI->getOperand(1), 3918 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3919 3920 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3921 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3922 // Don't do anything with the operands, just extend the result. 3923 continue; 3924 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3925 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3926 ->getNumElements(); 3927 auto *O0 = B.CreateZExtOrTrunc( 3928 IE->getOperand(0), 3929 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3930 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3931 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3932 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3933 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3934 ->getNumElements(); 3935 auto *O0 = B.CreateZExtOrTrunc( 3936 EE->getOperand(0), 3937 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3938 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3939 } else { 3940 // If we don't know what to do, be conservative and don't do anything. 3941 continue; 3942 } 3943 3944 // Lastly, extend the result. 3945 NewI->takeName(cast<Instruction>(I)); 3946 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3947 I->replaceAllUsesWith(Res); 3948 cast<Instruction>(I)->eraseFromParent(); 3949 Erased.insert(I); 3950 State.reset(Def, Res, Part); 3951 } 3952 } 3953 3954 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3955 for (const auto &KV : Cost->getMinimalBitwidths()) { 3956 // If the value wasn't vectorized, we must maintain the original scalar 3957 // type. The absence of the value from State indicates that it 3958 // wasn't vectorized. 
3959     VPValue *Def = State.Plan->getVPValue(KV.first);
3960     if (!State.hasAnyVectorValue(Def))
3961       continue;
3962     for (unsigned Part = 0; Part < UF; ++Part) {
3963       Value *I = State.get(Def, Part);
3964       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3965       if (Inst && Inst->use_empty()) {
3966         Value *NewI = Inst->getOperand(0);
3967         Inst->eraseFromParent();
3968         State.reset(Def, NewI, Part);
3969       }
3970     }
3971   }
3972 }
3973
3974 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3975   // Insert truncates and extends for any truncated instructions as hints to
3976   // InstCombine.
3977   if (VF.isVector())
3978     truncateToMinimalBitwidths(State);
3979
3980   // Fix widened non-induction PHIs by setting up the PHI operands.
3981   if (OrigPHIsToFix.size()) {
3982     assert(EnableVPlanNativePath &&
3983            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3984     fixNonInductionPHIs(State);
3985   }
3986
3987   // At this point every instruction in the original loop is widened to a
3988   // vector form. Now we need to fix the recurrences in the loop. These PHI
3989   // nodes are currently empty because we did not want to introduce cycles.
3990   // This is the second stage of vectorizing recurrences.
3991   fixCrossIterationPHIs(State);
3992
3993   // Forget the original basic block.
3994   PSE.getSE()->forgetLoop(OrigLoop);
3995
3996   // Fix-up external users of the induction variables.
3997   for (auto &Entry : Legal->getInductionVars())
3998     fixupIVUsers(Entry.first, Entry.second,
3999                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4000                  IVEndValues[Entry.first], LoopMiddleBlock);
4001
4002   fixLCSSAPHIs(State);
4003   for (Instruction *PI : PredicatedInstructions)
4004     sinkScalarOperands(&*PI);
4005
4006   // Remove redundant induction instructions.
4007   cse(LoopVectorBody);
4008
4009   // Set/update profile weights for the vector and remainder loops as original
4010   // loop iterations are now distributed among them. Note that the original
4011   // loop, represented by LoopScalarBody, becomes the remainder loop after
4012   // vectorization.
4013   //
4014   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4015   // end up with a slightly less precise result, but that should be OK since
4016   // profile info is not inherently precise anyway. Note also that any possible
4017   // bypass of the vector code caused by legality checks is ignored,
4018   // optimistically assigning all the weight to the vector loop.
4019   //
4020   // For scalable vectorization we can't know at compile time how many
4021   // iterations of the loop are handled in one vector iteration, so instead
4022   // assume a pessimistic vscale of '1'.
4022   setProfileInfoAfterUnrolling(
4023       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4024       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4025 }
4026
4027 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4028   // In order to support recurrences we need to be able to vectorize Phi nodes.
4029   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4030   // stage #2: We now need to fix the recurrences by adding incoming edges to
4031   // the currently empty PHI nodes. At this point every instruction in the
4032   // original loop is widened to a vector form so we can use them to construct
4033   // the incoming edges.
4034   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
4035     // Handle first-order recurrences and reductions that need to be fixed.
4036 if (Legal->isFirstOrderRecurrence(&Phi)) 4037 fixFirstOrderRecurrence(&Phi, State); 4038 else if (Legal->isReductionVariable(&Phi)) 4039 fixReduction(&Phi, State); 4040 } 4041 } 4042 4043 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4044 VPTransformState &State) { 4045 // This is the second phase of vectorizing first-order recurrences. An 4046 // overview of the transformation is described below. Suppose we have the 4047 // following loop. 4048 // 4049 // for (int i = 0; i < n; ++i) 4050 // b[i] = a[i] - a[i - 1]; 4051 // 4052 // There is a first-order recurrence on "a". For this loop, the shorthand 4053 // scalar IR looks like: 4054 // 4055 // scalar.ph: 4056 // s_init = a[-1] 4057 // br scalar.body 4058 // 4059 // scalar.body: 4060 // i = phi [0, scalar.ph], [i+1, scalar.body] 4061 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4062 // s2 = a[i] 4063 // b[i] = s2 - s1 4064 // br cond, scalar.body, ... 4065 // 4066 // In this example, s1 is a recurrence because it's value depends on the 4067 // previous iteration. In the first phase of vectorization, we created a 4068 // temporary value for s1. We now complete the vectorization and produce the 4069 // shorthand vector IR shown below (for VF = 4, UF = 1). 4070 // 4071 // vector.ph: 4072 // v_init = vector(..., ..., ..., a[-1]) 4073 // br vector.body 4074 // 4075 // vector.body 4076 // i = phi [0, vector.ph], [i+4, vector.body] 4077 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4078 // v2 = a[i, i+1, i+2, i+3]; 4079 // v3 = vector(v1(3), v2(0, 1, 2)) 4080 // b[i, i+1, i+2, i+3] = v2 - v3 4081 // br cond, vector.body, middle.block 4082 // 4083 // middle.block: 4084 // x = v2(3) 4085 // br scalar.ph 4086 // 4087 // scalar.ph: 4088 // s_init = phi [x, middle.block], [a[-1], otherwise] 4089 // br scalar.body 4090 // 4091 // After execution completes the vector loop, we extract the next value of 4092 // the recurrence (x) to use as the initial value in the scalar loop. 4093 4094 // Get the original loop preheader and single loop latch. 4095 auto *Preheader = OrigLoop->getLoopPreheader(); 4096 auto *Latch = OrigLoop->getLoopLatch(); 4097 4098 // Get the initial and previous values of the scalar recurrence. 4099 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4100 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4101 4102 // Create a vector from the initial value. 4103 auto *VectorInit = ScalarInit; 4104 if (VF.isVector()) { 4105 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4106 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4107 VectorInit = Builder.CreateInsertElement( 4108 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4109 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4110 } 4111 4112 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4113 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4114 // We constructed a temporary phi node in the first phase of vectorization. 4115 // This phi node will eventually be deleted. 4116 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4117 4118 // Create a phi node for the new recurrence. The current value will either be 4119 // the initial value inserted into a vector or loop-varying vector value. 4120 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4121 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4122 4123 // Get the vectorized previous value of the last part UF - 1. 
It appears last 4124 // among all unrolled iterations, due to the order of their construction. 4125 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4126 4127 // Find and set the insertion point after the previous value if it is an 4128 // instruction. 4129 BasicBlock::iterator InsertPt; 4130 // Note that the previous value may have been constant-folded so it is not 4131 // guaranteed to be an instruction in the vector loop. 4132 // FIXME: Loop invariant values do not form recurrences. We should deal with 4133 // them earlier. 4134 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4135 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4136 else { 4137 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4138 if (isa<PHINode>(PreviousLastPart)) 4139 // If the previous value is a phi node, we should insert after all the phi 4140 // nodes in the block containing the PHI to avoid breaking basic block 4141 // verification. Note that the basic block may be different to 4142 // LoopVectorBody, in case we predicate the loop. 4143 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4144 else 4145 InsertPt = ++PreviousInst->getIterator(); 4146 } 4147 Builder.SetInsertPoint(&*InsertPt); 4148 4149 // We will construct a vector for the recurrence by combining the values for 4150 // the current and previous iterations. This is the required shuffle mask. 4151 assert(!VF.isScalable()); 4152 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4153 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4154 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4155 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4156 4157 // The vector from which to take the initial value for the current iteration 4158 // (actual or unrolled). Initially, this is the vector phi node. 4159 Value *Incoming = VecPhi; 4160 4161 // Shuffle the current and previous vector and update the vector parts. 4162 for (unsigned Part = 0; Part < UF; ++Part) { 4163 Value *PreviousPart = State.get(PreviousDef, Part); 4164 Value *PhiPart = State.get(PhiDef, Part); 4165 auto *Shuffle = 4166 VF.isVector() 4167 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4168 : Incoming; 4169 PhiPart->replaceAllUsesWith(Shuffle); 4170 cast<Instruction>(PhiPart)->eraseFromParent(); 4171 State.reset(PhiDef, Shuffle, Part); 4172 Incoming = PreviousPart; 4173 } 4174 4175 // Fix the latch value of the new recurrence in the vector loop. 4176 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4177 4178 // Extract the last vector element in the middle block. This will be the 4179 // initial value for the recurrence when jumping to the scalar loop. 4180 auto *ExtractForScalar = Incoming; 4181 if (VF.isVector()) { 4182 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4183 ExtractForScalar = Builder.CreateExtractElement( 4184 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4185 "vector.recur.extract"); 4186 } 4187 // Extract the second last element in the middle block if the 4188 // Phi is used outside the loop. We need to extract the phi itself 4189 // and not the last element (the phi update in the current iteration). This 4190 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4191 // when the scalar loop is not run at all. 
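// Continuing the running example above (VF = 4, UF = 1): after the vector
// loop, Incoming holds a[i, i+1, i+2, i+3] of the final vector iteration.
// An LCSSA user of s1 needs the value the phi itself had in the last scalar
// iteration, which is a[i+2], i.e. the second last lane; the last lane,
// a[i+3], is the phi update s2 and is what the scalar epilogue resumes from.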
4192 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4193 if (VF.isVector()) 4194 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4195 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4196 "vector.recur.extract.for.phi"); 4197 // When loop is unrolled without vectorizing, initialize 4198 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 4199 // `Incoming`. This is analogous to the vectorized case above: extracting the 4200 // second last element when VF > 1. 4201 else if (UF > 1) 4202 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4203 4204 // Fix the initial value of the original recurrence in the scalar loop. 4205 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4206 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4207 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4208 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4209 Start->addIncoming(Incoming, BB); 4210 } 4211 4212 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4213 Phi->setName("scalar.recur"); 4214 4215 // Finally, fix users of the recurrence outside the loop. The users will need 4216 // either the last value of the scalar recurrence or the last value of the 4217 // vector recurrence we extracted in the middle block. Since the loop is in 4218 // LCSSA form, we just need to find all the phi nodes for the original scalar 4219 // recurrence in the exit block, and then add an edge for the middle block. 4220 // Note that LCSSA does not imply single entry when the original scalar loop 4221 // had multiple exiting edges (as we always run the last iteration in the 4222 // scalar epilogue); in that case, the exiting path through middle will be 4223 // dynamically dead and the value picked for the phi doesn't matter. 4224 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4225 if (any_of(LCSSAPhi.incoming_values(), 4226 [Phi](Value *V) { return V == Phi; })) 4227 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4228 } 4229 4230 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) { 4231 // Get it's reduction variable descriptor. 4232 assert(Legal->isReductionVariable(Phi) && 4233 "Unable to find the reduction variable"); 4234 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4235 4236 RecurKind RK = RdxDesc.getRecurrenceKind(); 4237 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4238 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4239 setDebugLocFromInst(Builder, ReductionStartValue); 4240 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4241 4242 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); 4243 // This is the vector-clone of the value that leaves the loop. 4244 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4245 4246 // Wrap flags are in general invalid after vectorization, clear them. 4247 clearReductionWrapFlags(RdxDesc, State); 4248 4249 // Fix the vector-loop phi. 4250 4251 // Reductions do not have to start at zero. They can start with 4252 // any loop invariant values. 
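// For example (shorthand, VF = 4), the reduction
//   int s = init;                        // loop-invariant start value
//   for (int i = 0; i < n; ++i) s += a[i];
// uses a <4 x i32> phi whose preheader incoming value is <init, 0, 0, 0>,
// i.e. the start value in lane 0 and the identity elsewhere; the code below
// only wires up the loop-latch incoming value of that phi.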
4253 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4254 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4255 4256 for (unsigned Part = 0; Part < UF; ++Part) { 4257 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part); 4258 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part); 4259 cast<PHINode>(VecRdxPhi) 4260 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4261 } 4262 4263 // Before each round, move the insertion point right between 4264 // the PHIs and the values we are going to write. 4265 // This allows us to write both PHINodes and the extractelement 4266 // instructions. 4267 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4268 4269 setDebugLocFromInst(Builder, LoopExitInst); 4270 4271 // If tail is folded by masking, the vector value to leave the loop should be 4272 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4273 // instead of the former. For an inloop reduction the reduction will already 4274 // be predicated, and does not need to be handled here. 4275 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4276 for (unsigned Part = 0; Part < UF; ++Part) { 4277 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4278 Value *Sel = nullptr; 4279 for (User *U : VecLoopExitInst->users()) { 4280 if (isa<SelectInst>(U)) { 4281 assert(!Sel && "Reduction exit feeding two selects"); 4282 Sel = U; 4283 } else 4284 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4285 } 4286 assert(Sel && "Reduction exit feeds no select"); 4287 State.reset(LoopExitInstDef, Sel, Part); 4288 4289 // If the target can create a predicated operator for the reduction at no 4290 // extra cost in the loop (for example a predicated vadd), it can be 4291 // cheaper for the select to remain in the loop than be sunk out of it, 4292 // and so use the select value for the phi instead of the old 4293 // LoopExitValue. 4294 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4295 if (PreferPredicatedReductionSelect || 4296 TTI->preferPredicatedReductionSelect( 4297 RdxDesc.getOpcode(), Phi->getType(), 4298 TargetTransformInfo::ReductionFlags())) { 4299 auto *VecRdxPhi = 4300 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part)); 4301 VecRdxPhi->setIncomingValueForBlock( 4302 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4303 } 4304 } 4305 } 4306 4307 // If the vector reduction can be performed in a smaller type, we truncate 4308 // then extend the loop exit value to enable InstCombine to evaluate the 4309 // entire expression in the smaller type. 4310 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4311 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4312 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4313 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4314 Builder.SetInsertPoint( 4315 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4316 VectorParts RdxParts(UF); 4317 for (unsigned Part = 0; Part < UF; ++Part) { 4318 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4319 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4320 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 4321 : Builder.CreateZExt(Trunc, VecTy); 4322 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4323 UI != RdxParts[Part]->user_end();) 4324 if (*UI != Trunc) { 4325 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4326 RdxParts[Part] = Extnd; 4327 } else { 4328 ++UI; 4329 } 4330 } 4331 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4332 for (unsigned Part = 0; Part < UF; ++Part) { 4333 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4334 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4335 } 4336 } 4337 4338 // Reduce all of the unrolled parts into a single vector. 4339 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4340 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4341 4342 // The middle block terminator has already been assigned a DebugLoc here (the 4343 // OrigLoop's single latch terminator). We want the whole middle block to 4344 // appear to execute on this line because: (a) it is all compiler generated, 4345 // (b) these instructions are always executed after evaluating the latch 4346 // conditional branch, and (c) other passes may add new predecessors which 4347 // terminate on this line. This is the easiest way to ensure we don't 4348 // accidentally cause an extra step back into the loop while debugging. 4349 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4350 { 4351 // Floating-point operations should have some FMF to enable the reduction. 4352 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4353 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4354 for (unsigned Part = 1; Part < UF; ++Part) { 4355 Value *RdxPart = State.get(LoopExitInstDef, Part); 4356 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4357 ReducedPartRdx = Builder.CreateBinOp( 4358 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4359 } else { 4360 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4361 } 4362 } 4363 } 4364 4365 // Create the reduction after the loop. Note that inloop reductions create the 4366 // target reduction in the loop using a Reduction recipe. 4367 if (VF.isVector() && !IsInLoopReductionPhi) { 4368 ReducedPartRdx = 4369 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4370 // If the reduction can be performed in a smaller type, we need to extend 4371 // the reduction to the wider type before we branch to the original loop. 4372 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4373 ReducedPartRdx = 4374 RdxDesc.isSigned() 4375 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4376 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4377 } 4378 4379 // Create a phi node that merges control-flow from the backedge-taken check 4380 // block and the middle block. 4381 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4382 LoopScalarPreHeader->getTerminator()); 4383 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4384 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4385 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4386 4387 // Now, we need to fix the users of the reduction variable 4388 // inside and outside of the scalar remainder loop. 4389 4390 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4391 // in the exit blocks. See comment on analogous loop in 4392 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
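// Shorthand illustration (value names are illustrative): an exit-block phi
//   %s.lcssa = phi i32 [ %s.next, %loop ]
// whose incoming value %s.next is the reduction's loop-exit instruction is
// extended by the loop below to
//   %s.lcssa = phi i32 [ %s.next, %loop ], [ %rdx, %middle.block ]
// where %rdx is the reduced value computed above.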
4393 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4394 if (any_of(LCSSAPhi.incoming_values(), 4395 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4396 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4397 4398 // Fix the scalar loop reduction variable with the incoming reduction sum 4399 // from the vector body and from the backedge value. 4400 int IncomingEdgeBlockIdx = 4401 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4402 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4403 // Pick the other block. 4404 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4405 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4406 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4407 } 4408 4409 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4410 VPTransformState &State) { 4411 RecurKind RK = RdxDesc.getRecurrenceKind(); 4412 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4413 return; 4414 4415 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4416 assert(LoopExitInstr && "null loop exit instruction"); 4417 SmallVector<Instruction *, 8> Worklist; 4418 SmallPtrSet<Instruction *, 8> Visited; 4419 Worklist.push_back(LoopExitInstr); 4420 Visited.insert(LoopExitInstr); 4421 4422 while (!Worklist.empty()) { 4423 Instruction *Cur = Worklist.pop_back_val(); 4424 if (isa<OverflowingBinaryOperator>(Cur)) 4425 for (unsigned Part = 0; Part < UF; ++Part) { 4426 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4427 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4428 } 4429 4430 for (User *U : Cur->users()) { 4431 Instruction *UI = cast<Instruction>(U); 4432 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4433 Visited.insert(UI).second) 4434 Worklist.push_back(UI); 4435 } 4436 } 4437 } 4438 4439 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4440 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4441 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4442 // Some phis were already hand updated by the reduction and recurrence 4443 // code above, leave them alone. 4444 continue; 4445 4446 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4447 // Non-instruction incoming values will have only one value. 4448 4449 VPLane Lane = VPLane::getFirstLane(); 4450 if (isa<Instruction>(IncomingValue) && 4451 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4452 VF)) 4453 Lane = VPLane::getLastLaneForVF(VF); 4454 4455 // Can be a loop invariant incoming value or the last scalar value to be 4456 // extracted from the vectorized loop. 4457 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4458 Value *lastIncomingValue = 4459 OrigLoop->isLoopInvariant(IncomingValue) 4460 ? IncomingValue 4461 : State.get(State.Plan->getVPValue(IncomingValue), 4462 VPIteration(UF - 1, Lane)); 4463 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4464 } 4465 } 4466 4467 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4468 // The basic block and loop containing the predicated instruction. 4469 auto *PredBB = PredInst->getParent(); 4470 auto *VectorLoop = LI->getLoopFor(PredBB); 4471 4472 // Initialize a worklist with the operands of the predicated instruction. 4473 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4474 4475 // Holds instructions that we need to analyze again. An instruction may be 4476 // reanalyzed if we don't yet know if we can sink it or not. 
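// Sketch of the effect (names are illustrative): given
//   %addr = getelementptr inbounds i32, i32* %base, i64 %idx
//   pred.store.if:
//     store i32 %v, i32* %addr
// the getelementptr is used only inside the predicated block, so it is sunk
// into pred.store.if; its operands are then reconsidered, and the walk
// repeats until a full pass over the worklist sinks nothing.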
4477 SmallVector<Instruction *, 8> InstsToReanalyze; 4478 4479 // Returns true if a given use occurs in the predicated block. Phi nodes use 4480 // their operands in their corresponding predecessor blocks. 4481 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4482 auto *I = cast<Instruction>(U.getUser()); 4483 BasicBlock *BB = I->getParent(); 4484 if (auto *Phi = dyn_cast<PHINode>(I)) 4485 BB = Phi->getIncomingBlock( 4486 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4487 return BB == PredBB; 4488 }; 4489 4490 // Iteratively sink the scalarized operands of the predicated instruction 4491 // into the block we created for it. When an instruction is sunk, it's 4492 // operands are then added to the worklist. The algorithm ends after one pass 4493 // through the worklist doesn't sink a single instruction. 4494 bool Changed; 4495 do { 4496 // Add the instructions that need to be reanalyzed to the worklist, and 4497 // reset the changed indicator. 4498 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4499 InstsToReanalyze.clear(); 4500 Changed = false; 4501 4502 while (!Worklist.empty()) { 4503 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4504 4505 // We can't sink an instruction if it is a phi node, is already in the 4506 // predicated block, is not in the loop, or may have side effects. 4507 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4508 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4509 continue; 4510 4511 // It's legal to sink the instruction if all its uses occur in the 4512 // predicated block. Otherwise, there's nothing to do yet, and we may 4513 // need to reanalyze the instruction. 4514 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4515 InstsToReanalyze.push_back(I); 4516 continue; 4517 } 4518 4519 // Move the instruction to the beginning of the predicated block, and add 4520 // it's operands to the worklist. 4521 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4522 Worklist.insert(I->op_begin(), I->op_end()); 4523 4524 // The sinking may have enabled other instructions to be sunk, so we will 4525 // need to iterate. 4526 Changed = true; 4527 } 4528 } while (Changed); 4529 } 4530 4531 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4532 for (PHINode *OrigPhi : OrigPHIsToFix) { 4533 VPWidenPHIRecipe *VPPhi = 4534 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4535 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4536 // Make sure the builder has a valid insert point. 4537 Builder.SetInsertPoint(NewPhi); 4538 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4539 VPValue *Inc = VPPhi->getIncomingValue(i); 4540 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4541 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4542 } 4543 } 4544 } 4545 4546 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4547 VPUser &Operands, unsigned UF, 4548 ElementCount VF, bool IsPtrLoopInvariant, 4549 SmallBitVector &IsIndexLoopInvariant, 4550 VPTransformState &State) { 4551 // Construct a vector GEP by widening the operands of the scalar GEP as 4552 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4553 // results in a vector of pointers when at least one operand of the GEP 4554 // is vector-typed. Thus, to keep the representation compact, we only use 4555 // vector-typed operands for loop-varying values. 
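// For instance (shorthand, VF = 4), a scalar GEP with a loop-invariant base
// and a loop-varying index
//   %gep = getelementptr double, double* %base, i64 %i
// is widened to a single GEP producing a vector of pointers,
//   %gep = getelementptr double, double* %base, <4 x i64> %vec.ind
// keeping the invariant %base scalar instead of splatting it.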
4556 4557 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4558 // If we are vectorizing, but the GEP has only loop-invariant operands, 4559 // the GEP we build (by only using vector-typed operands for 4560 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4561 // produce a vector of pointers, we need to either arbitrarily pick an 4562 // operand to broadcast, or broadcast a clone of the original GEP. 4563 // Here, we broadcast a clone of the original. 4564 // 4565 // TODO: If at some point we decide to scalarize instructions having 4566 // loop-invariant operands, this special case will no longer be 4567 // required. We would add the scalarization decision to 4568 // collectLoopScalars() and teach getVectorValue() to broadcast 4569 // the lane-zero scalar value. 4570 auto *Clone = Builder.Insert(GEP->clone()); 4571 for (unsigned Part = 0; Part < UF; ++Part) { 4572 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4573 State.set(VPDef, EntryPart, Part); 4574 addMetadata(EntryPart, GEP); 4575 } 4576 } else { 4577 // If the GEP has at least one loop-varying operand, we are sure to 4578 // produce a vector of pointers. But if we are only unrolling, we want 4579 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4580 // produce with the code below will be scalar (if VF == 1) or vector 4581 // (otherwise). Note that for the unroll-only case, we still maintain 4582 // values in the vector mapping with initVector, as we do for other 4583 // instructions. 4584 for (unsigned Part = 0; Part < UF; ++Part) { 4585 // The pointer operand of the new GEP. If it's loop-invariant, we 4586 // won't broadcast it. 4587 auto *Ptr = IsPtrLoopInvariant 4588 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4589 : State.get(Operands.getOperand(0), Part); 4590 4591 // Collect all the indices for the new GEP. If any index is 4592 // loop-invariant, we won't broadcast it. 4593 SmallVector<Value *, 4> Indices; 4594 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4595 VPValue *Operand = Operands.getOperand(I); 4596 if (IsIndexLoopInvariant[I - 1]) 4597 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4598 else 4599 Indices.push_back(State.get(Operand, Part)); 4600 } 4601 4602 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4603 // but it should be a vector, otherwise. 4604 auto *NewGEP = 4605 GEP->isInBounds() 4606 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4607 Indices) 4608 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4609 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4610 "NewGEP is not a pointer vector"); 4611 State.set(VPDef, NewGEP, Part); 4612 addMetadata(NewGEP, GEP); 4613 } 4614 } 4615 } 4616 4617 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4618 RecurrenceDescriptor *RdxDesc, 4619 VPValue *StartVPV, VPValue *Def, 4620 VPTransformState &State) { 4621 PHINode *P = cast<PHINode>(PN); 4622 if (EnableVPlanNativePath) { 4623 // Currently we enter here in the VPlan-native path for non-induction 4624 // PHIs where all control flow is uniform. We simply widen these PHIs. 4625 // Create a vector phi with no operands - the vector phi operands will be 4626 // set at the end of vector code generation. 4627 Type *VecTy = (State.VF.isScalar()) 4628 ? 
PN->getType() 4629 : VectorType::get(PN->getType(), State.VF); 4630 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4631 State.set(Def, VecPhi, 0); 4632 OrigPHIsToFix.push_back(P); 4633 4634 return; 4635 } 4636 4637 assert(PN->getParent() == OrigLoop->getHeader() && 4638 "Non-header phis should have been handled elsewhere"); 4639 4640 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr; 4641 // In order to support recurrences we need to be able to vectorize Phi nodes. 4642 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4643 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4644 // this value when we vectorize all of the instructions that use the PHI. 4645 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { 4646 Value *Iden = nullptr; 4647 bool ScalarPHI = 4648 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4649 Type *VecTy = 4650 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); 4651 4652 if (RdxDesc) { 4653 assert(Legal->isReductionVariable(P) && StartV && 4654 "RdxDesc should only be set for reduction variables; in that case " 4655 "a StartV is also required"); 4656 RecurKind RK = RdxDesc->getRecurrenceKind(); 4657 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 4658 // MinMax reduction have the start value as their identify. 4659 if (ScalarPHI) { 4660 Iden = StartV; 4661 } else { 4662 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4663 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4664 StartV = Iden = 4665 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); 4666 } 4667 } else { 4668 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4669 RK, VecTy->getScalarType()); 4670 Iden = IdenC; 4671 4672 if (!ScalarPHI) { 4673 Iden = ConstantVector::getSplat(State.VF, IdenC); 4674 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4675 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4676 Constant *Zero = Builder.getInt32(0); 4677 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4678 } 4679 } 4680 } 4681 4682 for (unsigned Part = 0; Part < State.UF; ++Part) { 4683 // This is phase one of vectorizing PHIs. 4684 Value *EntryPart = PHINode::Create( 4685 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4686 State.set(Def, EntryPart, Part); 4687 if (StartV) { 4688 // Make sure to add the reduction start value only to the 4689 // first unroll part. 4690 Value *StartVal = (Part == 0) ? StartV : Iden; 4691 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4692 } 4693 } 4694 return; 4695 } 4696 4697 assert(!Legal->isReductionVariable(P) && 4698 "reductions should be handled above"); 4699 4700 setDebugLocFromInst(Builder, P); 4701 4702 // This PHINode must be an induction variable. 4703 // Make sure that we know about it. 4704 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4705 4706 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4707 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4708 4709 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4710 // which can be found from the original scalar operations. 
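// Shorthand for the pointer-induction case handled below (VF = 4, UF = 1):
// if the pointer IV is scalar after vectorization, one scalar "next.gep"
// address is emitted per demanded lane from the normalized index i + lane;
// otherwise a single pointer phi is built and advanced by VF * UF * step,
// with per-part vector GEPs providing the lane offsets.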
4711 switch (II.getKind()) { 4712 case InductionDescriptor::IK_NoInduction: 4713 llvm_unreachable("Unknown induction"); 4714 case InductionDescriptor::IK_IntInduction: 4715 case InductionDescriptor::IK_FpInduction: 4716 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4717 case InductionDescriptor::IK_PtrInduction: { 4718 // Handle the pointer induction variable case. 4719 assert(P->getType()->isPointerTy() && "Unexpected type."); 4720 4721 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4722 // This is the normalized GEP that starts counting at zero. 4723 Value *PtrInd = 4724 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4725 // Determine the number of scalars we need to generate for each unroll 4726 // iteration. If the instruction is uniform, we only need to generate the 4727 // first lane. Otherwise, we generate all VF values. 4728 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF) 4729 ? 1 4730 : State.VF.getKnownMinValue(); 4731 for (unsigned Part = 0; Part < UF; ++Part) { 4732 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4733 Constant *Idx = ConstantInt::get( 4734 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue()); 4735 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4736 Value *SclrGep = 4737 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4738 SclrGep->setName("next.gep"); 4739 State.set(Def, SclrGep, VPIteration(Part, Lane)); 4740 } 4741 } 4742 return; 4743 } 4744 assert(isa<SCEVConstant>(II.getStep()) && 4745 "Induction step not a SCEV constant!"); 4746 Type *PhiType = II.getStep()->getType(); 4747 4748 // Build a pointer phi 4749 Value *ScalarStartValue = II.getStartValue(); 4750 Type *ScStValueType = ScalarStartValue->getType(); 4751 PHINode *NewPointerPhi = 4752 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4753 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4754 4755 // A pointer induction, performed by using a gep 4756 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4757 Instruction *InductionLoc = LoopLatch->getTerminator(); 4758 const SCEV *ScalarStep = II.getStep(); 4759 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4760 Value *ScalarStepValue = 4761 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4762 Value *InductionGEP = GetElementPtrInst::Create( 4763 ScStValueType->getPointerElementType(), NewPointerPhi, 4764 Builder.CreateMul( 4765 ScalarStepValue, 4766 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)), 4767 "ptr.ind", InductionLoc); 4768 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4769 4770 // Create UF many actual address geps that use the pointer 4771 // phi as base and a vectorized version of the step value 4772 // (<step*0, ..., step*N>) as offset. 4773 for (unsigned Part = 0; Part < State.UF; ++Part) { 4774 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4775 Value *StartOffset = 4776 ConstantInt::get(VecPhiType, Part * State.VF.getKnownMinValue()); 4777 // Create a vector of consecutive numbers from zero to VF. 
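// E.g. with VF = 4 and UF = 2 (numbers for illustration only), Part 1
// starts from StartOffset = <4, 4, 4, 4>; adding the step vector
// <0, 1, 2, 3> gives <4, 5, 6, 7>, which is then multiplied by the splatted
// scalar step to form that part's lane offsets from the pointer phi.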
4778 StartOffset = 4779 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4780 4781 Value *GEP = Builder.CreateGEP( 4782 ScStValueType->getPointerElementType(), NewPointerPhi, 4783 Builder.CreateMul(StartOffset, 4784 Builder.CreateVectorSplat( 4785 State.VF.getKnownMinValue(), ScalarStepValue), 4786 "vector.gep")); 4787 State.set(Def, GEP, Part); 4788 } 4789 } 4790 } 4791 } 4792 4793 /// A helper function for checking whether an integer division-related 4794 /// instruction may divide by zero (in which case it must be predicated if 4795 /// executed conditionally in the scalar code). 4796 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4797 /// Non-zero divisors that are non compile-time constants will not be 4798 /// converted into multiplication, so we will still end up scalarizing 4799 /// the division, but can do so w/o predication. 4800 static bool mayDivideByZero(Instruction &I) { 4801 assert((I.getOpcode() == Instruction::UDiv || 4802 I.getOpcode() == Instruction::SDiv || 4803 I.getOpcode() == Instruction::URem || 4804 I.getOpcode() == Instruction::SRem) && 4805 "Unexpected instruction"); 4806 Value *Divisor = I.getOperand(1); 4807 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4808 return !CInt || CInt->isZero(); 4809 } 4810 4811 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4812 VPUser &User, 4813 VPTransformState &State) { 4814 switch (I.getOpcode()) { 4815 case Instruction::Call: 4816 case Instruction::Br: 4817 case Instruction::PHI: 4818 case Instruction::GetElementPtr: 4819 case Instruction::Select: 4820 llvm_unreachable("This instruction is handled by a different recipe."); 4821 case Instruction::UDiv: 4822 case Instruction::SDiv: 4823 case Instruction::SRem: 4824 case Instruction::URem: 4825 case Instruction::Add: 4826 case Instruction::FAdd: 4827 case Instruction::Sub: 4828 case Instruction::FSub: 4829 case Instruction::FNeg: 4830 case Instruction::Mul: 4831 case Instruction::FMul: 4832 case Instruction::FDiv: 4833 case Instruction::FRem: 4834 case Instruction::Shl: 4835 case Instruction::LShr: 4836 case Instruction::AShr: 4837 case Instruction::And: 4838 case Instruction::Or: 4839 case Instruction::Xor: { 4840 // Just widen unops and binops. 4841 setDebugLocFromInst(Builder, &I); 4842 4843 for (unsigned Part = 0; Part < UF; ++Part) { 4844 SmallVector<Value *, 2> Ops; 4845 for (VPValue *VPOp : User.operands()) 4846 Ops.push_back(State.get(VPOp, Part)); 4847 4848 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4849 4850 if (auto *VecOp = dyn_cast<Instruction>(V)) 4851 VecOp->copyIRFlags(&I); 4852 4853 // Use this vector value for all users of the original instruction. 4854 State.set(Def, V, Part); 4855 addMetadata(V, &I); 4856 } 4857 4858 break; 4859 } 4860 case Instruction::ICmp: 4861 case Instruction::FCmp: { 4862 // Widen compares. Generate vector compares. 4863 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4864 auto *Cmp = cast<CmpInst>(&I); 4865 setDebugLocFromInst(Builder, Cmp); 4866 for (unsigned Part = 0; Part < UF; ++Part) { 4867 Value *A = State.get(User.getOperand(0), Part); 4868 Value *B = State.get(User.getOperand(1), Part); 4869 Value *C = nullptr; 4870 if (FCmp) { 4871 // Propagate fast math flags. 
4872 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4873 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4874 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4875 } else { 4876 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4877 } 4878 State.set(Def, C, Part); 4879 addMetadata(C, &I); 4880 } 4881 4882 break; 4883 } 4884 4885 case Instruction::ZExt: 4886 case Instruction::SExt: 4887 case Instruction::FPToUI: 4888 case Instruction::FPToSI: 4889 case Instruction::FPExt: 4890 case Instruction::PtrToInt: 4891 case Instruction::IntToPtr: 4892 case Instruction::SIToFP: 4893 case Instruction::UIToFP: 4894 case Instruction::Trunc: 4895 case Instruction::FPTrunc: 4896 case Instruction::BitCast: { 4897 auto *CI = cast<CastInst>(&I); 4898 setDebugLocFromInst(Builder, CI); 4899 4900 /// Vectorize casts. 4901 Type *DestTy = 4902 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4903 4904 for (unsigned Part = 0; Part < UF; ++Part) { 4905 Value *A = State.get(User.getOperand(0), Part); 4906 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4907 State.set(Def, Cast, Part); 4908 addMetadata(Cast, &I); 4909 } 4910 break; 4911 } 4912 default: 4913 // This instruction is not vectorized by simple widening. 4914 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4915 llvm_unreachable("Unhandled instruction!"); 4916 } // end of switch. 4917 } 4918 4919 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4920 VPUser &ArgOperands, 4921 VPTransformState &State) { 4922 assert(!isa<DbgInfoIntrinsic>(I) && 4923 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4924 setDebugLocFromInst(Builder, &I); 4925 4926 Module *M = I.getParent()->getParent()->getParent(); 4927 auto *CI = cast<CallInst>(&I); 4928 4929 SmallVector<Type *, 4> Tys; 4930 for (Value *ArgOperand : CI->arg_operands()) 4931 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4932 4933 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4934 4935 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4936 // version of the instruction. 4937 // Is it beneficial to perform intrinsic call compared to lib call? 4938 bool NeedToScalarize = false; 4939 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4940 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4941 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4942 assert((UseVectorIntrinsic || !NeedToScalarize) && 4943 "Instruction should be scalarized elsewhere."); 4944 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4945 "Cannot have invalid costs while widening"); 4946 4947 for (unsigned Part = 0; Part < UF; ++Part) { 4948 SmallVector<Value *, 4> Args; 4949 for (auto &I : enumerate(ArgOperands.operands())) { 4950 // Some intrinsics have a scalar argument - don't replace it with a 4951 // vector. 4952 Value *Arg; 4953 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4954 Arg = State.get(I.value(), Part); 4955 else 4956 Arg = State.get(I.value(), VPIteration(0, 0)); 4957 Args.push_back(Arg); 4958 } 4959 4960 Function *VectorF; 4961 if (UseVectorIntrinsic) { 4962 // Use vector version of the intrinsic. 
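// For example (VF = 4, purely illustrative): a scalar call
//   %r = call double @llvm.sqrt.f64(double %x)
// is widened by re-declaring the intrinsic at the vector type,
//   %r = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %x.vec)
// while the else-branch below instead looks up a vectorized library
// function for the call through the VFDatabase.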
4963 Type *TysForDecl[] = {CI->getType()}; 4964 if (VF.isVector()) 4965 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4966 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4967 assert(VectorF && "Can't retrieve vector intrinsic."); 4968 } else { 4969 // Use vector version of the function call. 4970 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4971 #ifndef NDEBUG 4972 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4973 "Can't create vector function."); 4974 #endif 4975 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4976 } 4977 SmallVector<OperandBundleDef, 1> OpBundles; 4978 CI->getOperandBundlesAsDefs(OpBundles); 4979 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4980 4981 if (isa<FPMathOperator>(V)) 4982 V->copyFastMathFlags(CI); 4983 4984 State.set(Def, V, Part); 4985 addMetadata(V, &I); 4986 } 4987 } 4988 4989 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4990 VPUser &Operands, 4991 bool InvariantCond, 4992 VPTransformState &State) { 4993 setDebugLocFromInst(Builder, &I); 4994 4995 // The condition can be loop invariant but still defined inside the 4996 // loop. This means that we can't just use the original 'cond' value. 4997 // We have to take the 'vectorized' value and pick the first lane. 4998 // Instcombine will make this a no-op. 4999 auto *InvarCond = InvariantCond 5000 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5001 : nullptr; 5002 5003 for (unsigned Part = 0; Part < UF; ++Part) { 5004 Value *Cond = 5005 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5006 Value *Op0 = State.get(Operands.getOperand(1), Part); 5007 Value *Op1 = State.get(Operands.getOperand(2), Part); 5008 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5009 State.set(VPDef, Sel, Part); 5010 addMetadata(Sel, &I); 5011 } 5012 } 5013 5014 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5015 // We should not collect Scalars more than once per VF. Right now, this 5016 // function is called from collectUniformsAndScalars(), which already does 5017 // this check. Collecting Scalars for VF=1 does not make any sense. 5018 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5019 "This function should not be visited twice for the same VF"); 5020 5021 SmallSetVector<Instruction *, 8> Worklist; 5022 5023 // These sets are used to seed the analysis with pointers used by memory 5024 // accesses that will remain scalar. 5025 SmallSetVector<Instruction *, 8> ScalarPtrs; 5026 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5027 auto *Latch = TheLoop->getLoopLatch(); 5028 5029 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5030 // The pointer operands of loads and stores will be scalar as long as the 5031 // memory access is not a gather or scatter operation. The value operand of a 5032 // store will remain scalar if the store is scalarized. 
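// For example, in
//   store i32 %val, i32* %ptr
// %ptr stays scalar when the store is widened to a consecutive vector
// store, but not when the store becomes a scatter; %val, in turn, remains
// scalar only if the store itself is scalarized.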
5033 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5034 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5035 assert(WideningDecision != CM_Unknown && 5036 "Widening decision should be ready at this moment"); 5037 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5038 if (Ptr == Store->getValueOperand()) 5039 return WideningDecision == CM_Scalarize; 5040 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5041 "Ptr is neither a value or pointer operand"); 5042 return WideningDecision != CM_GatherScatter; 5043 }; 5044 5045 // A helper that returns true if the given value is a bitcast or 5046 // getelementptr instruction contained in the loop. 5047 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5048 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5049 isa<GetElementPtrInst>(V)) && 5050 !TheLoop->isLoopInvariant(V); 5051 }; 5052 5053 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5054 if (!isa<PHINode>(Ptr) || 5055 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5056 return false; 5057 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5058 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5059 return false; 5060 return isScalarUse(MemAccess, Ptr); 5061 }; 5062 5063 // A helper that evaluates a memory access's use of a pointer. If the 5064 // pointer is actually the pointer induction of a loop, it is being 5065 // inserted into Worklist. If the use will be a scalar use, and the 5066 // pointer is only used by memory accesses, we place the pointer in 5067 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5068 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5069 if (isScalarPtrInduction(MemAccess, Ptr)) { 5070 Worklist.insert(cast<Instruction>(Ptr)); 5071 Instruction *Update = cast<Instruction>( 5072 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5073 Worklist.insert(Update); 5074 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5075 << "\n"); 5076 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5077 << "\n"); 5078 return; 5079 } 5080 // We only care about bitcast and getelementptr instructions contained in 5081 // the loop. 5082 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5083 return; 5084 5085 // If the pointer has already been identified as scalar (e.g., if it was 5086 // also identified as uniform), there's nothing to do. 5087 auto *I = cast<Instruction>(Ptr); 5088 if (Worklist.count(I)) 5089 return; 5090 5091 // If the use of the pointer will be a scalar use, and all users of the 5092 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5093 // place the pointer in PossibleNonScalarPtrs. 5094 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5095 return isa<LoadInst>(U) || isa<StoreInst>(U); 5096 })) 5097 ScalarPtrs.insert(I); 5098 else 5099 PossibleNonScalarPtrs.insert(I); 5100 }; 5101 5102 // We seed the scalars analysis with three classes of instructions: (1) 5103 // instructions marked uniform-after-vectorization and (2) bitcast, 5104 // getelementptr and (pointer) phi instructions used by memory accesses 5105 // requiring a scalar use. 5106 // 5107 // (1) Add to the worklist all instructions that have been identified as 5108 // uniform-after-vectorization. 
5109 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5110 5111 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5112 // memory accesses requiring a scalar use. The pointer operands of loads and 5113 // stores will be scalar as long as the memory accesses is not a gather or 5114 // scatter operation. The value operand of a store will remain scalar if the 5115 // store is scalarized. 5116 for (auto *BB : TheLoop->blocks()) 5117 for (auto &I : *BB) { 5118 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5119 evaluatePtrUse(Load, Load->getPointerOperand()); 5120 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5121 evaluatePtrUse(Store, Store->getPointerOperand()); 5122 evaluatePtrUse(Store, Store->getValueOperand()); 5123 } 5124 } 5125 for (auto *I : ScalarPtrs) 5126 if (!PossibleNonScalarPtrs.count(I)) { 5127 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5128 Worklist.insert(I); 5129 } 5130 5131 // Insert the forced scalars. 5132 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5133 // induction variable when the PHI user is scalarized. 5134 auto ForcedScalar = ForcedScalars.find(VF); 5135 if (ForcedScalar != ForcedScalars.end()) 5136 for (auto *I : ForcedScalar->second) 5137 Worklist.insert(I); 5138 5139 // Expand the worklist by looking through any bitcasts and getelementptr 5140 // instructions we've already identified as scalar. This is similar to the 5141 // expansion step in collectLoopUniforms(); however, here we're only 5142 // expanding to include additional bitcasts and getelementptr instructions. 5143 unsigned Idx = 0; 5144 while (Idx != Worklist.size()) { 5145 Instruction *Dst = Worklist[Idx++]; 5146 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5147 continue; 5148 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5149 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5150 auto *J = cast<Instruction>(U); 5151 return !TheLoop->contains(J) || Worklist.count(J) || 5152 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5153 isScalarUse(J, Src)); 5154 })) { 5155 Worklist.insert(Src); 5156 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5157 } 5158 } 5159 5160 // An induction variable will remain scalar if all users of the induction 5161 // variable and induction variable update remain scalar. 5162 for (auto &Induction : Legal->getInductionVars()) { 5163 auto *Ind = Induction.first; 5164 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5165 5166 // If tail-folding is applied, the primary induction variable will be used 5167 // to feed a vector compare. 5168 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5169 continue; 5170 5171 // Determine if all users of the induction variable are scalar after 5172 // vectorization. 5173 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5174 auto *I = cast<Instruction>(U); 5175 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5176 }); 5177 if (!ScalarInd) 5178 continue; 5179 5180 // Determine if all users of the induction variable update instruction are 5181 // scalar after vectorization. 5182 auto ScalarIndUpdate = 5183 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5184 auto *I = cast<Instruction>(U); 5185 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5186 }); 5187 if (!ScalarIndUpdate) 5188 continue; 5189 5190 // The induction variable and its update instruction will remain scalar. 
5191 Worklist.insert(Ind); 5192 Worklist.insert(IndUpdate); 5193 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5194 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5195 << "\n"); 5196 } 5197 5198 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5199 } 5200 5201 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5202 ElementCount VF) { 5203 if (!blockNeedsPredication(I->getParent())) 5204 return false; 5205 switch(I->getOpcode()) { 5206 default: 5207 break; 5208 case Instruction::Load: 5209 case Instruction::Store: { 5210 if (!Legal->isMaskRequired(I)) 5211 return false; 5212 auto *Ptr = getLoadStorePointerOperand(I); 5213 auto *Ty = getMemInstValueType(I); 5214 // We have already decided how to vectorize this instruction, get that 5215 // result. 5216 if (VF.isVector()) { 5217 InstWidening WideningDecision = getWideningDecision(I, VF); 5218 assert(WideningDecision != CM_Unknown && 5219 "Widening decision should be ready at this moment"); 5220 return WideningDecision == CM_Scalarize; 5221 } 5222 const Align Alignment = getLoadStoreAlignment(I); 5223 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5224 isLegalMaskedGather(Ty, Alignment)) 5225 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5226 isLegalMaskedScatter(Ty, Alignment)); 5227 } 5228 case Instruction::UDiv: 5229 case Instruction::SDiv: 5230 case Instruction::SRem: 5231 case Instruction::URem: 5232 return mayDivideByZero(*I); 5233 } 5234 return false; 5235 } 5236 5237 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5238 Instruction *I, ElementCount VF) { 5239 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5240 assert(getWideningDecision(I, VF) == CM_Unknown && 5241 "Decision should not be set yet."); 5242 auto *Group = getInterleavedAccessGroup(I); 5243 assert(Group && "Must have a group."); 5244 5245 // If the instruction's allocated size doesn't equal it's type size, it 5246 // requires padding and will be scalarized. 5247 auto &DL = I->getModule()->getDataLayout(); 5248 auto *ScalarTy = getMemInstValueType(I); 5249 if (hasIrregularType(ScalarTy, DL)) 5250 return false; 5251 5252 // Check if masking is required. 5253 // A Group may need masking for one of two reasons: it resides in a block that 5254 // needs predication, or it was decided to use masking to deal with gaps. 5255 bool PredicatedAccessRequiresMasking = 5256 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5257 bool AccessWithGapsRequiresMasking = 5258 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5259 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5260 return true; 5261 5262 // If masked interleaving is required, we expect that the user/target had 5263 // enabled it, because otherwise it either wouldn't have been created or 5264 // it should have been invalidated by the CostModel. 5265 assert(useMaskedInterleavedAccesses(TTI) && 5266 "Masked interleave-groups for predicated accesses are not enabled."); 5267 5268 auto *Ty = getMemInstValueType(I); 5269 const Align Alignment = getLoadStoreAlignment(I); 5270 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5271 : TTI.isLegalMaskedStore(Ty, Alignment); 5272 } 5273 5274 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5275 Instruction *I, ElementCount VF) { 5276 // Get and ensure we have a valid memory instruction. 
5277 LoadInst *LI = dyn_cast<LoadInst>(I); 5278 StoreInst *SI = dyn_cast<StoreInst>(I); 5279 assert((LI || SI) && "Invalid memory instruction"); 5280 5281 auto *Ptr = getLoadStorePointerOperand(I); 5282 5283 // In order to be widened, the pointer should be consecutive, first of all. 5284 if (!Legal->isConsecutivePtr(Ptr)) 5285 return false; 5286 5287 // If the instruction is a store located in a predicated block, it will be 5288 // scalarized. 5289 if (isScalarWithPredication(I)) 5290 return false; 5291 5292 // If the instruction's allocated size doesn't equal it's type size, it 5293 // requires padding and will be scalarized. 5294 auto &DL = I->getModule()->getDataLayout(); 5295 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5296 if (hasIrregularType(ScalarTy, DL)) 5297 return false; 5298 5299 return true; 5300 } 5301 5302 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5303 // We should not collect Uniforms more than once per VF. Right now, 5304 // this function is called from collectUniformsAndScalars(), which 5305 // already does this check. Collecting Uniforms for VF=1 does not make any 5306 // sense. 5307 5308 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5309 "This function should not be visited twice for the same VF"); 5310 5311 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5312 // not analyze again. Uniforms.count(VF) will return 1. 5313 Uniforms[VF].clear(); 5314 5315 // We now know that the loop is vectorizable! 5316 // Collect instructions inside the loop that will remain uniform after 5317 // vectorization. 5318 5319 // Global values, params and instructions outside of current loop are out of 5320 // scope. 5321 auto isOutOfScope = [&](Value *V) -> bool { 5322 Instruction *I = dyn_cast<Instruction>(V); 5323 return (!I || !TheLoop->contains(I)); 5324 }; 5325 5326 SetVector<Instruction *> Worklist; 5327 BasicBlock *Latch = TheLoop->getLoopLatch(); 5328 5329 // Instructions that are scalar with predication must not be considered 5330 // uniform after vectorization, because that would create an erroneous 5331 // replicating region where only a single instance out of VF should be formed. 5332 // TODO: optimize such seldom cases if found important, see PR40816. 5333 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5334 if (isOutOfScope(I)) { 5335 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5336 << *I << "\n"); 5337 return; 5338 } 5339 if (isScalarWithPredication(I, VF)) { 5340 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5341 << *I << "\n"); 5342 return; 5343 } 5344 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5345 Worklist.insert(I); 5346 }; 5347 5348 // Start with the conditional branch. If the branch condition is an 5349 // instruction contained in the loop that is only used by the branch, it is 5350 // uniform. 5351 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5352 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5353 addToWorklistIfAllowed(Cmp); 5354 5355 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5356 InstWidening WideningDecision = getWideningDecision(I, VF); 5357 assert(WideningDecision != CM_Unknown && 5358 "Widening decision should be ready at this moment"); 5359 5360 // A uniform memory op is itself uniform. We exclude uniform stores 5361 // here as they demand the last lane, not the first one. 
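// For example, a load from a loop-invariant address such as
//   %g = load i32, i32* @gvar
// reads the same location in every lane, so lane 0 suffices and the load is
// uniform; a store to such an address is excluded because the value that
// sticks is the one written by the last lane of the last part.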
5362 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5363 assert(WideningDecision == CM_Scalarize); 5364 return true; 5365 } 5366 5367 return (WideningDecision == CM_Widen || 5368 WideningDecision == CM_Widen_Reverse || 5369 WideningDecision == CM_Interleave); 5370 }; 5371 5372 5373 // Returns true if Ptr is the pointer operand of a memory access instruction 5374 // I, and I is known to not require scalarization. 5375 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5376 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5377 }; 5378 5379 // Holds a list of values which are known to have at least one uniform use. 5380 // Note that there may be other uses which aren't uniform. A "uniform use" 5381 // here is something which only demands lane 0 of the unrolled iterations; 5382 // it does not imply that all lanes produce the same value (e.g. this is not 5383 // the usual meaning of uniform) 5384 SmallPtrSet<Value *, 8> HasUniformUse; 5385 5386 // Scan the loop for instructions which are either a) known to have only 5387 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5388 for (auto *BB : TheLoop->blocks()) 5389 for (auto &I : *BB) { 5390 // If there's no pointer operand, there's nothing to do. 5391 auto *Ptr = getLoadStorePointerOperand(&I); 5392 if (!Ptr) 5393 continue; 5394 5395 // A uniform memory op is itself uniform. We exclude uniform stores 5396 // here as they demand the last lane, not the first one. 5397 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5398 addToWorklistIfAllowed(&I); 5399 5400 if (isUniformDecision(&I, VF)) { 5401 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5402 HasUniformUse.insert(Ptr); 5403 } 5404 } 5405 5406 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5407 // demanding) users. Since loops are assumed to be in LCSSA form, this 5408 // disallows uses outside the loop as well. 5409 for (auto *V : HasUniformUse) { 5410 if (isOutOfScope(V)) 5411 continue; 5412 auto *I = cast<Instruction>(V); 5413 auto UsersAreMemAccesses = 5414 llvm::all_of(I->users(), [&](User *U) -> bool { 5415 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5416 }); 5417 if (UsersAreMemAccesses) 5418 addToWorklistIfAllowed(I); 5419 } 5420 5421 // Expand Worklist in topological order: whenever a new instruction 5422 // is added , its users should be already inside Worklist. It ensures 5423 // a uniform instruction will only be used by uniform instructions. 5424 unsigned idx = 0; 5425 while (idx != Worklist.size()) { 5426 Instruction *I = Worklist[idx++]; 5427 5428 for (auto OV : I->operand_values()) { 5429 // isOutOfScope operands cannot be uniform instructions. 5430 if (isOutOfScope(OV)) 5431 continue; 5432 // First order recurrence Phi's should typically be considered 5433 // non-uniform. 5434 auto *OP = dyn_cast<PHINode>(OV); 5435 if (OP && Legal->isFirstOrderRecurrence(OP)) 5436 continue; 5437 // If all the users of the operand are uniform, then add the 5438 // operand into the uniform worklist. 5439 auto *OI = cast<Instruction>(OV); 5440 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5441 auto *J = cast<Instruction>(U); 5442 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5443 })) 5444 addToWorklistIfAllowed(OI); 5445 } 5446 } 5447 5448 // For an instruction to be added into Worklist above, all its users inside 5449 // the loop should also be in Worklist. 
However, this condition cannot be 5450 // true for phi nodes that form a cyclic dependence. We must process phi 5451 // nodes separately. An induction variable will remain uniform if all users 5452 // of the induction variable and induction variable update remain uniform. 5453 // The code below handles both pointer and non-pointer induction variables. 5454 for (auto &Induction : Legal->getInductionVars()) { 5455 auto *Ind = Induction.first; 5456 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5457 5458 // Determine if all users of the induction variable are uniform after 5459 // vectorization. 5460 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5461 auto *I = cast<Instruction>(U); 5462 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5463 isVectorizedMemAccessUse(I, Ind); 5464 }); 5465 if (!UniformInd) 5466 continue; 5467 5468 // Determine if all users of the induction variable update instruction are 5469 // uniform after vectorization. 5470 auto UniformIndUpdate = 5471 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5472 auto *I = cast<Instruction>(U); 5473 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5474 isVectorizedMemAccessUse(I, IndUpdate); 5475 }); 5476 if (!UniformIndUpdate) 5477 continue; 5478 5479 // The induction variable and its update instruction will remain uniform. 5480 addToWorklistIfAllowed(Ind); 5481 addToWorklistIfAllowed(IndUpdate); 5482 } 5483 5484 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5485 } 5486 5487 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5488 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5489 5490 if (Legal->getRuntimePointerChecking()->Need) { 5491 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5492 "runtime pointer checks needed. Enable vectorization of this " 5493 "loop with '#pragma clang loop vectorize(enable)' when " 5494 "compiling with -Os/-Oz", 5495 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5496 return true; 5497 } 5498 5499 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5500 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5501 "runtime SCEV checks needed. Enable vectorization of this " 5502 "loop with '#pragma clang loop vectorize(enable)' when " 5503 "compiling with -Os/-Oz", 5504 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5505 return true; 5506 } 5507 5508 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5509 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5510 reportVectorizationFailure("Runtime stride check for small trip count", 5511 "runtime stride == 1 checks needed. Enable vectorization of " 5512 "this loop without such check by compiling with -Os/-Oz", 5513 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5514 return true; 5515 } 5516 5517 return false; 5518 } 5519 5520 Optional<ElementCount> 5521 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5522 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5523 // TODO: It may by useful to do since it's still likely to be dynamically 5524 // uniform if the target can skip. 5525 reportVectorizationFailure( 5526 "Not inserting runtime ptr check for divergent target", 5527 "runtime pointer checks needed. 
Not enabled for divergent target", 5528 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5529 return None; 5530 } 5531 5532 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5533 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5534 if (TC == 1) { 5535 reportVectorizationFailure("Single iteration (non) loop", 5536 "loop trip count is one, irrelevant for vectorization", 5537 "SingleIterationLoop", ORE, TheLoop); 5538 return None; 5539 } 5540 5541 switch (ScalarEpilogueStatus) { 5542 case CM_ScalarEpilogueAllowed: 5543 return computeFeasibleMaxVF(TC, UserVF); 5544 case CM_ScalarEpilogueNotAllowedUsePredicate: 5545 LLVM_FALLTHROUGH; 5546 case CM_ScalarEpilogueNotNeededUsePredicate: 5547 LLVM_DEBUG( 5548 dbgs() << "LV: vector predicate hint/switch found.\n" 5549 << "LV: Not allowing scalar epilogue, creating predicated " 5550 << "vector loop.\n"); 5551 break; 5552 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5553 // fallthrough as a special case of OptForSize 5554 case CM_ScalarEpilogueNotAllowedOptSize: 5555 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5556 LLVM_DEBUG( 5557 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5558 else 5559 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5560 << "count.\n"); 5561 5562 // Bail if runtime checks are required, which are not good when optimising 5563 // for size. 5564 if (runtimeChecksRequired()) 5565 return None; 5566 5567 break; 5568 } 5569 5570 // The only loops we can vectorize without a scalar epilogue are loops with 5571 // a bottom-test and a single exiting block. We'd have to handle the fact 5572 // that not every instruction executes on the last iteration. This will 5573 // require a lane mask which varies through the vector loop body. (TODO) 5574 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5575 // If there was a tail-folding hint/switch, but we can't fold the tail by 5576 // masking, fall back to a vectorization with a scalar epilogue. 5577 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5578 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5579 "scalar epilogue instead.\n"); 5580 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5581 return computeFeasibleMaxVF(TC, UserVF); 5582 } 5583 return None; 5584 } 5585 5586 // Now try the tail folding. 5587 5588 // Invalidate interleave groups that require an epilogue if we can't mask 5589 // the interleave-group. 5590 if (!useMaskedInterleavedAccesses(TTI)) { 5591 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5592 "No decisions should have been taken at this point"); 5593 // Note: There is no need to invalidate any cost modeling decisions here, as 5594 // none were taken so far. 5595 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5596 } 5597 5598 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); 5599 assert(!MaxVF.isScalable() && 5600 "Scalable vectors do not yet support tail folding"); 5601 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && 5602 "MaxVF must be a power of 2"); 5603 unsigned MaxVFtimesIC = 5604 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); 5605 // Avoid tail folding if the trip count is known to be a multiple of any VF we 5606 // choose.
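// For example (with assumed numbers): given a known trip count of 64,
// MaxVF = 8 and UserIC = 2, MaxVFtimesIC is 16 and 64 % 16 == 0, so no
// tail remains and neither tail folding nor a scalar epilogue is needed.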
5607 ScalarEvolution *SE = PSE.getSE(); 5608 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5609 const SCEV *ExitCount = SE->getAddExpr( 5610 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5611 const SCEV *Rem = SE->getURemExpr( 5612 SE->applyLoopGuards(ExitCount, TheLoop), 5613 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5614 if (Rem->isZero()) { 5615 // Accept MaxVF if we do not have a tail. 5616 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5617 return MaxVF; 5618 } 5619 5620 // If we don't know the precise trip count, or if the trip count that we 5621 // found modulo the vectorization factor is not zero, try to fold the tail 5622 // by masking. 5623 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5624 if (Legal->prepareToFoldTailByMasking()) { 5625 FoldTailByMasking = true; 5626 return MaxVF; 5627 } 5628 5629 // If there was a tail-folding hint/switch, but we can't fold the tail by 5630 // masking, fallback to a vectorization with a scalar epilogue. 5631 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5632 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5633 "scalar epilogue instead.\n"); 5634 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5635 return MaxVF; 5636 } 5637 5638 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5639 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5640 return None; 5641 } 5642 5643 if (TC == 0) { 5644 reportVectorizationFailure( 5645 "Unable to calculate the loop count due to complex control flow", 5646 "unable to calculate the loop count due to complex control flow", 5647 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5648 return None; 5649 } 5650 5651 reportVectorizationFailure( 5652 "Cannot optimize for size and vectorize at the same time.", 5653 "cannot optimize for size and vectorize at the same time. " 5654 "Enable vectorization of this loop with '#pragma clang loop " 5655 "vectorize(enable)' when compiling with -Os/-Oz", 5656 "NoTailLoopWithOptForSize", ORE, TheLoop); 5657 return None; 5658 } 5659 5660 ElementCount 5661 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5662 ElementCount UserVF) { 5663 bool IgnoreScalableUserVF = UserVF.isScalable() && 5664 !TTI.supportsScalableVectors() && 5665 !ForceTargetSupportsScalableVectors; 5666 if (IgnoreScalableUserVF) { 5667 LLVM_DEBUG( 5668 dbgs() << "LV: Ignoring VF=" << UserVF 5669 << " because target does not support scalable vectors.\n"); 5670 ORE->emit([&]() { 5671 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5672 TheLoop->getStartLoc(), 5673 TheLoop->getHeader()) 5674 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5675 << " because target does not support scalable vectors."; 5676 }); 5677 } 5678 5679 // Beyond this point two scenarios are handled. If UserVF isn't specified 5680 // then a suitable VF is chosen. If UserVF is specified and there are 5681 // dependencies, check if it's legal. However, if a UserVF is specified and 5682 // there are no dependencies, then there's nothing to do. 5683 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5684 if (!canVectorizeReductions(UserVF)) { 5685 reportVectorizationFailure( 5686 "LV: Scalable vectorization not supported for the reduction " 5687 "operations found in this loop. 
Using fixed-width " 5688 "vectorization instead.", 5689 "Scalable vectorization not supported for the reduction operations " 5690 "found in this loop. Using fixed-width vectorization instead.", 5691 "ScalableVFUnfeasible", ORE, TheLoop); 5692 return computeFeasibleMaxVF( 5693 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5694 } 5695 5696 if (Legal->isSafeForAnyVectorWidth()) 5697 return UserVF; 5698 } 5699 5700 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5701 unsigned SmallestType, WidestType; 5702 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5703 unsigned WidestRegister = 5704 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 5705 .getFixedSize(); 5706 5707 // Get the maximum safe dependence distance in bits computed by LAA. 5708 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5709 // the memory accesses that is most restrictive (involved in the smallest 5710 // dependence distance). 5711 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5712 5713 // If the user vectorization factor is legally unsafe, clamp it to a safe 5714 // value. Otherwise, return as is. 5715 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5716 unsigned MaxSafeElements = 5717 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5718 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5719 5720 if (UserVF.isScalable()) { 5721 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5722 5723 // Scale VF by vscale before checking if it's safe. 5724 MaxSafeVF = ElementCount::getScalable( 5725 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5726 5727 if (MaxSafeVF.isZero()) { 5728 // The dependence distance is too small to use scalable vectors, 5729 // fallback on fixed. 5730 LLVM_DEBUG( 5731 dbgs() 5732 << "LV: Max legal vector width too small, scalable vectorization " 5733 "unfeasible. Using fixed-width vectorization instead.\n"); 5734 ORE->emit([&]() { 5735 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5736 TheLoop->getStartLoc(), 5737 TheLoop->getHeader()) 5738 << "Max legal vector width too small, scalable vectorization " 5739 << "unfeasible. Using fixed-width vectorization instead."; 5740 }); 5741 return computeFeasibleMaxVF( 5742 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5743 } 5744 } 5745 5746 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5747 5748 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5749 return UserVF; 5750 5751 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5752 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5753 << ".\n"); 5754 ORE->emit([&]() { 5755 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5756 TheLoop->getStartLoc(), 5757 TheLoop->getHeader()) 5758 << "User-specified vectorization factor " 5759 << ore::NV("UserVectorizationFactor", UserVF) 5760 << " is unsafe, clamping to maximum safe vectorization factor " 5761 << ore::NV("VectorizationFactor", MaxSafeVF); 5762 }); 5763 return MaxSafeVF; 5764 } 5765 5766 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5767 5768 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5769 // Note that both WidestRegister and WidestType may not be a powers of 2. 
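// E.g. (hypothetical numbers): with a 256-bit widest register and a
// 64-bit widest element type, PowerOf2Floor(256 / 64) == 4, so the
// default MaxVectorSize below is 4 elements.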
5770 auto MaxVectorSize = 5771 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); 5772 5773 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5774 << " / " << WidestType << " bits.\n"); 5775 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5776 << WidestRegister << " bits.\n"); 5777 5778 assert(MaxVectorSize.getFixedValue() <= WidestRegister && 5779 "Did not expect to pack so many elements" 5780 " into one vector!"); 5781 if (MaxVectorSize.getFixedValue() == 0) { 5782 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5783 return ElementCount::getFixed(1); 5784 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && 5785 isPowerOf2_32(ConstTripCount)) { 5786 // We need to clamp the VF to be the ConstTripCount. There is no point in 5787 // choosing a higher viable VF as done in the loop below. 5788 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5789 << ConstTripCount << "\n"); 5790 return ElementCount::getFixed(ConstTripCount); 5791 } 5792 5793 ElementCount MaxVF = MaxVectorSize; 5794 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5795 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5796 // Collect all viable vectorization factors larger than the default MaxVF 5797 // (i.e. MaxVectorSize). 5798 SmallVector<ElementCount, 8> VFs; 5799 auto MaxVectorSizeMaxBW = 5800 ElementCount::getFixed(WidestRegister / SmallestType); 5801 for (ElementCount VS = MaxVectorSize * 2; 5802 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) 5803 VFs.push_back(VS); 5804 5805 // For each VF calculate its register usage. 5806 auto RUs = calculateRegisterUsage(VFs); 5807 5808 // Select the largest VF which doesn't require more registers than existing 5809 // ones. 5810 for (int i = RUs.size() - 1; i >= 0; --i) { 5811 bool Selected = true; 5812 for (auto &pair : RUs[i].MaxLocalUsers) { 5813 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5814 if (pair.second > TargetNumRegisters) 5815 Selected = false; 5816 } 5817 if (Selected) { 5818 MaxVF = VFs[i]; 5819 break; 5820 } 5821 } 5822 if (ElementCount MinVF = 5823 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { 5824 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5825 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5826 << ") with target's minimum: " << MinVF << '\n'); 5827 MaxVF = MinVF; 5828 } 5829 } 5830 } 5831 return MaxVF; 5832 } 5833 5834 VectorizationFactor 5835 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5836 // FIXME: This can be fixed for scalable vectors later, because at this stage 5837 // the LoopVectorizer will only consider vectorizing a loop with scalable 5838 // vectors when the loop has a hint to enable vectorization for a given VF. 5839 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5840 5841 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5842 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5843 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5844 5845 auto Width = ElementCount::getFixed(1); 5846 const float ScalarCost = *ExpectedCost.getValue(); 5847 float Cost = ScalarCost; 5848 5849 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5850 if (ForceVectorization && MaxVF.isVector()) { 5851 // Ignore scalar width, because the user explicitly wants vectorization. 
5852 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5853 // evaluation. 5854 Cost = std::numeric_limits<float>::max(); 5855 } 5856 5857 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 5858 i *= 2) { 5859 // Notice that the vector loop needs to be executed less times, so 5860 // we need to divide the cost of the vector loops by the width of 5861 // the vector elements. 5862 VectorizationCostTy C = expectedCost(i); 5863 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5864 float VectorCost = *C.first.getValue() / (float)i.getFixedValue(); 5865 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5866 << " costs: " << (int)VectorCost << ".\n"); 5867 if (!C.second && !ForceVectorization) { 5868 LLVM_DEBUG( 5869 dbgs() << "LV: Not considering vector loop of width " << i 5870 << " because it will not generate any vector instructions.\n"); 5871 continue; 5872 } 5873 5874 // If profitable add it to ProfitableVF list. 5875 if (VectorCost < ScalarCost) { 5876 ProfitableVFs.push_back(VectorizationFactor( 5877 {i, (unsigned)VectorCost})); 5878 } 5879 5880 if (VectorCost < Cost) { 5881 Cost = VectorCost; 5882 Width = i; 5883 } 5884 } 5885 5886 if (!EnableCondStoresVectorization && NumPredStores) { 5887 reportVectorizationFailure("There are conditional stores.", 5888 "store that is conditionally executed prevents vectorization", 5889 "ConditionalStore", ORE, TheLoop); 5890 Width = ElementCount::getFixed(1); 5891 Cost = ScalarCost; 5892 } 5893 5894 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs() 5895 << "LV: Vectorization seems to be not beneficial, " 5896 << "but was forced by a user.\n"); 5897 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5898 VectorizationFactor Factor = {Width, 5899 (unsigned)(Width.getKnownMinValue() * Cost)}; 5900 return Factor; 5901 } 5902 5903 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5904 const Loop &L, ElementCount VF) const { 5905 // Cross iteration phis such as reductions need special handling and are 5906 // currently unsupported. 5907 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5908 return Legal->isFirstOrderRecurrence(&Phi) || 5909 Legal->isReductionVariable(&Phi); 5910 })) 5911 return false; 5912 5913 // Phis with uses outside of the loop require special handling and are 5914 // currently unsupported. 5915 for (auto &Entry : Legal->getInductionVars()) { 5916 // Look for uses of the value of the induction at the last iteration. 5917 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5918 for (User *U : PostInc->users()) 5919 if (!L.contains(cast<Instruction>(U))) 5920 return false; 5921 // Look for uses of penultimate value of the induction. 5922 for (User *U : Entry.first->users()) 5923 if (!L.contains(cast<Instruction>(U))) 5924 return false; 5925 } 5926 5927 // Induction variables that are widened require special handling that is 5928 // currently not supported. 
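// For instance (illustrative), an induction whose value is used as data,
// as in
//   for (int i = 0; i < n; ++i)
//     A[i] = i;
// gets widened into a vector of lane values rather than staying scalar,
// so such a loop is not an epilogue-vectorization candidate here.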
5929 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5930 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5931 this->isProfitableToScalarize(Entry.first, VF)); 5932 })) 5933 return false; 5934 5935 return true; 5936 } 5937 5938 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5939 const ElementCount VF) const { 5940 // FIXME: We need a much better cost-model to take different parameters such 5941 // as register pressure, code size increase and cost of extra branches into 5942 // account. For now we apply a very crude heuristic and only consider loops 5943 // with vectorization factors larger than a certain value. 5944 // We also consider epilogue vectorization unprofitable for targets that don't 5945 // consider interleaving beneficial (eg. MVE). 5946 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5947 return false; 5948 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5949 return true; 5950 return false; 5951 } 5952 5953 VectorizationFactor 5954 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5955 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5956 VectorizationFactor Result = VectorizationFactor::Disabled(); 5957 if (!EnableEpilogueVectorization) { 5958 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5959 return Result; 5960 } 5961 5962 if (!isScalarEpilogueAllowed()) { 5963 LLVM_DEBUG( 5964 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5965 "allowed.\n";); 5966 return Result; 5967 } 5968 5969 // FIXME: This can be fixed for scalable vectors later, because at this stage 5970 // the LoopVectorizer will only consider vectorizing a loop with scalable 5971 // vectors when the loop has a hint to enable vectorization for a given VF. 5972 if (MainLoopVF.isScalable()) { 5973 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5974 "yet supported.\n"); 5975 return Result; 5976 } 5977 5978 // Not really a cost consideration, but check for unsupported cases here to 5979 // simplify the logic. 
5980 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5981 LLVM_DEBUG( 5982 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5983 "not a supported candidate.\n";); 5984 return Result; 5985 } 5986 5987 if (EpilogueVectorizationForceVF > 1) { 5988 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5989 if (LVP.hasPlanWithVFs( 5990 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5991 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5992 else { 5993 LLVM_DEBUG( 5994 dbgs() 5995 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5996 return Result; 5997 } 5998 } 5999 6000 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6001 TheLoop->getHeader()->getParent()->hasMinSize()) { 6002 LLVM_DEBUG( 6003 dbgs() 6004 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6005 return Result; 6006 } 6007 6008 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6009 return Result; 6010 6011 for (auto &NextVF : ProfitableVFs) 6012 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6013 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 6014 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6015 Result = NextVF; 6016 6017 if (Result != VectorizationFactor::Disabled()) 6018 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6019 << Result.Width.getFixedValue() << "\n";); 6020 return Result; 6021 } 6022 6023 std::pair<unsigned, unsigned> 6024 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6025 unsigned MinWidth = -1U; 6026 unsigned MaxWidth = 8; 6027 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6028 6029 // For each block. 6030 for (BasicBlock *BB : TheLoop->blocks()) { 6031 // For each instruction in the loop. 6032 for (Instruction &I : BB->instructionsWithoutDebug()) { 6033 Type *T = I.getType(); 6034 6035 // Skip ignored values. 6036 if (ValuesToIgnore.count(&I)) 6037 continue; 6038 6039 // Only examine Loads, Stores and PHINodes. 6040 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6041 continue; 6042 6043 // Examine PHI nodes that are reduction variables. Update the type to 6044 // account for the recurrence type. 6045 if (auto *PN = dyn_cast<PHINode>(&I)) { 6046 if (!Legal->isReductionVariable(PN)) 6047 continue; 6048 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6049 if (PreferInLoopReductions || 6050 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6051 RdxDesc.getRecurrenceType(), 6052 TargetTransformInfo::ReductionFlags())) 6053 continue; 6054 T = RdxDesc.getRecurrenceType(); 6055 } 6056 6057 // Examine the stored values. 6058 if (auto *ST = dyn_cast<StoreInst>(&I)) 6059 T = ST->getValueOperand()->getType(); 6060 6061 // Ignore loaded pointer types and stored pointer types that are not 6062 // vectorizable. 6063 // 6064 // FIXME: The check here attempts to predict whether a load or store will 6065 // be vectorized. We only know this for certain after a VF has 6066 // been selected. Here, we assume that if an access can be 6067 // vectorized, it will be. We should also look at extending this 6068 // optimization to non-pointer types. 
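// (Illustrative example: a loop that loads i8 elements and accumulates
// them into an i32 reduction would typically report {8, 32} as the
// smallest and widest types, which later bounds the feasible VFs.)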
6069 // 6070 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6071 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6072 continue; 6073 6074 MinWidth = std::min(MinWidth, 6075 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6076 MaxWidth = std::max(MaxWidth, 6077 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6078 } 6079 } 6080 6081 return {MinWidth, MaxWidth}; 6082 } 6083 6084 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6085 unsigned LoopCost) { 6086 // -- The interleave heuristics -- 6087 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6088 // There are many micro-architectural considerations that we can't predict 6089 // at this level. For example, frontend pressure (on decode or fetch) due to 6090 // code size, or the number and capabilities of the execution ports. 6091 // 6092 // We use the following heuristics to select the interleave count: 6093 // 1. If the code has reductions, then we interleave to break the cross 6094 // iteration dependency. 6095 // 2. If the loop is really small, then we interleave to reduce the loop 6096 // overhead. 6097 // 3. We don't interleave if we think that we will spill registers to memory 6098 // due to the increased register pressure. 6099 6100 if (!isScalarEpilogueAllowed()) 6101 return 1; 6102 6103 // We used the distance for the interleave count. 6104 if (Legal->getMaxSafeDepDistBytes() != -1U) 6105 return 1; 6106 6107 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6108 const bool HasReductions = !Legal->getReductionVars().empty(); 6109 // Do not interleave loops with a relatively small known or estimated trip 6110 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6111 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6112 // because with the above conditions interleaving can expose ILP and break 6113 // cross iteration dependences for reductions. 6114 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6115 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6116 return 1; 6117 6118 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6119 // We divide by these constants so assume that we have at least one 6120 // instruction that uses at least one register. 6121 for (auto& pair : R.MaxLocalUsers) { 6122 pair.second = std::max(pair.second, 1U); 6123 } 6124 6125 // We calculate the interleave count using the following formula. 6126 // Subtract the number of loop invariants from the number of available 6127 // registers. These registers are used by all of the interleaved instances. 6128 // Next, divide the remaining registers by the number of registers that is 6129 // required by the loop, in order to estimate how many parallel instances 6130 // fit without causing spills. All of this is rounded down if necessary to be 6131 // a power of two. We want power of two interleave count to simplify any 6132 // addressing operations or alignment considerations. 6133 // We also want power of two interleave counts to ensure that the induction 6134 // variable of the vector loop wraps to zero, when tail is folded by masking; 6135 // this currently happens when OptForSize, in which case IC is set to 1 above. 
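// Worked example (hypothetical target): with 32 registers in a class, 2
// of them tied up by loop-invariant values and a maximum local usage of
// 6 registers, the estimate below is PowerOf2Floor((32 - 2) / 6) ==
// PowerOf2Floor(5) == 4 interleaved instances.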
6136 unsigned IC = UINT_MAX; 6137 6138 for (auto& pair : R.MaxLocalUsers) { 6139 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6140 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6141 << " registers of " 6142 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6143 if (VF.isScalar()) { 6144 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6145 TargetNumRegisters = ForceTargetNumScalarRegs; 6146 } else { 6147 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6148 TargetNumRegisters = ForceTargetNumVectorRegs; 6149 } 6150 unsigned MaxLocalUsers = pair.second; 6151 unsigned LoopInvariantRegs = 0; 6152 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6153 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6154 6155 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6156 // Don't count the induction variable as interleaved. 6157 if (EnableIndVarRegisterHeur) { 6158 TmpIC = 6159 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6160 std::max(1U, (MaxLocalUsers - 1))); 6161 } 6162 6163 IC = std::min(IC, TmpIC); 6164 } 6165 6166 // Clamp the interleave ranges to reasonable counts. 6167 unsigned MaxInterleaveCount = 6168 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6169 6170 // Check if the user has overridden the max. 6171 if (VF.isScalar()) { 6172 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6173 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6174 } else { 6175 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6176 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6177 } 6178 6179 // If trip count is known or estimated compile time constant, limit the 6180 // interleave count to be less than the trip count divided by VF, provided it 6181 // is at least 1. 6182 // 6183 // For scalable vectors we can't know if interleaving is beneficial. It may 6184 // not be beneficial for small loops if none of the lanes in the second vector 6185 // iterations is enabled. However, for larger loops, there is likely to be a 6186 // similar benefit as for fixed-width vectors. For now, we choose to leave 6187 // the InterleaveCount as if vscale is '1', although if some information about 6188 // the vector is known (e.g. min vector size), we can make a better decision. 6189 if (BestKnownTC) { 6190 MaxInterleaveCount = 6191 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6192 // Make sure MaxInterleaveCount is greater than 0. 6193 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6194 } 6195 6196 assert(MaxInterleaveCount > 0 && 6197 "Maximum interleave count must be greater than 0"); 6198 6199 // Clamp the calculated IC to be between the 1 and the max interleave count 6200 // that the target and trip count allows. 6201 if (IC > MaxInterleaveCount) 6202 IC = MaxInterleaveCount; 6203 else 6204 // Make sure IC is greater than 0. 6205 IC = std::max(1u, IC); 6206 6207 assert(IC > 0 && "Interleave count must be greater than 0."); 6208 6209 // If we did not calculate the cost for VF (because the user selected the VF) 6210 // then we calculate the cost of VF here. 6211 if (LoopCost == 0) { 6212 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6213 LoopCost = *expectedCost(VF).first.getValue(); 6214 } 6215 6216 assert(LoopCost && "Non-zero loop cost expected"); 6217 6218 // Interleave if we vectorized this loop and there is a reduction that could 6219 // benefit from interleaving. 
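// E.g. a reduction such as
//   for (int i = 0; i < n; ++i)
//     sum += a[i];
// carries a dependence on 'sum' across iterations; interleaving gives
// each interleaved copy its own partial sum and so breaks that chain.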
6220 if (VF.isVector() && HasReductions) { 6221 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6222 return IC; 6223 } 6224 6225 // Note that if we've already vectorized the loop we will have done the 6226 // runtime check and so interleaving won't require further checks. 6227 bool InterleavingRequiresRuntimePointerCheck = 6228 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6229 6230 // We want to interleave small loops in order to reduce the loop overhead and 6231 // potentially expose ILP opportunities. 6232 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6233 << "LV: IC is " << IC << '\n' 6234 << "LV: VF is " << VF << '\n'); 6235 const bool AggressivelyInterleaveReductions = 6236 TTI.enableAggressiveInterleaving(HasReductions); 6237 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6238 // We assume that the cost overhead is 1 and we use the cost model 6239 // to estimate the cost of the loop and interleave until the cost of the 6240 // loop overhead is about 5% of the cost of the loop. 6241 unsigned SmallIC = 6242 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6243 6244 // Interleave until store/load ports (estimated by max interleave count) are 6245 // saturated. 6246 unsigned NumStores = Legal->getNumStores(); 6247 unsigned NumLoads = Legal->getNumLoads(); 6248 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6249 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6250 6251 // If we have a scalar reduction (vector reductions are already dealt with 6252 // by this point), we can increase the critical path length if the loop 6253 // we're interleaving is inside another loop. Limit, by default to 2, so the 6254 // critical path only gets increased by one reduction operation. 6255 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6256 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6257 SmallIC = std::min(SmallIC, F); 6258 StoresIC = std::min(StoresIC, F); 6259 LoadsIC = std::min(LoadsIC, F); 6260 } 6261 6262 if (EnableLoadStoreRuntimeInterleave && 6263 std::max(StoresIC, LoadsIC) > SmallIC) { 6264 LLVM_DEBUG( 6265 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6266 return std::max(StoresIC, LoadsIC); 6267 } 6268 6269 // If there are scalar reductions and TTI has enabled aggressive 6270 // interleaving for reductions, we will interleave to expose ILP. 6271 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6272 AggressivelyInterleaveReductions) { 6273 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6274 // Interleave no less than SmallIC but not as aggressive as the normal IC 6275 // to satisfy the rare situation when resources are too limited. 6276 return std::max(IC / 2, SmallIC); 6277 } else { 6278 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6279 return SmallIC; 6280 } 6281 } 6282 6283 // Interleave if this is a large loop (small loops are already dealt with by 6284 // this point) that could benefit from interleaving. 6285 if (AggressivelyInterleaveReductions) { 6286 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6287 return IC; 6288 } 6289 6290 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6291 return 1; 6292 } 6293 6294 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6295 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6296 // This function calculates the register usage by measuring the highest number 6297 // of values that are alive at a single location. 
Obviously, this is a very 6298 // rough estimation. We scan the loop in topological order and 6299 // assign a number to each instruction. We use RPO to ensure that defs are 6300 // met before their users. We assume that each instruction that has in-loop 6301 // users starts an interval. We record every time that an in-loop value is 6302 // used, so we have a list of the first and last occurrences of each 6303 // instruction. Next, we transpose this data structure into a multi-map that 6304 // holds the list of intervals that *end* at a specific location. This 6305 // multi-map allows us to perform a linear search. We scan the instructions linearly 6306 // and record each time that a new interval starts, by placing it in a set. 6307 // If we find this value in the multi-map then we remove it from the set. 6308 // The max register usage is the maximum size of the set. 6309 // We also search for instructions that are defined outside the loop, but are 6310 // used inside the loop. We need this number separately from the max-interval 6311 // usage number because when we unroll, loop-invariant values do not take 6312 // more registers. 6313 LoopBlocksDFS DFS(TheLoop); 6314 DFS.perform(LI); 6315 6316 RegisterUsage RU; 6317 6318 // Each 'key' in the map opens a new interval. The values 6319 // of the map are the index of the 'last seen' usage of the 6320 // instruction that is the key. 6321 using IntervalMap = DenseMap<Instruction *, unsigned>; 6322 6323 // Maps instruction to its index. 6324 SmallVector<Instruction *, 64> IdxToInstr; 6325 // Marks the end of each interval. 6326 IntervalMap EndPoint; 6327 // Saves the list of instruction indices that are used in the loop. 6328 SmallPtrSet<Instruction *, 8> Ends; 6329 // Saves the list of values that are used in the loop but are 6330 // defined outside the loop, such as arguments and constants. 6331 SmallPtrSet<Value *, 8> LoopInvariants; 6332 6333 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6334 for (Instruction &I : BB->instructionsWithoutDebug()) { 6335 IdxToInstr.push_back(&I); 6336 6337 // Save the end location of each USE. 6338 for (Value *U : I.operands()) { 6339 auto *Instr = dyn_cast<Instruction>(U); 6340 6341 // Ignore non-instruction values such as arguments, constants, etc. 6342 if (!Instr) 6343 continue; 6344 6345 // If this instruction is outside the loop then record it and continue. 6346 if (!TheLoop->contains(Instr)) { 6347 LoopInvariants.insert(Instr); 6348 continue; 6349 } 6350 6351 // Overwrite previous end points. 6352 EndPoint[Instr] = IdxToInstr.size(); 6353 Ends.insert(Instr); 6354 } 6355 } 6356 } 6357 6358 // Saves the list of intervals that end with the index in 'key'. 6359 using InstrList = SmallVector<Instruction *, 2>; 6360 DenseMap<unsigned, InstrList> TransposeEnds; 6361 6362 // Transpose the EndPoints to a list of values that end at each index. 6363 for (auto &Interval : EndPoint) 6364 TransposeEnds[Interval.second].push_back(Interval.first); 6365 6366 SmallPtrSet<Instruction *, 8> OpenIntervals; 6367 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6368 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6369 6370 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6371 6372 // A lambda that gets the register usage for the given type and VF.
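// For example (assumed target): with 128-bit vector registers,
// GetRegUsage(i32, ElementCount::getFixed(4)) asks for the usage of
// <4 x i32>, i.e. 128 bits, which would typically be one register.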
6373 const auto &TTICapture = TTI; 6374 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6375 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6376 return 0U; 6377 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6378 }; 6379 6380 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6381 Instruction *I = IdxToInstr[i]; 6382 6383 // Remove all of the instructions that end at this location. 6384 InstrList &List = TransposeEnds[i]; 6385 for (Instruction *ToRemove : List) 6386 OpenIntervals.erase(ToRemove); 6387 6388 // Ignore instructions that are never used within the loop. 6389 if (!Ends.count(I)) 6390 continue; 6391 6392 // Skip ignored values. 6393 if (ValuesToIgnore.count(I)) 6394 continue; 6395 6396 // For each VF find the maximum usage of registers. 6397 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6398 // Count the number of live intervals. 6399 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6400 6401 if (VFs[j].isScalar()) { 6402 for (auto Inst : OpenIntervals) { 6403 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6404 if (RegUsage.find(ClassID) == RegUsage.end()) 6405 RegUsage[ClassID] = 1; 6406 else 6407 RegUsage[ClassID] += 1; 6408 } 6409 } else { 6410 collectUniformsAndScalars(VFs[j]); 6411 for (auto Inst : OpenIntervals) { 6412 // Skip ignored values for VF > 1. 6413 if (VecValuesToIgnore.count(Inst)) 6414 continue; 6415 if (isScalarAfterVectorization(Inst, VFs[j])) { 6416 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6417 if (RegUsage.find(ClassID) == RegUsage.end()) 6418 RegUsage[ClassID] = 1; 6419 else 6420 RegUsage[ClassID] += 1; 6421 } else { 6422 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6423 if (RegUsage.find(ClassID) == RegUsage.end()) 6424 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6425 else 6426 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6427 } 6428 } 6429 } 6430 6431 for (auto& pair : RegUsage) { 6432 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6433 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6434 else 6435 MaxUsages[j][pair.first] = pair.second; 6436 } 6437 } 6438 6439 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6440 << OpenIntervals.size() << '\n'); 6441 6442 // Add the current instruction to the list of open intervals. 6443 OpenIntervals.insert(I); 6444 } 6445 6446 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6447 SmallMapVector<unsigned, unsigned, 4> Invariant; 6448 6449 for (auto Inst : LoopInvariants) { 6450 unsigned Usage = 6451 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6452 unsigned ClassID = 6453 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6454 if (Invariant.find(ClassID) == Invariant.end()) 6455 Invariant[ClassID] = Usage; 6456 else 6457 Invariant[ClassID] += Usage; 6458 } 6459 6460 LLVM_DEBUG({ 6461 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6462 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6463 << " item\n"; 6464 for (const auto &pair : MaxUsages[i]) { 6465 dbgs() << "LV(REG): RegisterClass: " 6466 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6467 << " registers\n"; 6468 } 6469 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6470 << " item\n"; 6471 for (const auto &pair : Invariant) { 6472 dbgs() << "LV(REG): RegisterClass: " 6473 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6474 << " registers\n"; 6475 } 6476 }); 6477 6478 RU.LoopInvariantRegs = Invariant; 6479 RU.MaxLocalUsers = MaxUsages[i]; 6480 RUs[i] = RU; 6481 } 6482 6483 return RUs; 6484 } 6485 6486 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) { 6487 // TODO: The cost model for emulated masked load/store is completely 6488 // broken. This hack guides the cost model to use an artificially 6489 // high enough value to practically disable vectorization with such 6490 // operations, except where the previously deployed legality hack allowed 6491 // using very low cost values. This is to avoid regressions coming simply 6492 // from moving the "masked load/store" check from legality to the cost model. 6493 // Masked Load/Gather emulation was previously never allowed. 6494 // A limited amount of Masked Store/Scatter emulation was allowed. 6495 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6496 return isa<LoadInst>(I) || 6497 (isa<StoreInst>(I) && 6498 NumPredStores > NumberOfStoresToPredicate); 6499 } 6500 6501 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6502 // If we aren't vectorizing the loop, or if we've already collected the 6503 // instructions to scalarize, there's nothing to do. Collection may already 6504 // have occurred if we have a user-selected VF and are now computing the 6505 // expected cost for interleaving. 6506 if (VF.isScalar() || VF.isZero() || 6507 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6508 return; 6509 6510 // Initialize a mapping for VF in InstsToScalarize. If we find that it's 6511 // not profitable to scalarize any instructions, the presence of VF in the 6512 // map will indicate that we've analyzed it already. 6513 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6514 6515 // Find all the instructions that are scalar with predication in the loop and 6516 // determine if it would be better to not if-convert the blocks they are in. 6517 // If so, we also record the instructions to scalarize. 6518 for (BasicBlock *BB : TheLoop->blocks()) { 6519 if (!blockNeedsPredication(BB)) 6520 continue; 6521 for (Instruction &I : *BB) 6522 if (isScalarWithPredication(&I)) { 6523 ScalarCostsTy ScalarCosts; 6524 // Do not apply discount logic if a hacked cost is needed 6525 // for emulated masked memrefs. 6526 if (!useEmulatedMaskMemRefHack(&I) && 6527 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6528 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6529 // Remember that BB will remain after vectorization.
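// (For illustration: a block guarded by a condition such as
//    if (b[i] != 0) a[i] = x / b[i];
//  keeps its division scalar-with-predication, since it may divide by
//  zero, and is recorded here as a block that stays predicated.)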
6530 PredicatedBBsAfterVectorization.insert(BB); 6531 } 6532 } 6533 } 6534 6535 int LoopVectorizationCostModel::computePredInstDiscount( 6536 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6537 assert(!isUniformAfterVectorization(PredInst, VF) && 6538 "Instruction marked uniform-after-vectorization will be predicated"); 6539 6540 // Initialize the discount to zero, meaning that the scalar version and the 6541 // vector version cost the same. 6542 InstructionCost Discount = 0; 6543 6544 // Holds instructions to analyze. The instructions we visit are mapped in 6545 // ScalarCosts. Those instructions are the ones that would be scalarized if 6546 // we find that the scalar version costs less. 6547 SmallVector<Instruction *, 8> Worklist; 6548 6549 // Returns true if the given instruction can be scalarized. 6550 auto canBeScalarized = [&](Instruction *I) -> bool { 6551 // We only attempt to scalarize instructions forming a single-use chain 6552 // from the original predicated block that would otherwise be vectorized. 6553 // Although not strictly necessary, we give up on instructions we know will 6554 // already be scalar to avoid traversing chains that are unlikely to be 6555 // beneficial. 6556 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6557 isScalarAfterVectorization(I, VF)) 6558 return false; 6559 6560 // If the instruction is scalar with predication, it will be analyzed 6561 // separately. We ignore it within the context of PredInst. 6562 if (isScalarWithPredication(I)) 6563 return false; 6564 6565 // If any of the instruction's operands are uniform after vectorization, 6566 // the instruction cannot be scalarized. This prevents, for example, a 6567 // masked load from being scalarized. 6568 // 6569 // We assume we will only emit a value for lane zero of an instruction 6570 // marked uniform after vectorization, rather than VF identical values. 6571 // Thus, if we scalarize an instruction that uses a uniform, we would 6572 // create uses of values corresponding to the lanes we aren't emitting code 6573 // for. This behavior can be changed by allowing getScalarValue to clone 6574 // the lane zero values for uniforms rather than asserting. 6575 for (Use &U : I->operands()) 6576 if (auto *J = dyn_cast<Instruction>(U.get())) 6577 if (isUniformAfterVectorization(J, VF)) 6578 return false; 6579 6580 // Otherwise, we can scalarize the instruction. 6581 return true; 6582 }; 6583 6584 // Compute the expected cost discount from scalarizing the entire expression 6585 // feeding the predicated instruction. We currently only consider expressions 6586 // that are single-use instruction chains. 6587 Worklist.push_back(PredInst); 6588 while (!Worklist.empty()) { 6589 Instruction *I = Worklist.pop_back_val(); 6590 6591 // If we've already analyzed the instruction, there's nothing to do. 6592 if (ScalarCosts.find(I) != ScalarCosts.end()) 6593 continue; 6594 6595 // Compute the cost of the vector instruction. Note that this cost already 6596 // includes the scalarization overhead of the predicated instruction. 6597 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6598 6599 // Compute the cost of the scalarized instruction. This cost is the cost of 6600 // the instruction as if it wasn't if-converted and instead remained in the 6601 // predicated block. We will scale this cost by block probability after 6602 // computing the scalarization overhead. 
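// Illustrative numbers: with VF = 4, a per-lane scalar cost of 2 and an
// assumed block execution probability of 1/2, the scalarized estimate is
// (4 * 2) / 2 == 4, which is then weighed against the vector cost when
// computing the discount.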
6603 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6604 InstructionCost ScalarCost = 6605 VF.getKnownMinValue() * 6606 getInstructionCost(I, ElementCount::getFixed(1)).first; 6607 6608 // Compute the scalarization overhead of needed insertelement instructions 6609 // and phi nodes. 6610 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6611 ScalarCost += TTI.getScalarizationOverhead( 6612 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6613 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6614 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6615 ScalarCost += 6616 VF.getKnownMinValue() * 6617 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6618 } 6619 6620 // Compute the scalarization overhead of needed extractelement 6621 // instructions. For each of the instruction's operands, if the operand can 6622 // be scalarized, add it to the worklist; otherwise, account for the 6623 // overhead. 6624 for (Use &U : I->operands()) 6625 if (auto *J = dyn_cast<Instruction>(U.get())) { 6626 assert(VectorType::isValidElementType(J->getType()) && 6627 "Instruction has non-scalar type"); 6628 if (canBeScalarized(J)) 6629 Worklist.push_back(J); 6630 else if (needsExtract(J, VF)) { 6631 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6632 ScalarCost += TTI.getScalarizationOverhead( 6633 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6634 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6635 } 6636 } 6637 6638 // Scale the total scalar cost by block probability. 6639 ScalarCost /= getReciprocalPredBlockProb(); 6640 6641 // Compute the discount. A non-negative discount means the vector version 6642 // of the instruction costs more, and scalarizing would be beneficial. 6643 Discount += VectorCost - ScalarCost; 6644 ScalarCosts[I] = ScalarCost; 6645 } 6646 6647 return *Discount.getValue(); 6648 } 6649 6650 LoopVectorizationCostModel::VectorizationCostTy 6651 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6652 VectorizationCostTy Cost; 6653 6654 // For each block. 6655 for (BasicBlock *BB : TheLoop->blocks()) { 6656 VectorizationCostTy BlockCost; 6657 6658 // For each instruction in the old loop. 6659 for (Instruction &I : BB->instructionsWithoutDebug()) { 6660 // Skip ignored values. 6661 if (ValuesToIgnore.count(&I) || 6662 (VF.isVector() && VecValuesToIgnore.count(&I))) 6663 continue; 6664 6665 VectorizationCostTy C = getInstructionCost(&I, VF); 6666 6667 // Check if we should override the cost. 6668 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6669 C.first = InstructionCost(ForceTargetInstructionCost); 6670 6671 BlockCost.first += C.first; 6672 BlockCost.second |= C.second; 6673 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6674 << " for VF " << VF << " For instruction: " << I 6675 << '\n'); 6676 } 6677 6678 // If we are vectorizing a predicated block, it will have been 6679 // if-converted. This means that the block's instructions (aside from 6680 // stores and instructions that may divide by zero) will now be 6681 // unconditionally executed. For the scalar case, we may not always execute 6682 // the predicated block, if it is an if-else block. Thus, scale the block's 6683 // cost by the probability of executing it. blockNeedsPredication from 6684 // Legal is used so as to not include all blocks in tail folded loops. 
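// E.g. (illustrative): a predicated block with an estimated scalar cost
// of 8 and an assumed execution probability of 1/2 contributes 8 / 2 == 4
// to the scalar loop cost.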
6685 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6686 BlockCost.first /= getReciprocalPredBlockProb(); 6687 6688 Cost.first += BlockCost.first; 6689 Cost.second |= BlockCost.second; 6690 } 6691 6692 return Cost; 6693 } 6694 6695 /// Gets Address Access SCEV after verifying that the access pattern 6696 /// is loop invariant except the induction variable dependence. 6697 /// 6698 /// This SCEV can be sent to the Target in order to estimate the address 6699 /// calculation cost. 6700 static const SCEV *getAddressAccessSCEV( 6701 Value *Ptr, 6702 LoopVectorizationLegality *Legal, 6703 PredicatedScalarEvolution &PSE, 6704 const Loop *TheLoop) { 6705 6706 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6707 if (!Gep) 6708 return nullptr; 6709 6710 // We are looking for a gep with all loop invariant indices except for one 6711 // which should be an induction variable. 6712 auto SE = PSE.getSE(); 6713 unsigned NumOperands = Gep->getNumOperands(); 6714 for (unsigned i = 1; i < NumOperands; ++i) { 6715 Value *Opd = Gep->getOperand(i); 6716 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6717 !Legal->isInductionVariable(Opd)) 6718 return nullptr; 6719 } 6720 6721 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6722 return PSE.getSCEV(Ptr); 6723 } 6724 6725 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6726 return Legal->hasStride(I->getOperand(0)) || 6727 Legal->hasStride(I->getOperand(1)); 6728 } 6729 6730 InstructionCost 6731 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6732 ElementCount VF) { 6733 assert(VF.isVector() && 6734 "Scalarization cost of instruction implies vectorization."); 6735 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6736 Type *ValTy = getMemInstValueType(I); 6737 auto SE = PSE.getSE(); 6738 6739 unsigned AS = getLoadStoreAddressSpace(I); 6740 Value *Ptr = getLoadStorePointerOperand(I); 6741 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6742 6743 // Figure out whether the access is strided and get the stride value 6744 // if it's known in compile time 6745 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6746 6747 // Get the cost of the scalar memory instruction and address computation. 6748 InstructionCost Cost = 6749 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6750 6751 // Don't pass *I here, since it is scalar but will actually be part of a 6752 // vectorized loop where the user of it is a vectorized instruction. 6753 const Align Alignment = getLoadStoreAlignment(I); 6754 Cost += VF.getKnownMinValue() * 6755 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6756 AS, TTI::TCK_RecipThroughput); 6757 6758 // Get the overhead of the extractelement and insertelement instructions 6759 // we might create due to scalarization. 6760 Cost += getScalarizationOverhead(I, VF); 6761 6762 // If we have a predicated load/store, it will need extra i1 extracts and 6763 // conditional branches, but may not be executed for each vector lane. Scale 6764 // the cost by the probability of executing the predicated block. 
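// Roughly, each predicated lane is emitted as something like:
//   %m = extractelement <VF x i1> %mask, i32 <lane>
//   br i1 %m, label %pred.op, label %continue
// which is why the i1 extracts and the conditional branch are costed
// below.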
6765 if (isPredicatedInst(I)) { 6766 Cost /= getReciprocalPredBlockProb(); 6767 6768 // Add the cost of an i1 extract and a branch 6769 auto *Vec_i1Ty = 6770 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6771 Cost += TTI.getScalarizationOverhead( 6772 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6773 /*Insert=*/false, /*Extract=*/true); 6774 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6775 6776 if (useEmulatedMaskMemRefHack(I)) 6777 // Artificially setting to a high enough value to practically disable 6778 // vectorization with such operations. 6779 Cost = 3000000; 6780 } 6781 6782 return Cost; 6783 } 6784 6785 InstructionCost 6786 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6787 ElementCount VF) { 6788 Type *ValTy = getMemInstValueType(I); 6789 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6790 Value *Ptr = getLoadStorePointerOperand(I); 6791 unsigned AS = getLoadStoreAddressSpace(I); 6792 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6793 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6794 6795 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6796 "Stride should be 1 or -1 for consecutive memory access"); 6797 const Align Alignment = getLoadStoreAlignment(I); 6798 InstructionCost Cost = 0; 6799 if (Legal->isMaskRequired(I)) 6800 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6801 CostKind); 6802 else 6803 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6804 CostKind, I); 6805 6806 bool Reverse = ConsecutiveStride < 0; 6807 if (Reverse) 6808 Cost += 6809 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6810 return Cost; 6811 } 6812 6813 InstructionCost 6814 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6815 ElementCount VF) { 6816 assert(Legal->isUniformMemOp(*I)); 6817 6818 Type *ValTy = getMemInstValueType(I); 6819 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6820 const Align Alignment = getLoadStoreAlignment(I); 6821 unsigned AS = getLoadStoreAddressSpace(I); 6822 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6823 if (isa<LoadInst>(I)) { 6824 return TTI.getAddressComputationCost(ValTy) + 6825 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6826 CostKind) + 6827 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6828 } 6829 StoreInst *SI = cast<StoreInst>(I); 6830 6831 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6832 return TTI.getAddressComputationCost(ValTy) + 6833 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6834 CostKind) + 6835 (isLoopInvariantStoreValue 6836 ? 
0 6837 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6838 VF.getKnownMinValue() - 1)); 6839 } 6840 6841 InstructionCost 6842 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6843 ElementCount VF) { 6844 Type *ValTy = getMemInstValueType(I); 6845 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6846 const Align Alignment = getLoadStoreAlignment(I); 6847 const Value *Ptr = getLoadStorePointerOperand(I); 6848 6849 return TTI.getAddressComputationCost(VectorTy) + 6850 TTI.getGatherScatterOpCost( 6851 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6852 TargetTransformInfo::TCK_RecipThroughput, I); 6853 } 6854 6855 InstructionCost 6856 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6857 ElementCount VF) { 6858 // TODO: Once we have support for interleaving with scalable vectors 6859 // we can calculate the cost properly here. 6860 if (VF.isScalable()) 6861 return InstructionCost::getInvalid(); 6862 6863 Type *ValTy = getMemInstValueType(I); 6864 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6865 unsigned AS = getLoadStoreAddressSpace(I); 6866 6867 auto Group = getInterleavedAccessGroup(I); 6868 assert(Group && "Fail to get an interleaved access group."); 6869 6870 unsigned InterleaveFactor = Group->getFactor(); 6871 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6872 6873 // Holds the indices of existing members in an interleaved load group. 6874 // An interleaved store group doesn't need this as it doesn't allow gaps. 6875 SmallVector<unsigned, 4> Indices; 6876 if (isa<LoadInst>(I)) { 6877 for (unsigned i = 0; i < InterleaveFactor; i++) 6878 if (Group->getMember(i)) 6879 Indices.push_back(i); 6880 } 6881 6882 // Calculate the cost of the whole interleaved group. 6883 bool UseMaskForGaps = 6884 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6885 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6886 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6887 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6888 6889 if (Group->isReverse()) { 6890 // TODO: Add support for reversed masked interleaved access. 6891 assert(!Legal->isMaskRequired(I) && 6892 "Reverse masked interleaved access not supported."); 6893 Cost += 6894 Group->getNumMembers() * 6895 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6896 } 6897 return Cost; 6898 } 6899 6900 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 6901 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6902 // Early exit for no inloop reductions 6903 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6904 return InstructionCost::getInvalid(); 6905 auto *VectorTy = cast<VectorType>(Ty); 6906 6907 // We are looking for a pattern of, and finding the minimal acceptable cost: 6908 // reduce(mul(ext(A), ext(B))) or 6909 // reduce(mul(A, B)) or 6910 // reduce(ext(A)) or 6911 // reduce(A). 6912 // The basic idea is that we walk down the tree to do that, finding the root 6913 // reduction instruction in InLoopReductionImmediateChains. From there we find 6914 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6915 // of the components. If the reduction cost is lower then we return it for the 6916 // reduction instruction and 0 for the other instructions in the pattern. If 6917 // it is not we return an invalid cost specifying the orignal cost method 6918 // should be used. 
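  // As a sketch, for a dot-product style loop the relevant scalar IR could be:
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %mul, %sum.phi   ; the in-loop reduction add
  // Starting from either ext, the walk below steps to %mul and then to %sum,
  // which is the instruction expected in InLoopReductionImmediateChains.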
6919 Instruction *RetI = I; 6920 if ((RetI->getOpcode() == Instruction::SExt || 6921 RetI->getOpcode() == Instruction::ZExt)) { 6922 if (!RetI->hasOneUser()) 6923 return InstructionCost::getInvalid(); 6924 RetI = RetI->user_back(); 6925 } 6926 if (RetI->getOpcode() == Instruction::Mul && 6927 RetI->user_back()->getOpcode() == Instruction::Add) { 6928 if (!RetI->hasOneUser()) 6929 return InstructionCost::getInvalid(); 6930 RetI = RetI->user_back(); 6931 } 6932 6933 // Test if the found instruction is a reduction, and if not return an invalid 6934 // cost specifying the parent to use the original cost modelling. 6935 if (!InLoopReductionImmediateChains.count(RetI)) 6936 return InstructionCost::getInvalid(); 6937 6938 // Find the reduction this chain is a part of and calculate the basic cost of 6939 // the reduction on its own. 6940 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6941 Instruction *ReductionPhi = LastChain; 6942 while (!isa<PHINode>(ReductionPhi)) 6943 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6944 6945 RecurrenceDescriptor RdxDesc = 6946 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6947 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6948 VectorTy, false, CostKind); 6949 6950 // Get the operand that was not the reduction chain and match it to one of the 6951 // patterns, returning the better cost if it is found. 6952 Instruction *RedOp = RetI->getOperand(1) == LastChain 6953 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6954 : dyn_cast<Instruction>(RetI->getOperand(1)); 6955 6956 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6957 6958 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6959 !TheLoop->isLoopInvariant(RedOp)) { 6960 bool IsUnsigned = isa<ZExtInst>(RedOp); 6961 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6962 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6963 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6964 CostKind); 6965 6966 unsigned ExtCost = 6967 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6968 TTI::CastContextHint::None, CostKind, RedOp); 6969 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6970 return I == RetI ? *RedCost.getValue() : 0; 6971 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 6972 Instruction *Mul = RedOp; 6973 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 6974 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 6975 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 6976 Op0->getOpcode() == Op1->getOpcode() && 6977 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6978 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6979 bool IsUnsigned = isa<ZExtInst>(Op0); 6980 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6981 // reduce(mul(ext, ext)) 6982 unsigned ExtCost = 6983 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 6984 TTI::CastContextHint::None, CostKind, Op0); 6985 InstructionCost MulCost = 6986 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6987 6988 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6989 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6990 CostKind); 6991 6992 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 6993 return I == RetI ? 
*RedCost.getValue() : 0; 6994 } else { 6995 InstructionCost MulCost = 6996 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6997 6998 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6999 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7000 CostKind); 7001 7002 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7003 return I == RetI ? *RedCost.getValue() : 0; 7004 } 7005 } 7006 7007 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7008 } 7009 7010 InstructionCost 7011 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7012 ElementCount VF) { 7013 // Calculate scalar cost only. Vectorization cost should be ready at this 7014 // moment. 7015 if (VF.isScalar()) { 7016 Type *ValTy = getMemInstValueType(I); 7017 const Align Alignment = getLoadStoreAlignment(I); 7018 unsigned AS = getLoadStoreAddressSpace(I); 7019 7020 return TTI.getAddressComputationCost(ValTy) + 7021 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7022 TTI::TCK_RecipThroughput, I); 7023 } 7024 return getWideningCost(I, VF); 7025 } 7026 7027 LoopVectorizationCostModel::VectorizationCostTy 7028 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7029 ElementCount VF) { 7030 // If we know that this instruction will remain uniform, check the cost of 7031 // the scalar version. 7032 if (isUniformAfterVectorization(I, VF)) 7033 VF = ElementCount::getFixed(1); 7034 7035 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7036 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7037 7038 // Forced scalars do not have any scalarization overhead. 7039 auto ForcedScalar = ForcedScalars.find(VF); 7040 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7041 auto InstSet = ForcedScalar->second; 7042 if (InstSet.count(I)) 7043 return VectorizationCostTy( 7044 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7045 VF.getKnownMinValue()), 7046 false); 7047 } 7048 7049 Type *VectorTy; 7050 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7051 7052 bool TypeNotScalarized = 7053 VF.isVector() && VectorTy->isVectorTy() && 7054 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7055 return VectorizationCostTy(C, TypeNotScalarized); 7056 } 7057 7058 InstructionCost 7059 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7060 ElementCount VF) { 7061 7062 if (VF.isScalable()) 7063 return InstructionCost::getInvalid(); 7064 7065 if (VF.isScalar()) 7066 return 0; 7067 7068 InstructionCost Cost = 0; 7069 Type *RetTy = ToVectorTy(I->getType(), VF); 7070 if (!RetTy->isVoidTy() && 7071 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7072 Cost += TTI.getScalarizationOverhead( 7073 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7074 true, false); 7075 7076 // Some targets keep addresses scalar. 7077 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7078 return Cost; 7079 7080 // Some targets support efficient element stores. 7081 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7082 return Cost; 7083 7084 // Collect operands to consider. 7085 CallInst *CI = dyn_cast<CallInst>(I); 7086 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7087 7088 // Skip operands that do not require extraction/scalarization and do not incur 7089 // any overhead. 
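  // For instance, an operand defined outside the loop, or one the cost model
  // keeps scalar after vectorization, is already available as a scalar value
  // and needs no extractelement; only the remaining operands are priced here.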
7090 SmallVector<Type *> Tys; 7091 for (auto *V : filterExtractingOperands(Ops, VF)) 7092 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7093 return Cost + TTI.getOperandsScalarizationOverhead( 7094 filterExtractingOperands(Ops, VF), Tys); 7095 } 7096 7097 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7098 if (VF.isScalar()) 7099 return; 7100 NumPredStores = 0; 7101 for (BasicBlock *BB : TheLoop->blocks()) { 7102 // For each instruction in the old loop. 7103 for (Instruction &I : *BB) { 7104 Value *Ptr = getLoadStorePointerOperand(&I); 7105 if (!Ptr) 7106 continue; 7107 7108 // TODO: We should generate better code and update the cost model for 7109 // predicated uniform stores. Today they are treated as any other 7110 // predicated store (see added test cases in 7111 // invariant-store-vectorization.ll). 7112 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7113 NumPredStores++; 7114 7115 if (Legal->isUniformMemOp(I)) { 7116 // TODO: Avoid replicating loads and stores instead of 7117 // relying on instcombine to remove them. 7118 // Load: Scalar load + broadcast 7119 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7120 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7121 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7122 continue; 7123 } 7124 7125 // We assume that widening is the best solution when possible. 7126 if (memoryInstructionCanBeWidened(&I, VF)) { 7127 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7128 int ConsecutiveStride = 7129 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7130 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7131 "Expected consecutive stride."); 7132 InstWidening Decision = 7133 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7134 setWideningDecision(&I, VF, Decision, Cost); 7135 continue; 7136 } 7137 7138 // Choose between Interleaving, Gather/Scatter or Scalarization. 7139 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7140 unsigned NumAccesses = 1; 7141 if (isAccessInterleaved(&I)) { 7142 auto Group = getInterleavedAccessGroup(&I); 7143 assert(Group && "Fail to get an interleaved access group."); 7144 7145 // Make one decision for the whole group. 7146 if (getWideningDecision(&I, VF) != CM_Unknown) 7147 continue; 7148 7149 NumAccesses = Group->getNumMembers(); 7150 if (interleavedAccessCanBeWidened(&I, VF)) 7151 InterleaveCost = getInterleaveGroupCost(&I, VF); 7152 } 7153 7154 InstructionCost GatherScatterCost = 7155 isLegalGatherOrScatter(&I) 7156 ? getGatherScatterCost(&I, VF) * NumAccesses 7157 : InstructionCost::getInvalid(); 7158 7159 InstructionCost ScalarizationCost = 7160 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses 7161 : InstructionCost::getInvalid(); 7162 7163 // Choose better solution for the current VF, 7164 // write down this decision and use it during vectorization. 7165 InstructionCost Cost; 7166 InstWidening Decision; 7167 if (InterleaveCost <= GatherScatterCost && 7168 InterleaveCost < ScalarizationCost) { 7169 Decision = CM_Interleave; 7170 Cost = InterleaveCost; 7171 } else if (GatherScatterCost < ScalarizationCost) { 7172 Decision = CM_GatherScatter; 7173 Cost = GatherScatterCost; 7174 } else { 7175 assert(!VF.isScalable() && 7176 "We cannot yet scalarise for scalable vectors"); 7177 Decision = CM_Scalarize; 7178 Cost = ScalarizationCost; 7179 } 7180 // If the instructions belongs to an interleave group, the whole group 7181 // receives the same decision. 
The whole group receives the cost, but 7182 // the cost will actually be assigned to one instruction. 7183 if (auto Group = getInterleavedAccessGroup(&I)) 7184 setWideningDecision(Group, VF, Decision, Cost); 7185 else 7186 setWideningDecision(&I, VF, Decision, Cost); 7187 } 7188 } 7189 7190 // Make sure that any load of address and any other address computation 7191 // remains scalar unless there is gather/scatter support. This avoids 7192 // inevitable extracts into address registers, and also has the benefit of 7193 // activating LSR more, since that pass can't optimize vectorized 7194 // addresses. 7195 if (TTI.prefersVectorizedAddressing()) 7196 return; 7197 7198 // Start with all scalar pointer uses. 7199 SmallPtrSet<Instruction *, 8> AddrDefs; 7200 for (BasicBlock *BB : TheLoop->blocks()) 7201 for (Instruction &I : *BB) { 7202 Instruction *PtrDef = 7203 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7204 if (PtrDef && TheLoop->contains(PtrDef) && 7205 getWideningDecision(&I, VF) != CM_GatherScatter) 7206 AddrDefs.insert(PtrDef); 7207 } 7208 7209 // Add all instructions used to generate the addresses. 7210 SmallVector<Instruction *, 4> Worklist; 7211 append_range(Worklist, AddrDefs); 7212 while (!Worklist.empty()) { 7213 Instruction *I = Worklist.pop_back_val(); 7214 for (auto &Op : I->operands()) 7215 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7216 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7217 AddrDefs.insert(InstOp).second) 7218 Worklist.push_back(InstOp); 7219 } 7220 7221 for (auto *I : AddrDefs) { 7222 if (isa<LoadInst>(I)) { 7223 // Setting the desired widening decision should ideally be handled in 7224 // by cost functions, but since this involves the task of finding out 7225 // if the loaded register is involved in an address computation, it is 7226 // instead changed here when we know this is the case. 7227 InstWidening Decision = getWideningDecision(I, VF); 7228 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7229 // Scalarize a widened load of address. 7230 setWideningDecision( 7231 I, VF, CM_Scalarize, 7232 (VF.getKnownMinValue() * 7233 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7234 else if (auto Group = getInterleavedAccessGroup(I)) { 7235 // Scalarize an interleave group of address loads. 7236 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7237 if (Instruction *Member = Group->getMember(I)) 7238 setWideningDecision( 7239 Member, VF, CM_Scalarize, 7240 (VF.getKnownMinValue() * 7241 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7242 } 7243 } 7244 } else 7245 // Make sure I gets scalarized and a cost estimate without 7246 // scalarization overhead. 7247 ForcedScalars[VF].insert(I); 7248 } 7249 } 7250 7251 InstructionCost 7252 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7253 Type *&VectorTy) { 7254 Type *RetTy = I->getType(); 7255 if (canTruncateToMinimalBitwidth(I, VF)) 7256 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7257 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7258 auto SE = PSE.getSE(); 7259 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7260 7261 // TODO: We need to estimate the cost of intrinsic calls. 7262 switch (I->getOpcode()) { 7263 case Instruction::GetElementPtr: 7264 // We mark this instruction as zero-cost because the cost of GEPs in 7265 // vectorized code depends on whether the corresponding memory instruction 7266 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7267 // instruction cost. 7268 return 0; 7269 case Instruction::Br: { 7270 // In cases of scalarized and predicated instructions, there will be VF 7271 // predicated blocks in the vectorized loop. Each branch around these 7272 // blocks requires also an extract of its vector compare i1 element. 7273 bool ScalarPredicatedBB = false; 7274 BranchInst *BI = cast<BranchInst>(I); 7275 if (VF.isVector() && BI->isConditional() && 7276 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7277 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7278 ScalarPredicatedBB = true; 7279 7280 if (ScalarPredicatedBB) { 7281 // Return cost for branches around scalarized and predicated blocks. 7282 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7283 auto *Vec_i1Ty = 7284 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7285 return (TTI.getScalarizationOverhead( 7286 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7287 false, true) + 7288 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7289 VF.getKnownMinValue())); 7290 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7291 // The back-edge branch will remain, as will all scalar branches. 7292 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7293 else 7294 // This branch will be eliminated by if-conversion. 7295 return 0; 7296 // Note: We currently assume zero cost for an unconditional branch inside 7297 // a predicated block since it will become a fall-through, although we 7298 // may decide in the future to call TTI for all branches. 7299 } 7300 case Instruction::PHI: { 7301 auto *Phi = cast<PHINode>(I); 7302 7303 // First-order recurrences are replaced by vector shuffles inside the loop. 7304 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7305 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7306 return TTI.getShuffleCost( 7307 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7308 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7309 7310 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7311 // converted into select instructions. We require N - 1 selects per phi 7312 // node, where N is the number of incoming values. 7313 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7314 return (Phi->getNumIncomingValues() - 1) * 7315 TTI.getCmpSelInstrCost( 7316 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7317 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7318 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7319 7320 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7321 } 7322 case Instruction::UDiv: 7323 case Instruction::SDiv: 7324 case Instruction::URem: 7325 case Instruction::SRem: 7326 // If we have a predicated instruction, it may not be executed for each 7327 // vector lane. Get the scalarization cost and scale this amount by the 7328 // probability of executing the predicated block. If the instruction is not 7329 // predicated, we fall through to the next case. 7330 if (VF.isVector() && isScalarWithPredication(I)) { 7331 InstructionCost Cost = 0; 7332 7333 // These instructions have a non-void type, so account for the phi nodes 7334 // that we will create. This cost is likely to be zero. The phi node 7335 // cost, if any, should be scaled by the block probability because it 7336 // models a copy at the end of each predicated block. 
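      // As a sketch, for a predicated udiv at VF = 4 under the default 50%
      // block-probability assumption this evaluates to roughly:
      //   (4 * PhiCost + 4 * UDivCost + InsertExtractOverhead) / 2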
7337 Cost += VF.getKnownMinValue() * 7338 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7339 7340 // The cost of the non-predicated instruction. 7341 Cost += VF.getKnownMinValue() * 7342 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7343 7344 // The cost of insertelement and extractelement instructions needed for 7345 // scalarization. 7346 Cost += getScalarizationOverhead(I, VF); 7347 7348 // Scale the cost by the probability of executing the predicated blocks. 7349 // This assumes the predicated block for each vector lane is equally 7350 // likely. 7351 return Cost / getReciprocalPredBlockProb(); 7352 } 7353 LLVM_FALLTHROUGH; 7354 case Instruction::Add: 7355 case Instruction::FAdd: 7356 case Instruction::Sub: 7357 case Instruction::FSub: 7358 case Instruction::Mul: 7359 case Instruction::FMul: 7360 case Instruction::FDiv: 7361 case Instruction::FRem: 7362 case Instruction::Shl: 7363 case Instruction::LShr: 7364 case Instruction::AShr: 7365 case Instruction::And: 7366 case Instruction::Or: 7367 case Instruction::Xor: { 7368 // Since we will replace the stride by 1 the multiplication should go away. 7369 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7370 return 0; 7371 7372 // Detect reduction patterns 7373 InstructionCost RedCost; 7374 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7375 .isValid()) 7376 return RedCost; 7377 7378 // Certain instructions can be cheaper to vectorize if they have a constant 7379 // second vector operand. One example of this are shifts on x86. 7380 Value *Op2 = I->getOperand(1); 7381 TargetTransformInfo::OperandValueProperties Op2VP; 7382 TargetTransformInfo::OperandValueKind Op2VK = 7383 TTI.getOperandInfo(Op2, Op2VP); 7384 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7385 Op2VK = TargetTransformInfo::OK_UniformValue; 7386 7387 SmallVector<const Value *, 4> Operands(I->operand_values()); 7388 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7389 return N * TTI.getArithmeticInstrCost( 7390 I->getOpcode(), VectorTy, CostKind, 7391 TargetTransformInfo::OK_AnyValue, 7392 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7393 } 7394 case Instruction::FNeg: { 7395 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7396 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7397 return N * TTI.getArithmeticInstrCost( 7398 I->getOpcode(), VectorTy, CostKind, 7399 TargetTransformInfo::OK_AnyValue, 7400 TargetTransformInfo::OK_AnyValue, 7401 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7402 I->getOperand(0), I); 7403 } 7404 case Instruction::Select: { 7405 SelectInst *SI = cast<SelectInst>(I); 7406 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7407 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7408 Type *CondTy = SI->getCondition()->getType(); 7409 if (!ScalarCond) 7410 CondTy = VectorType::get(CondTy, VF); 7411 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7412 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7413 } 7414 case Instruction::ICmp: 7415 case Instruction::FCmp: { 7416 Type *ValTy = I->getOperand(0)->getType(); 7417 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7418 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7419 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7420 VectorTy = ToVectorTy(ValTy, VF); 7421 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7422 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7423 } 7424 case Instruction::Store: 7425 case Instruction::Load: { 7426 ElementCount Width = VF; 7427 if (Width.isVector()) { 7428 InstWidening Decision = getWideningDecision(I, Width); 7429 assert(Decision != CM_Unknown && 7430 "CM decision should be taken at this point"); 7431 if (Decision == CM_Scalarize) 7432 Width = ElementCount::getFixed(1); 7433 } 7434 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7435 return getMemoryInstructionCost(I, VF); 7436 } 7437 case Instruction::ZExt: 7438 case Instruction::SExt: 7439 case Instruction::FPToUI: 7440 case Instruction::FPToSI: 7441 case Instruction::FPExt: 7442 case Instruction::PtrToInt: 7443 case Instruction::IntToPtr: 7444 case Instruction::SIToFP: 7445 case Instruction::UIToFP: 7446 case Instruction::Trunc: 7447 case Instruction::FPTrunc: 7448 case Instruction::BitCast: { 7449 // Computes the CastContextHint from a Load/Store instruction. 7450 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7451 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7452 "Expected a load or a store!"); 7453 7454 if (VF.isScalar() || !TheLoop->contains(I)) 7455 return TTI::CastContextHint::Normal; 7456 7457 switch (getWideningDecision(I, VF)) { 7458 case LoopVectorizationCostModel::CM_GatherScatter: 7459 return TTI::CastContextHint::GatherScatter; 7460 case LoopVectorizationCostModel::CM_Interleave: 7461 return TTI::CastContextHint::Interleave; 7462 case LoopVectorizationCostModel::CM_Scalarize: 7463 case LoopVectorizationCostModel::CM_Widen: 7464 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7465 : TTI::CastContextHint::Normal; 7466 case LoopVectorizationCostModel::CM_Widen_Reverse: 7467 return TTI::CastContextHint::Reversed; 7468 case LoopVectorizationCostModel::CM_Unknown: 7469 llvm_unreachable("Instr did not go through cost modelling?"); 7470 } 7471 7472 llvm_unreachable("Unhandled case!"); 7473 }; 7474 7475 unsigned Opcode = I->getOpcode(); 7476 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7477 // For Trunc, the context is the only user, which must be a StoreInst. 
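    // For example, a trunc whose single user is a store that the cost model
    // decided to widen in reverse order receives TTI::CastContextHint::Reversed
    // from ComputeCCH, letting the target price it as part of a reversed,
    // widened store.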
7478 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7479 if (I->hasOneUse()) 7480 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7481 CCH = ComputeCCH(Store); 7482 } 7483 // For Z/Sext, the context is the operand, which must be a LoadInst. 7484 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7485 Opcode == Instruction::FPExt) { 7486 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7487 CCH = ComputeCCH(Load); 7488 } 7489 7490 // We optimize the truncation of induction variables having constant 7491 // integer steps. The cost of these truncations is the same as the scalar 7492 // operation. 7493 if (isOptimizableIVTruncate(I, VF)) { 7494 auto *Trunc = cast<TruncInst>(I); 7495 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7496 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7497 } 7498 7499 // Detect reduction patterns 7500 InstructionCost RedCost; 7501 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7502 .isValid()) 7503 return RedCost; 7504 7505 Type *SrcScalarTy = I->getOperand(0)->getType(); 7506 Type *SrcVecTy = 7507 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7508 if (canTruncateToMinimalBitwidth(I, VF)) { 7509 // This cast is going to be shrunk. This may remove the cast or it might 7510 // turn it into slightly different cast. For example, if MinBW == 16, 7511 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7512 // 7513 // Calculate the modified src and dest types. 7514 Type *MinVecTy = VectorTy; 7515 if (Opcode == Instruction::Trunc) { 7516 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7517 VectorTy = 7518 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7519 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7520 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7521 VectorTy = 7522 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7523 } 7524 } 7525 7526 unsigned N; 7527 if (isScalarAfterVectorization(I, VF)) { 7528 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7529 N = VF.getKnownMinValue(); 7530 } else 7531 N = 1; 7532 return N * 7533 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7534 } 7535 case Instruction::Call: { 7536 bool NeedToScalarize; 7537 CallInst *CI = cast<CallInst>(I); 7538 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7539 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7540 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7541 return std::min(CallCost, IntrinsicCost); 7542 } 7543 return CallCost; 7544 } 7545 case Instruction::ExtractValue: 7546 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7547 default: 7548 // The cost of executing VF copies of the scalar instruction. This opcode 7549 // is unknown. Assume that it is the same as 'mul'. 7550 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7551 Instruction::Mul, VectorTy, CostKind) + 7552 getScalarizationOverhead(I, VF); 7553 } // end of switch. 
7554 } 7555 7556 char LoopVectorize::ID = 0; 7557 7558 static const char lv_name[] = "Loop Vectorization"; 7559 7560 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7561 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7562 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7563 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7564 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7565 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7566 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7567 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7568 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7569 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7570 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7571 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7572 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7573 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7574 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7575 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7576 7577 namespace llvm { 7578 7579 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7580 7581 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7582 bool VectorizeOnlyWhenForced) { 7583 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7584 } 7585 7586 } // end namespace llvm 7587 7588 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7589 // Check if the pointer operand of a load or store instruction is 7590 // consecutive. 7591 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7592 return Legal->isConsecutivePtr(Ptr); 7593 return false; 7594 } 7595 7596 void LoopVectorizationCostModel::collectValuesToIgnore() { 7597 // Ignore ephemeral values. 7598 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7599 7600 // Ignore type-promoting instructions we identified during reduction 7601 // detection. 7602 for (auto &Reduction : Legal->getReductionVars()) { 7603 RecurrenceDescriptor &RedDes = Reduction.second; 7604 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7605 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7606 } 7607 // Ignore type-casting instructions we identified during induction 7608 // detection. 7609 for (auto &Induction : Legal->getInductionVars()) { 7610 InductionDescriptor &IndDes = Induction.second; 7611 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7612 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7613 } 7614 } 7615 7616 void LoopVectorizationCostModel::collectInLoopReductions() { 7617 for (auto &Reduction : Legal->getReductionVars()) { 7618 PHINode *Phi = Reduction.first; 7619 RecurrenceDescriptor &RdxDesc = Reduction.second; 7620 7621 // We don't collect reductions that are type promoted (yet). 7622 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7623 continue; 7624 7625 // If the target would prefer this reduction to happen "in-loop", then we 7626 // want to record it as such. 7627 unsigned Opcode = RdxDesc.getOpcode(); 7628 if (!PreferInLoopReductions && 7629 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7630 TargetTransformInfo::ReductionFlags())) 7631 continue; 7632 7633 // Check that we can correctly put the reductions into the loop, by 7634 // finding the chain of operations that leads from the phi to the loop 7635 // exit value. 
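    // For a plain integer sum reduction the chain is just the single add that
    // feeds the loop-exit value, e.g. (a sketch):
    //   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
    //   %sum.next = add i32 %sum, %val   ; ReductionOperations == {%sum.next}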
7636 SmallVector<Instruction *, 4> ReductionOperations = 7637 RdxDesc.getReductionOpChain(Phi, TheLoop); 7638 bool InLoop = !ReductionOperations.empty(); 7639 if (InLoop) { 7640 InLoopReductionChains[Phi] = ReductionOperations; 7641 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7642 Instruction *LastChain = Phi; 7643 for (auto *I : ReductionOperations) { 7644 InLoopReductionImmediateChains[I] = LastChain; 7645 LastChain = I; 7646 } 7647 } 7648 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7649 << " reduction for phi: " << *Phi << "\n"); 7650 } 7651 } 7652 7653 // TODO: we could return a pair of values that specify the max VF and 7654 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7655 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7656 // doesn't have a cost model that can choose which plan to execute if 7657 // more than one is generated. 7658 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7659 LoopVectorizationCostModel &CM) { 7660 unsigned WidestType; 7661 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7662 return WidestVectorRegBits / WidestType; 7663 } 7664 7665 VectorizationFactor 7666 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7667 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7668 ElementCount VF = UserVF; 7669 // Outer loop handling: They may require CFG and instruction level 7670 // transformations before even evaluating whether vectorization is profitable. 7671 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7672 // the vectorization pipeline. 7673 if (!OrigLoop->isInnermost()) { 7674 // If the user doesn't provide a vectorization factor, determine a 7675 // reasonable one. 7676 if (UserVF.isZero()) { 7677 VF = ElementCount::getFixed(determineVPlanVF( 7678 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7679 .getFixedSize(), 7680 CM)); 7681 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7682 7683 // Make sure we have a VF > 1 for stress testing. 7684 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7685 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7686 << "overriding computed VF.\n"); 7687 VF = ElementCount::getFixed(4); 7688 } 7689 } 7690 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7691 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7692 "VF needs to be a power of two"); 7693 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7694 << "VF " << VF << " to build VPlans.\n"); 7695 buildVPlans(VF, VF); 7696 7697 // For VPlan build stress testing, we bail out after VPlan construction. 7698 if (VPlanBuildStressTest) 7699 return VectorizationFactor::Disabled(); 7700 7701 return {VF, 0 /*Cost*/}; 7702 } 7703 7704 LLVM_DEBUG( 7705 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7706 "VPlan-native path.\n"); 7707 return VectorizationFactor::Disabled(); 7708 } 7709 7710 Optional<VectorizationFactor> 7711 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7712 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7713 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7714 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7715 return None; 7716 7717 // Invalidate interleave groups if all blocks of loop will be predicated. 
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  ElementCount MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF.isNonZero() && "MaxVF is zero.");

  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
  if (!UserVF.isZero() &&
      (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
    // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
    // VFs here; this should be reverted to only use legal UserVFs once the
    // loop below supports scalable VFs.
    ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
                      << " VF " << VF << ".\n");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(VF);
    CM.collectInLoopReductions();
    buildVPlansWithVPRecipes(VF, VF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{VF, 0}};
  }

  assert(!MaxVF.isScalable() &&
         "Scalable vectors not yet supported beyond this point");

  for (ElementCount VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();

  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (MaxVF.isScalar())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  return CM.selectVectorizationFactor(MaxVF);
}

void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
}

void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  assert(BestVF.hasValue() && "Vectorization Factor is missing");
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");

  VPTransformState State{
      *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phis, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
  SmallVector<BasicBlock *> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp are often dead truncs, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis.
We don't need any handling for them in the 7880 // vectorized loop because we have proven that, under a proper runtime 7881 // test guarding the vectorized loop, the value of the phi, and the casted 7882 // value of the phi, are the same. The last instruction in this casting chain 7883 // will get its scalar/vector/widened def from the scalar/vector/widened def 7884 // of the respective phi node. Any other casts in the induction def-use chain 7885 // have no other uses outside the phi update chain, and will be ignored. 7886 InductionDescriptor &IndDes = Induction.second; 7887 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7888 DeadInstructions.insert(Casts.begin(), Casts.end()); 7889 } 7890 } 7891 7892 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7893 7894 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7895 7896 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7897 Instruction::BinaryOps BinOp) { 7898 // When unrolling and the VF is 1, we only need to add a simple scalar. 7899 Type *Ty = Val->getType(); 7900 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7901 7902 if (Ty->isFloatingPointTy()) { 7903 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7904 7905 // Floating-point operations inherit FMF via the builder's flags. 7906 Value *MulOp = Builder.CreateFMul(C, Step); 7907 return Builder.CreateBinOp(BinOp, Val, MulOp); 7908 } 7909 Constant *C = ConstantInt::get(Ty, StartIdx); 7910 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7911 } 7912 7913 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7914 SmallVector<Metadata *, 4> MDs; 7915 // Reserve first location for self reference to the LoopID metadata node. 7916 MDs.push_back(nullptr); 7917 bool IsUnrollMetadata = false; 7918 MDNode *LoopID = L->getLoopID(); 7919 if (LoopID) { 7920 // First find existing loop unrolling disable metadata. 7921 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7922 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7923 if (MD) { 7924 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7925 IsUnrollMetadata = 7926 S && S->getString().startswith("llvm.loop.unroll.disable"); 7927 } 7928 MDs.push_back(LoopID->getOperand(i)); 7929 } 7930 } 7931 7932 if (!IsUnrollMetadata) { 7933 // Add runtime unroll disable metadata. 7934 LLVMContext &Context = L->getHeader()->getContext(); 7935 SmallVector<Metadata *, 1> DisableOperands; 7936 DisableOperands.push_back( 7937 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7938 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7939 MDs.push_back(DisableNode); 7940 MDNode *NewLoopID = MDNode::get(Context, MDs); 7941 // Set operand 0 to refer to the loop id itself. 7942 NewLoopID->replaceOperandWith(0, NewLoopID); 7943 L->setLoopID(NewLoopID); 7944 } 7945 } 7946 7947 //===--------------------------------------------------------------------===// 7948 // EpilogueVectorizerMainLoop 7949 //===--------------------------------------------------------------------===// 7950 7951 /// This function is partially responsible for generating the control flow 7952 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
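///
/// As a rough sketch, this first pass emits the following guards in order,
/// each of which initially bypasses to the scalar preheader (the second pass
/// later retargets the main-loop check to the vector epilogue):
///   1. iter.check                  - enough iterations for even the epilogue VF?
///   2. SCEV and memory runtime checks, if required
///   3. vector.main.loop.iter.check - enough iterations for the main VF * UF?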
7953 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7954 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7955 Loop *Lp = createVectorLoopSkeleton(""); 7956 7957 // Generate the code to check the minimum iteration count of the vector 7958 // epilogue (see below). 7959 EPI.EpilogueIterationCountCheck = 7960 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7961 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7962 7963 // Generate the code to check any assumptions that we've made for SCEV 7964 // expressions. 7965 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 7966 7967 // Generate the code that checks at runtime if arrays overlap. We put the 7968 // checks into a separate block to make the more common case of few elements 7969 // faster. 7970 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7971 7972 // Generate the iteration count check for the main loop, *after* the check 7973 // for the epilogue loop, so that the path-length is shorter for the case 7974 // that goes directly through the vector epilogue. The longer-path length for 7975 // the main loop is compensated for, by the gain from vectorizing the larger 7976 // trip count. Note: the branch will get updated later on when we vectorize 7977 // the epilogue. 7978 EPI.MainLoopIterationCountCheck = 7979 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7980 7981 // Generate the induction variable. 7982 OldInduction = Legal->getPrimaryInduction(); 7983 Type *IdxTy = Legal->getWidestInductionType(); 7984 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7985 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7986 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7987 EPI.VectorTripCount = CountRoundDown; 7988 Induction = 7989 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7990 getDebugLocFromInstOrOperands(OldInduction)); 7991 7992 // Skip induction resume value creation here because they will be created in 7993 // the second pass. If we created them here, they wouldn't be used anyway, 7994 // because the vplan in the second pass still contains the inductions from the 7995 // original loop. 7996 7997 return completeLoopSkeleton(Lp, OrigLoopID); 7998 } 7999 8000 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8001 LLVM_DEBUG({ 8002 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8003 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8004 << ", Main Loop UF:" << EPI.MainLoopUF 8005 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8006 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8007 }); 8008 } 8009 8010 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8011 DEBUG_WITH_TYPE(VerboseDebug, { 8012 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8013 }); 8014 } 8015 8016 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8017 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8018 assert(L && "Expected valid Loop."); 8019 assert(Bypass && "Expected valid bypass basic block."); 8020 unsigned VFactor = 8021 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8022 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8023 Value *Count = getOrCreateTripCount(L); 8024 // Reuse existing vector loop preheader for TC checks. 8025 // Note that new preheader block is generated for vector loop. 
8026 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8027 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8028 8029 // Generate code to check if the loop's trip count is less than VF * UF of the 8030 // main vector loop. 8031 auto P = 8032 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8033 8034 Value *CheckMinIters = Builder.CreateICmp( 8035 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8036 "min.iters.check"); 8037 8038 if (!ForEpilogue) 8039 TCCheckBlock->setName("vector.main.loop.iter.check"); 8040 8041 // Create new preheader for vector loop. 8042 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8043 DT, LI, nullptr, "vector.ph"); 8044 8045 if (ForEpilogue) { 8046 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8047 DT->getNode(Bypass)->getIDom()) && 8048 "TC check is expected to dominate Bypass"); 8049 8050 // Update dominator for Bypass & LoopExit. 8051 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8052 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8053 8054 LoopBypassBlocks.push_back(TCCheckBlock); 8055 8056 // Save the trip count so we don't have to regenerate it in the 8057 // vec.epilog.iter.check. This is safe to do because the trip count 8058 // generated here dominates the vector epilog iter check. 8059 EPI.TripCount = Count; 8060 } 8061 8062 ReplaceInstWithInst( 8063 TCCheckBlock->getTerminator(), 8064 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8065 8066 return TCCheckBlock; 8067 } 8068 8069 //===--------------------------------------------------------------------===// 8070 // EpilogueVectorizerEpilogueLoop 8071 //===--------------------------------------------------------------------===// 8072 8073 /// This function is partially responsible for generating the control flow 8074 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8075 BasicBlock * 8076 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8077 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8078 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8079 8080 // Now, compare the remaining count and if there aren't enough iterations to 8081 // execute the vectorized epilogue skip to the scalar part. 8082 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8083 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8084 LoopVectorPreHeader = 8085 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8086 LI, nullptr, "vec.epilog.ph"); 8087 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8088 VecEpilogueIterationCountCheck); 8089 8090 // Adjust the control flow taking the state info from the main loop 8091 // vectorization into account. 
8092 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8093 "expected this to be saved from the previous pass."); 8094 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8095 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8096 8097 DT->changeImmediateDominator(LoopVectorPreHeader, 8098 EPI.MainLoopIterationCountCheck); 8099 8100 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8101 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8102 8103 if (EPI.SCEVSafetyCheck) 8104 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8105 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8106 if (EPI.MemSafetyCheck) 8107 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8108 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8109 8110 DT->changeImmediateDominator( 8111 VecEpilogueIterationCountCheck, 8112 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8113 8114 DT->changeImmediateDominator(LoopScalarPreHeader, 8115 EPI.EpilogueIterationCountCheck); 8116 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8117 8118 // Keep track of bypass blocks, as they feed start values to the induction 8119 // phis in the scalar loop preheader. 8120 if (EPI.SCEVSafetyCheck) 8121 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8122 if (EPI.MemSafetyCheck) 8123 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8124 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8125 8126 // Generate a resume induction for the vector epilogue and put it in the 8127 // vector epilogue preheader 8128 Type *IdxTy = Legal->getWidestInductionType(); 8129 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8130 LoopVectorPreHeader->getFirstNonPHI()); 8131 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8132 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8133 EPI.MainLoopIterationCountCheck); 8134 8135 // Generate the induction variable. 8136 OldInduction = Legal->getPrimaryInduction(); 8137 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8138 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8139 Value *StartIdx = EPResumeVal; 8140 Induction = 8141 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8142 getDebugLocFromInstOrOperands(OldInduction)); 8143 8144 // Generate induction resume values. These variables save the new starting 8145 // indexes for the scalar loop. They are used to test if there are any tail 8146 // iterations left once the vector loop has completed. 8147 // Note that when the vectorized epilogue is skipped due to iteration count 8148 // check, then the resume value for the induction variable comes from 8149 // the trip count of the main vector loop, hence passing the AdditionalBypass 8150 // argument. 
8151 createInductionResumeValues(Lp, CountRoundDown, 8152 {VecEpilogueIterationCountCheck, 8153 EPI.VectorTripCount} /* AdditionalBypass */); 8154 8155 AddRuntimeUnrollDisableMetaData(Lp); 8156 return completeLoopSkeleton(Lp, OrigLoopID); 8157 } 8158 8159 BasicBlock * 8160 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8161 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8162 8163 assert(EPI.TripCount && 8164 "Expected trip count to have been safed in the first pass."); 8165 assert( 8166 (!isa<Instruction>(EPI.TripCount) || 8167 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8168 "saved trip count does not dominate insertion point."); 8169 Value *TC = EPI.TripCount; 8170 IRBuilder<> Builder(Insert->getTerminator()); 8171 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8172 8173 // Generate code to check if the loop's trip count is less than VF * UF of the 8174 // vector epilogue loop. 8175 auto P = 8176 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8177 8178 Value *CheckMinIters = Builder.CreateICmp( 8179 P, Count, 8180 ConstantInt::get(Count->getType(), 8181 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8182 "min.epilog.iters.check"); 8183 8184 ReplaceInstWithInst( 8185 Insert->getTerminator(), 8186 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8187 8188 LoopBypassBlocks.push_back(Insert); 8189 return Insert; 8190 } 8191 8192 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8193 LLVM_DEBUG({ 8194 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8195 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8196 << ", Main Loop UF:" << EPI.MainLoopUF 8197 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8198 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8199 }); 8200 } 8201 8202 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8203 DEBUG_WITH_TYPE(VerboseDebug, { 8204 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8205 }); 8206 } 8207 8208 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8209 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8210 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8211 bool PredicateAtRangeStart = Predicate(Range.Start); 8212 8213 for (ElementCount TmpVF = Range.Start * 2; 8214 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8215 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8216 Range.End = TmpVF; 8217 break; 8218 } 8219 8220 return PredicateAtRangeStart; 8221 } 8222 8223 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8224 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8225 /// of VF's starting at a given VF and extending it as much as possible. Each 8226 /// vectorization decision can potentially shorten this sub-range during 8227 /// buildVPlan(). 8228 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8229 ElementCount MaxVF) { 8230 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8231 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8232 VFRange SubRange = {VF, MaxVFPlusOne}; 8233 VPlans.push_back(buildVPlan(SubRange)); 8234 VF = SubRange.End; 8235 } 8236 } 8237 8238 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8239 VPlanPtr &Plan) { 8240 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8241 8242 // Look for cached value. 
8243 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8244 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8245 if (ECEntryIt != EdgeMaskCache.end()) 8246 return ECEntryIt->second; 8247 8248 VPValue *SrcMask = createBlockInMask(Src, Plan); 8249 8250 // The terminator has to be a branch inst! 8251 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8252 assert(BI && "Unexpected terminator found"); 8253 8254 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8255 return EdgeMaskCache[Edge] = SrcMask; 8256 8257 // If source is an exiting block, we know the exit edge is dynamically dead 8258 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8259 // adding uses of an otherwise potentially dead instruction. 8260 if (OrigLoop->isLoopExiting(Src)) 8261 return EdgeMaskCache[Edge] = SrcMask; 8262 8263 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8264 assert(EdgeMask && "No Edge Mask found for condition"); 8265 8266 if (BI->getSuccessor(0) != Dst) 8267 EdgeMask = Builder.createNot(EdgeMask); 8268 8269 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8270 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8271 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8272 // The select version does not introduce new UB if SrcMask is false and 8273 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8274 VPValue *False = Plan->getOrAddVPValue( 8275 ConstantInt::getFalse(BI->getCondition()->getType())); 8276 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8277 } 8278 8279 return EdgeMaskCache[Edge] = EdgeMask; 8280 } 8281 8282 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8283 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8284 8285 // Look for cached value. 8286 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8287 if (BCEntryIt != BlockMaskCache.end()) 8288 return BCEntryIt->second; 8289 8290 // All-one mask is modelled as no-mask following the convention for masked 8291 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8292 VPValue *BlockMask = nullptr; 8293 8294 if (OrigLoop->getHeader() == BB) { 8295 if (!CM.blockNeedsPredication(BB)) 8296 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8297 8298 // Create the block in mask as the first non-phi instruction in the block. 8299 VPBuilder::InsertPointGuard Guard(Builder); 8300 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8301 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8302 8303 // Introduce the early-exit compare IV <= BTC to form header block mask. 8304 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8305 // Start by constructing the desired canonical IV. 
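// For illustration only: with tail folding and VF=4 the resulting header mask
// is conceptually
//   %mask = icmp ule <4 x i64> <iv, iv+1, iv+2, iv+3>, splat(%btc)
// i.e. lanes whose index exceeds the backedge-taken count are disabled, which
// is what lets the vector loop also execute the tail iterations.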
8306 VPValue *IV = nullptr; 8307 if (Legal->getPrimaryInduction()) 8308 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8309 else { 8310 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8311 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8312 IV = IVRecipe->getVPValue(); 8313 } 8314 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8315 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8316 8317 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8318 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8319 // as a second argument, we only pass the IV here and extract the 8320 // tripcount from the transform state where codegen of the VP instructions 8321 // happen. 8322 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8323 } else { 8324 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8325 } 8326 return BlockMaskCache[BB] = BlockMask; 8327 } 8328 8329 // This is the block mask. We OR all incoming edges. 8330 for (auto *Predecessor : predecessors(BB)) { 8331 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8332 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8333 return BlockMaskCache[BB] = EdgeMask; 8334 8335 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8336 BlockMask = EdgeMask; 8337 continue; 8338 } 8339 8340 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8341 } 8342 8343 return BlockMaskCache[BB] = BlockMask; 8344 } 8345 8346 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8347 VPlanPtr &Plan) { 8348 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8349 "Must be called with either a load or store"); 8350 8351 auto willWiden = [&](ElementCount VF) -> bool { 8352 if (VF.isScalar()) 8353 return false; 8354 LoopVectorizationCostModel::InstWidening Decision = 8355 CM.getWideningDecision(I, VF); 8356 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8357 "CM decision should be taken at this point."); 8358 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8359 return true; 8360 if (CM.isScalarAfterVectorization(I, VF) || 8361 CM.isProfitableToScalarize(I, VF)) 8362 return false; 8363 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8364 }; 8365 8366 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8367 return nullptr; 8368 8369 VPValue *Mask = nullptr; 8370 if (Legal->isMaskRequired(I)) 8371 Mask = createBlockInMask(I->getParent(), Plan); 8372 8373 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8374 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8375 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8376 8377 StoreInst *Store = cast<StoreInst>(I); 8378 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8379 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8380 } 8381 8382 VPWidenIntOrFpInductionRecipe * 8383 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8384 // Check if this is an integer or fp induction. If so, build the recipe that 8385 // produces its scalar and vector values. 
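// For example, for a canonical induction starting at 0 with step 1 and VF=4,
// the recipe conceptually produces the vector IV <0,1,2,3> for the first part
// and advances it by a splat of VF * UF on every vector iteration, while still
// providing scalar lane values for any users that remain scalar.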
8386 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8387 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8388 II.getKind() == InductionDescriptor::IK_FpInduction) { 8389 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8390 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8391 return new VPWidenIntOrFpInductionRecipe( 8392 Phi, Start, Casts.empty() ? nullptr : Casts.front()); 8393 } 8394 8395 return nullptr; 8396 } 8397 8398 VPWidenIntOrFpInductionRecipe * 8399 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8400 VPlan &Plan) const { 8401 // Optimize the special case where the source is a constant integer 8402 // induction variable. Notice that we can only optimize the 'trunc' case 8403 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8404 // (c) other casts depend on pointer size. 8405 8406 // Determine whether \p K is a truncation based on an induction variable that 8407 // can be optimized. 8408 auto isOptimizableIVTruncate = 8409 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8410 return [=](ElementCount VF) -> bool { 8411 return CM.isOptimizableIVTruncate(K, VF); 8412 }; 8413 }; 8414 8415 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8416 isOptimizableIVTruncate(I), Range)) { 8417 8418 InductionDescriptor II = 8419 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8420 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8421 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8422 Start, nullptr, I); 8423 } 8424 return nullptr; 8425 } 8426 8427 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8428 // If all incoming values are equal, the incoming VPValue can be used directly 8429 // instead of creating a new VPBlendRecipe. 8430 Value *FirstIncoming = Phi->getIncomingValue(0); 8431 if (all_of(Phi->incoming_values(), [FirstIncoming](const Value *Inc) { 8432 return FirstIncoming == Inc; 8433 })) { 8434 return Plan->getOrAddVPValue(Phi->getIncomingValue(0)); 8435 } 8436 8437 // We know that all PHIs in non-header blocks are converted into selects, so 8438 // we don't have to worry about the insertion order and we can just use the 8439 // builder. At this point we generate the predication tree. There may be 8440 // duplications since this is a simple recursive scan, but future 8441 // optimizations will clean it up. 
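// For illustration only: a phi such as
//   %r = phi i32 [ %a, %then ], [ %b, %else ]
// becomes a VPBlendRecipe with operands {%a, mask(then->bb), %b, mask(else->bb)},
// which VPBlendRecipe::execute later lowers to a chain of selects.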
8442 SmallVector<VPValue *, 2> Operands; 8443 unsigned NumIncoming = Phi->getNumIncomingValues(); 8444 8445 for (unsigned In = 0; In < NumIncoming; In++) { 8446 VPValue *EdgeMask = 8447 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8448 assert((EdgeMask || NumIncoming == 1) && 8449 "Multiple predecessors with one having a full mask"); 8450 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8451 if (EdgeMask) 8452 Operands.push_back(EdgeMask); 8453 } 8454 return toVPRecipeResult(new VPBlendRecipe(Phi, Operands)); 8455 } 8456 8457 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8458 VPlan &Plan) const { 8459 8460 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8461 [this, CI](ElementCount VF) { 8462 return CM.isScalarWithPredication(CI, VF); 8463 }, 8464 Range); 8465 8466 if (IsPredicated) 8467 return nullptr; 8468 8469 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8470 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8471 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8472 ID == Intrinsic::pseudoprobe || 8473 ID == Intrinsic::experimental_noalias_scope_decl)) 8474 return nullptr; 8475 8476 auto willWiden = [&](ElementCount VF) -> bool { 8477 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8478 // The following case may be scalarized depending on the VF. 8479 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8480 // version of the instruction. 8481 // Is it beneficial to perform intrinsic call compared to lib call? 8482 bool NeedToScalarize = false; 8483 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8484 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8485 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8486 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8487 "Cannot have invalid costs while widening"); 8488 return UseVectorIntrinsic || !NeedToScalarize; 8489 }; 8490 8491 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8492 return nullptr; 8493 8494 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8495 } 8496 8497 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8498 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8499 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8500 // Instruction should be widened, unless it is scalar after vectorization, 8501 // scalarization is profitable or it is predicated. 
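// For illustration only: if the cost model scalarizes I for VF >= 8 but widens
// it for smaller VFs, then for Range = [4, 16) the predicate below flips at
// VF = 8, so getDecisionAndClampRange clamps Range to [4, 8) and the answer
// returned here holds uniformly for the remaining range.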
8502 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8503 return CM.isScalarAfterVectorization(I, VF) || 8504 CM.isProfitableToScalarize(I, VF) || 8505 CM.isScalarWithPredication(I, VF); 8506 }; 8507 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8508 Range); 8509 } 8510 8511 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8512 auto IsVectorizableOpcode = [](unsigned Opcode) { 8513 switch (Opcode) { 8514 case Instruction::Add: 8515 case Instruction::And: 8516 case Instruction::AShr: 8517 case Instruction::BitCast: 8518 case Instruction::FAdd: 8519 case Instruction::FCmp: 8520 case Instruction::FDiv: 8521 case Instruction::FMul: 8522 case Instruction::FNeg: 8523 case Instruction::FPExt: 8524 case Instruction::FPToSI: 8525 case Instruction::FPToUI: 8526 case Instruction::FPTrunc: 8527 case Instruction::FRem: 8528 case Instruction::FSub: 8529 case Instruction::ICmp: 8530 case Instruction::IntToPtr: 8531 case Instruction::LShr: 8532 case Instruction::Mul: 8533 case Instruction::Or: 8534 case Instruction::PtrToInt: 8535 case Instruction::SDiv: 8536 case Instruction::Select: 8537 case Instruction::SExt: 8538 case Instruction::Shl: 8539 case Instruction::SIToFP: 8540 case Instruction::SRem: 8541 case Instruction::Sub: 8542 case Instruction::Trunc: 8543 case Instruction::UDiv: 8544 case Instruction::UIToFP: 8545 case Instruction::URem: 8546 case Instruction::Xor: 8547 case Instruction::ZExt: 8548 return true; 8549 } 8550 return false; 8551 }; 8552 8553 if (!IsVectorizableOpcode(I->getOpcode())) 8554 return nullptr; 8555 8556 // Success: widen this instruction. 8557 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8558 } 8559 8560 VPBasicBlock *VPRecipeBuilder::handleReplication( 8561 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8562 VPlanPtr &Plan) { 8563 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8564 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8565 Range); 8566 8567 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8568 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8569 Range); 8570 8571 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8572 IsUniform, IsPredicated); 8573 setRecipe(I, Recipe); 8574 Plan->addVPValue(I, Recipe); 8575 8576 // Find if I uses a predicated instruction. If so, it will use its scalar 8577 // value. Avoid hoisting the insert-element which packs the scalar value into 8578 // a vector value, as that happens iff all users use the vector value. 8579 for (VPValue *Op : Recipe->operands()) { 8580 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8581 if (!PredR) 8582 continue; 8583 auto *RepR = 8584 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8585 assert(RepR->isPredicated() && 8586 "expected Replicate recipe to be predicated"); 8587 RepR->setAlsoPack(false); 8588 } 8589 8590 // Finalize the recipe for Instr, first if it is not predicated. 8591 if (!IsPredicated) { 8592 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8593 VPBB->appendRecipe(Recipe); 8594 return VPBB; 8595 } 8596 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8597 assert(VPBB->getSuccessors().empty() && 8598 "VPBB has successors when handling predicated replication."); 8599 // Record predicated instructions for above packing optimizations. 
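// For illustration only: for a predicated sdiv the region built below has the
// triangular shape
//
//        pred.sdiv.entry        (VPBranchOnMaskRecipe)
//         /            \
//   pred.sdiv.if        |       (the replicated sdiv)
//         \            /
//        pred.sdiv.continue     (VPPredInstPHIRecipe, if the sdiv has users)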
8600 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8601 VPBlockUtils::insertBlockAfter(Region, VPBB); 8602 auto *RegSucc = new VPBasicBlock(); 8603 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8604 return RegSucc; 8605 } 8606 8607 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8608 VPRecipeBase *PredRecipe, 8609 VPlanPtr &Plan) { 8610 // Instructions marked for predication are replicated and placed under an 8611 // if-then construct to prevent side-effects. 8612 8613 // Generate recipes to compute the block mask for this region. 8614 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8615 8616 // Build the triangular if-then region. 8617 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8618 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8619 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8620 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8621 auto *PHIRecipe = Instr->getType()->isVoidTy() 8622 ? nullptr 8623 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8624 if (PHIRecipe) { 8625 Plan->removeVPValueFor(Instr); 8626 Plan->addVPValue(Instr, PHIRecipe); 8627 } 8628 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8629 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8630 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8631 8632 // Note: first set Entry as region entry and then connect successors starting 8633 // from it in order, to propagate the "parent" of each VPBasicBlock. 8634 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8635 VPBlockUtils::connectBlocks(Pred, Exit); 8636 8637 return Region; 8638 } 8639 8640 VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8641 VFRange &Range, 8642 VPlanPtr &Plan) { 8643 // First, check for specific widening recipes that deal with calls, memory 8644 // operations, inductions and Phi nodes. 
8645 if (auto *CI = dyn_cast<CallInst>(Instr)) 8646 return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan)); 8647 8648 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8649 return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan)); 8650 8651 VPRecipeBase *Recipe; 8652 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8653 if (Phi->getParent() != OrigLoop->getHeader()) 8654 return tryToBlend(Phi, Plan); 8655 if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) 8656 return toVPRecipeResult(Recipe); 8657 8658 if (Legal->isReductionVariable(Phi)) { 8659 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8660 VPValue *StartV = 8661 Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); 8662 return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV)); 8663 } 8664 8665 return toVPRecipeResult(new VPWidenPHIRecipe(Phi)); 8666 } 8667 8668 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( 8669 cast<TruncInst>(Instr), Range, *Plan))) 8670 return toVPRecipeResult(Recipe); 8671 8672 if (!shouldWiden(Instr, Range)) 8673 return nullptr; 8674 8675 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8676 return toVPRecipeResult(new VPWidenGEPRecipe( 8677 GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop)); 8678 8679 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8680 bool InvariantCond = 8681 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8682 return toVPRecipeResult(new VPWidenSelectRecipe( 8683 *SI, Plan->mapToVPValues(SI->operands()), InvariantCond)); 8684 } 8685 8686 return toVPRecipeResult(tryToWiden(Instr, *Plan)); 8687 } 8688 8689 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8690 ElementCount MaxVF) { 8691 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8692 8693 // Collect instructions from the original loop that will become trivially dead 8694 // in the vectorized loop. We don't need to vectorize these instructions. For 8695 // example, original induction update instructions can become dead because we 8696 // separately emit induction "steps" when generating code for the new loop. 8697 // Similarly, we create a new latch condition when setting up the structure 8698 // of the new loop, so the old one can become dead. 8699 SmallPtrSet<Instruction *, 4> DeadInstructions; 8700 collectTriviallyDeadInstructions(DeadInstructions); 8701 8702 // Add assume instructions we need to drop to DeadInstructions, to prevent 8703 // them from being added to the VPlan. 8704 // TODO: We only need to drop assumes in blocks that get flattend. If the 8705 // control flow is preserved, we should keep them. 8706 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8707 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8708 8709 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8710 // Dead instructions do not need sinking. Remove them from SinkAfter. 
8711 for (Instruction *I : DeadInstructions)
8712 SinkAfter.erase(I);
8713
8714 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8715 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8716 VFRange SubRange = {VF, MaxVFPlusOne};
8717 VPlans.push_back(
8718 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8719 VF = SubRange.End;
8720 }
8721 }
8722
8723 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8724 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8725 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8726
8727 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8728
8729 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8730
8731 // ---------------------------------------------------------------------------
8732 // Pre-construction: record ingredients whose recipes we'll need to further
8733 // process after constructing the initial VPlan.
8734 // ---------------------------------------------------------------------------
8735
8736 // Mark instructions we'll need to sink later and their targets as
8737 // ingredients whose recipe we'll need to record.
8738 for (auto &Entry : SinkAfter) {
8739 RecipeBuilder.recordRecipeOf(Entry.first);
8740 RecipeBuilder.recordRecipeOf(Entry.second);
8741 }
8742 for (auto &Reduction : CM.getInLoopReductionChains()) {
8743 PHINode *Phi = Reduction.first;
8744 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8745 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8746
8747 RecipeBuilder.recordRecipeOf(Phi);
8748 for (auto &R : ReductionOperations) {
8749 RecipeBuilder.recordRecipeOf(R);
8750 // For min/max reductions, where we have a pair of icmp/select, we also
8751 // need to record the ICmp recipe, so it can be removed later.
8752 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8753 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8754 }
8755 }
8756
8757 // For each interleave group which is relevant for this (possibly trimmed)
8758 // Range, add it to the set of groups to be later applied to the VPlan and add
8759 // placeholders for its members' Recipes which we'll be replacing with a
8760 // single VPInterleaveRecipe.
8761 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8762 auto applyIG = [IG, this](ElementCount VF) -> bool {
8763 return (VF.isVector() && // Query is illegal for VF == 1
8764 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8765 LoopVectorizationCostModel::CM_Interleave);
8766 };
8767 if (!getDecisionAndClampRange(applyIG, Range))
8768 continue;
8769 InterleaveGroups.insert(IG);
8770 for (unsigned i = 0; i < IG->getFactor(); i++)
8771 if (Instruction *Member = IG->getMember(i))
8772 RecipeBuilder.recordRecipeOf(Member);
8773 }
8774
8775 // ---------------------------------------------------------------------------
8776 // Build initial VPlan: Scan the body of the loop in a topological order to
8777 // visit each basic block after having visited its predecessor basic blocks.
8778 // ---------------------------------------------------------------------------
8779
8780 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8781 auto Plan = std::make_unique<VPlan>();
8782 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8783 Plan->setEntry(VPBB);
8784
8785 // Scan the body of the loop in a topological order to visit each basic block
8786 // after having visited its predecessor basic blocks.
8787 LoopBlocksDFS DFS(OrigLoop);
8788 DFS.perform(LI);
8789
8790 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8791 // Relevant instructions from basic block BB will be grouped into VPRecipe
8792 // ingredients and fill a new VPBasicBlock.
8793 unsigned VPBBsForBB = 0;
8794 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8795 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8796 VPBB = FirstVPBBForBB;
8797 Builder.setInsertPoint(VPBB);
8798
8799 // Introduce each ingredient into VPlan.
8800 // TODO: Model and preserve debug intrinsics in VPlan.
8801 for (Instruction &I : BB->instructionsWithoutDebug()) {
8802 Instruction *Instr = &I;
8803
8804 // First filter out irrelevant instructions, to ensure no recipes are
8805 // built for them.
8806 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8807 continue;
8808
8809 if (auto RecipeOrValue =
8810 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8811 // If Instr can be simplified to an existing VPValue, use it.
8812 if (RecipeOrValue.is<VPValue *>()) {
8813 Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
8814 continue;
8815 }
8816 // Otherwise, add the new recipe.
8817 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8818 for (auto *Def : Recipe->definedValues()) {
8819 auto *UV = Def->getUnderlyingValue();
8820 Plan->addVPValue(UV, Def);
8821 }
8822
8823 RecipeBuilder.setRecipe(Instr, Recipe);
8824 VPBB->appendRecipe(Recipe);
8825 continue;
8826 }
8827
8828 // Otherwise, if all widening options failed, the instruction is to be
8829 // replicated. This may create a successor for VPBB.
8830 VPBasicBlock *NextVPBB =
8831 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8832 if (NextVPBB != VPBB) {
8833 VPBB = NextVPBB;
8834 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8835 : "");
8836 }
8837 }
8838 }
8839
8840 // Discard the empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8841 // may also be empty, such as the last VPBB created above, which reflects
8842 // original basic blocks with no recipes.
8843 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8844 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8845 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8846 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8847 delete PreEntry;
8848
8849 // ---------------------------------------------------------------------------
8850 // Transform initial VPlan: Apply previously taken decisions, in order, to
8851 // bring the VPlan to its final state.
8852 // ---------------------------------------------------------------------------
8853
8854 // Apply Sink-After legal constraints.
8855 for (auto &Entry : SinkAfter) {
8856 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8857 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8858 // If the target is in a replication region, make sure to move Sink to the
8859 // block after it, not into the replication region itself.
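// For illustration only: if Target is a predicated load whose recipe lives in
// pred.load.if inside a replicate region, Sink is not moved into that region;
// it is placed at the start of the VPBasicBlock following pred.load.continue,
// so it still executes unconditionally.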
8860 if (auto *Region = 8861 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8862 if (Region->isReplicator()) { 8863 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8864 VPBasicBlock *NextBlock = 8865 cast<VPBasicBlock>(Region->getSuccessors().front()); 8866 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8867 continue; 8868 } 8869 } 8870 Sink->moveAfter(Target); 8871 } 8872 8873 // Interleave memory: for each Interleave Group we marked earlier as relevant 8874 // for this VPlan, replace the Recipes widening its memory instructions with a 8875 // single VPInterleaveRecipe at its insertion point. 8876 for (auto IG : InterleaveGroups) { 8877 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8878 RecipeBuilder.getRecipe(IG->getInsertPos())); 8879 SmallVector<VPValue *, 4> StoredValues; 8880 for (unsigned i = 0; i < IG->getFactor(); ++i) 8881 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8882 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8883 8884 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8885 Recipe->getMask()); 8886 VPIG->insertBefore(Recipe); 8887 unsigned J = 0; 8888 for (unsigned i = 0; i < IG->getFactor(); ++i) 8889 if (Instruction *Member = IG->getMember(i)) { 8890 if (!Member->getType()->isVoidTy()) { 8891 VPValue *OriginalV = Plan->getVPValue(Member); 8892 Plan->removeVPValueFor(Member); 8893 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8894 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8895 J++; 8896 } 8897 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8898 } 8899 } 8900 8901 // Adjust the recipes for any inloop reductions. 8902 if (Range.Start.isVector()) 8903 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8904 8905 // Finally, if tail is folded by masking, introduce selects between the phi 8906 // and the live-out instruction of each reduction, at the end of the latch. 8907 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8908 Builder.setInsertPoint(VPBB); 8909 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8910 for (auto &Reduction : Legal->getReductionVars()) { 8911 if (CM.isInLoopReduction(Reduction.first)) 8912 continue; 8913 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8914 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8915 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8916 } 8917 } 8918 8919 std::string PlanName; 8920 raw_string_ostream RSO(PlanName); 8921 ElementCount VF = Range.Start; 8922 Plan->addVF(VF); 8923 RSO << "Initial VPlan for VF={" << VF; 8924 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8925 Plan->addVF(VF); 8926 RSO << "," << VF; 8927 } 8928 RSO << "},UF>=1"; 8929 RSO.flush(); 8930 Plan->setName(PlanName); 8931 8932 return Plan; 8933 } 8934 8935 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8936 // Outer loop handling: They may require CFG and instruction level 8937 // transformations before even evaluating whether vectorization is profitable. 8938 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8939 // the vectorization pipeline. 
8940 assert(!OrigLoop->isInnermost()); 8941 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8942 8943 // Create new empty VPlan 8944 auto Plan = std::make_unique<VPlan>(); 8945 8946 // Build hierarchical CFG 8947 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8948 HCFGBuilder.buildHierarchicalCFG(); 8949 8950 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 8951 VF *= 2) 8952 Plan->addVF(VF); 8953 8954 if (EnableVPlanPredication) { 8955 VPlanPredicator VPP(*Plan); 8956 VPP.predicate(); 8957 8958 // Avoid running transformation to recipes until masked code generation in 8959 // VPlan-native path is in place. 8960 return Plan; 8961 } 8962 8963 SmallPtrSet<Instruction *, 1> DeadInstructions; 8964 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 8965 Legal->getInductionVars(), 8966 DeadInstructions, *PSE.getSE()); 8967 return Plan; 8968 } 8969 8970 // Adjust the recipes for any inloop reductions. The chain of instructions 8971 // leading from the loop exit instr to the phi need to be converted to 8972 // reductions, with one operand being vector and the other being the scalar 8973 // reduction chain. 8974 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 8975 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 8976 for (auto &Reduction : CM.getInLoopReductionChains()) { 8977 PHINode *Phi = Reduction.first; 8978 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8979 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8980 8981 // ReductionOperations are orders top-down from the phi's use to the 8982 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 8983 // which of the two operands will remain scalar and which will be reduced. 8984 // For minmax the chain will be the select instructions. 8985 Instruction *Chain = Phi; 8986 for (Instruction *R : ReductionOperations) { 8987 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 8988 RecurKind Kind = RdxDesc.getRecurrenceKind(); 8989 8990 VPValue *ChainOp = Plan->getVPValue(Chain); 8991 unsigned FirstOpId; 8992 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8993 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 8994 "Expected to replace a VPWidenSelectSC"); 8995 FirstOpId = 1; 8996 } else { 8997 assert(isa<VPWidenRecipe>(WidenRecipe) && 8998 "Expected to replace a VPWidenSC"); 8999 FirstOpId = 0; 9000 } 9001 unsigned VecOpId = 9002 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9003 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9004 9005 auto *CondOp = CM.foldTailByMasking() 9006 ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9007 : nullptr; 9008 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9009 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9010 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9011 Plan->removeVPValueFor(R); 9012 Plan->addVPValue(R, RedRecipe); 9013 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9014 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9015 WidenRecipe->eraseFromParent(); 9016 9017 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9018 VPRecipeBase *CompareRecipe = 9019 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9020 assert(isa<VPWidenRecipe>(CompareRecipe) && 9021 "Expected to replace a VPWidenSC"); 9022 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9023 "Expected no remaining users"); 9024 CompareRecipe->eraseFromParent(); 9025 } 9026 Chain = R; 9027 } 9028 } 9029 } 9030 9031 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9032 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9033 VPSlotTracker &SlotTracker) const { 9034 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9035 IG->getInsertPos()->printAsOperand(O, false); 9036 O << ", "; 9037 getAddr()->printAsOperand(O, SlotTracker); 9038 VPValue *Mask = getMask(); 9039 if (Mask) { 9040 O << ", "; 9041 Mask->printAsOperand(O, SlotTracker); 9042 } 9043 for (unsigned i = 0; i < IG->getFactor(); ++i) 9044 if (Instruction *I = IG->getMember(i)) 9045 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9046 } 9047 #endif 9048 9049 void VPWidenCallRecipe::execute(VPTransformState &State) { 9050 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9051 *this, State); 9052 } 9053 9054 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9055 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9056 this, *this, InvariantCond, State); 9057 } 9058 9059 void VPWidenRecipe::execute(VPTransformState &State) { 9060 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9061 } 9062 9063 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9064 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9065 *this, State.UF, State.VF, IsPtrLoopInvariant, 9066 IsIndexLoopInvariant, State); 9067 } 9068 9069 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9070 assert(!State.Instance && "Int or FP induction being replicated."); 9071 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9072 getTruncInst(), getVPValue(0), 9073 getCastValue(), State); 9074 } 9075 9076 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9077 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9078 getStartValue(), this, State); 9079 } 9080 9081 void VPBlendRecipe::execute(VPTransformState &State) { 9082 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9083 // We know that all PHIs in non-header blocks are converted into 9084 // selects, so we don't have to worry about the insertion order and we 9085 // can just use the builder. 9086 // At this point we generate the predication tree. There may be 9087 // duplications since this is a simple recursive scan, but future 9088 // optimizations will clean it up. 
9089 9090 unsigned NumIncoming = getNumIncomingValues(); 9091 9092 // Generate a sequence of selects of the form: 9093 // SELECT(Mask3, In3, 9094 // SELECT(Mask2, In2, 9095 // SELECT(Mask1, In1, 9096 // In0))) 9097 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9098 // are essentially undef are taken from In0. 9099 InnerLoopVectorizer::VectorParts Entry(State.UF); 9100 for (unsigned In = 0; In < NumIncoming; ++In) { 9101 for (unsigned Part = 0; Part < State.UF; ++Part) { 9102 // We might have single edge PHIs (blocks) - use an identity 9103 // 'select' for the first PHI operand. 9104 Value *In0 = State.get(getIncomingValue(In), Part); 9105 if (In == 0) 9106 Entry[Part] = In0; // Initialize with the first incoming value. 9107 else { 9108 // Select between the current value and the previous incoming edge 9109 // based on the incoming mask. 9110 Value *Cond = State.get(getMask(In), Part); 9111 Entry[Part] = 9112 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9113 } 9114 } 9115 } 9116 for (unsigned Part = 0; Part < State.UF; ++Part) 9117 State.set(this, Entry[Part], Part); 9118 } 9119 9120 void VPInterleaveRecipe::execute(VPTransformState &State) { 9121 assert(!State.Instance && "Interleave group being replicated."); 9122 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9123 getStoredValues(), getMask()); 9124 } 9125 9126 void VPReductionRecipe::execute(VPTransformState &State) { 9127 assert(!State.Instance && "Reduction being replicated."); 9128 for (unsigned Part = 0; Part < State.UF; ++Part) { 9129 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9130 Value *NewVecOp = State.get(getVecOp(), Part); 9131 if (VPValue *Cond = getCondOp()) { 9132 Value *NewCond = State.get(Cond, Part); 9133 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9134 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9135 Kind, VecTy->getElementType()); 9136 Constant *IdenVec = 9137 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9138 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9139 NewVecOp = Select; 9140 } 9141 Value *NewRed = 9142 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9143 Value *PrevInChain = State.get(getChainOp(), Part); 9144 Value *NextInChain; 9145 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9146 NextInChain = 9147 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9148 NewRed, PrevInChain); 9149 } else { 9150 NextInChain = State.Builder.CreateBinOp( 9151 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9152 PrevInChain); 9153 } 9154 State.set(this, NextInChain, Part); 9155 } 9156 } 9157 9158 void VPReplicateRecipe::execute(VPTransformState &State) { 9159 if (State.Instance) { // Generate a single instance. 9160 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9161 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9162 *State.Instance, IsPredicated, State); 9163 // Insert scalar instance packing it into a vector. 9164 if (AlsoPack && State.VF.isVector()) { 9165 // If we're constructing lane 0, initialize to start from poison. 
9166 if (State.Instance->Lane.isFirstLane()) {
9167 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9168 Value *Poison = PoisonValue::get(
9169 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9170 State.set(this, Poison, State.Instance->Part);
9171 }
9172 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9173 }
9174 return;
9175 }
9176
9177 // Generate scalar instances for all VF lanes of all UF parts, unless the
9178 // instruction is uniform, in which case generate only the first lane of each
9179 // UF part.
9180 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9181 assert((!State.VF.isScalable() || IsUniform) &&
9182 "Can't scalarize a scalable vector");
9183 for (unsigned Part = 0; Part < State.UF; ++Part)
9184 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9185 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9186 VPIteration(Part, Lane), IsPredicated,
9187 State);
9188 }
9189
9190 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9191 assert(State.Instance && "Branch on Mask works only on single instance.");
9192
9193 unsigned Part = State.Instance->Part;
9194 unsigned Lane = State.Instance->Lane.getKnownLane();
9195
9196 Value *ConditionBit = nullptr;
9197 VPValue *BlockInMask = getMask();
9198 if (BlockInMask) {
9199 ConditionBit = State.get(BlockInMask, Part);
9200 if (ConditionBit->getType()->isVectorTy())
9201 ConditionBit = State.Builder.CreateExtractElement(
9202 ConditionBit, State.Builder.getInt32(Lane));
9203 } else // Block in mask is all-one.
9204 ConditionBit = State.Builder.getTrue();
9205
9206 // Replace the temporary unreachable terminator with a new conditional branch,
9207 // whose two destinations will be set later when they are created.
9208 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9209 assert(isa<UnreachableInst>(CurrentTerminator) &&
9210 "Expected to replace unreachable terminator with conditional branch.");
9211 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9212 CondBr->setSuccessor(0, nullptr);
9213 ReplaceInstWithInst(CurrentTerminator, CondBr);
9214 }
9215
9216 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9217 assert(State.Instance && "Predicated instruction PHI works per instance.");
9218 Instruction *ScalarPredInst =
9219 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9220 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9221 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9222 assert(PredicatingBB && "Predicated block has no single predecessor.");
9223 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9224 "operand must be VPReplicateRecipe");
9225
9226 // By current pack/unpack logic we need to generate only a single phi node: if
9227 // a vector value for the predicated instruction exists at this point it means
9228 // the instruction has vector users only, and a phi for the vector value is
9229 // needed. In this case the recipe of the predicated instruction is marked to
9230 // also do that packing, thereby "hoisting" the insert-element sequence.
9231 // Otherwise, a phi node for the scalar value is needed.
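// For illustration only (block names approximate), the vector-value case below
// produces roughly:
//
//   pred.load.continue:
//     %vphi = phi <4 x i32> [ %vec.prev, %pred.load.entry ],
//                           [ %vec.with.lane, %pred.load.if ]
//
// where %vec.with.lane is the insertelement emitted when packing the scalar
// result of the current lane.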
9232 unsigned Part = State.Instance->Part; 9233 if (State.hasVectorValue(getOperand(0), Part)) { 9234 Value *VectorValue = State.get(getOperand(0), Part); 9235 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9236 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9237 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9238 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9239 if (State.hasVectorValue(this, Part)) 9240 State.reset(this, VPhi, Part); 9241 else 9242 State.set(this, VPhi, Part); 9243 // NOTE: Currently we need to update the value of the operand, so the next 9244 // predicated iteration inserts its generated value in the correct vector. 9245 State.reset(getOperand(0), VPhi, Part); 9246 } else { 9247 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9248 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9249 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9250 PredicatingBB); 9251 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9252 if (State.hasScalarValue(this, *State.Instance)) 9253 State.reset(this, Phi, *State.Instance); 9254 else 9255 State.set(this, Phi, *State.Instance); 9256 // NOTE: Currently we need to update the value of the operand, so the next 9257 // predicated iteration inserts its generated value in the correct vector. 9258 State.reset(getOperand(0), Phi, *State.Instance); 9259 } 9260 } 9261 9262 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9263 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9264 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9265 StoredValue ? nullptr : getVPValue(), 9266 getAddr(), StoredValue, getMask()); 9267 } 9268 9269 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9270 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9271 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9272 // for predication. 9273 static ScalarEpilogueLowering getScalarEpilogueLowering( 9274 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9275 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9276 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9277 LoopVectorizationLegality &LVL) { 9278 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9279 // don't look at hints or options, and don't request a scalar epilogue. 9280 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9281 // LoopAccessInfo (due to code dependency and not being able to reliably get 9282 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9283 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9284 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9285 // back to the old way and vectorize with versioning when forced. See D81345.) 
9286 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9287 PGSOQueryType::IRPass) && 9288 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9289 return CM_ScalarEpilogueNotAllowedOptSize; 9290 9291 // 2) If set, obey the directives 9292 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9293 switch (PreferPredicateOverEpilogue) { 9294 case PreferPredicateTy::ScalarEpilogue: 9295 return CM_ScalarEpilogueAllowed; 9296 case PreferPredicateTy::PredicateElseScalarEpilogue: 9297 return CM_ScalarEpilogueNotNeededUsePredicate; 9298 case PreferPredicateTy::PredicateOrDontVectorize: 9299 return CM_ScalarEpilogueNotAllowedUsePredicate; 9300 }; 9301 } 9302 9303 // 3) If set, obey the hints 9304 switch (Hints.getPredicate()) { 9305 case LoopVectorizeHints::FK_Enabled: 9306 return CM_ScalarEpilogueNotNeededUsePredicate; 9307 case LoopVectorizeHints::FK_Disabled: 9308 return CM_ScalarEpilogueAllowed; 9309 }; 9310 9311 // 4) if the TTI hook indicates this is profitable, request predication. 9312 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9313 LVL.getLAI())) 9314 return CM_ScalarEpilogueNotNeededUsePredicate; 9315 9316 return CM_ScalarEpilogueAllowed; 9317 } 9318 9319 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9320 // If Values have been set for this Def return the one relevant for \p Part. 9321 if (hasVectorValue(Def, Part)) 9322 return Data.PerPartOutput[Def][Part]; 9323 9324 if (!hasScalarValue(Def, {Part, 0})) { 9325 Value *IRV = Def->getLiveInIRValue(); 9326 Value *B = ILV->getBroadcastInstrs(IRV); 9327 set(Def, B, Part); 9328 return B; 9329 } 9330 9331 Value *ScalarValue = get(Def, {Part, 0}); 9332 // If we aren't vectorizing, we can just copy the scalar map values over 9333 // to the vector map. 9334 if (VF.isScalar()) { 9335 set(Def, ScalarValue, Part); 9336 return ScalarValue; 9337 } 9338 9339 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9340 bool IsUniform = RepR && RepR->isUniform(); 9341 9342 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9343 // Check if there is a scalar value for the selected lane. 9344 if (!hasScalarValue(Def, {Part, LastLane})) { 9345 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9346 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9347 "unexpected recipe found to be invariant"); 9348 IsUniform = true; 9349 LastLane = 0; 9350 } 9351 9352 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9353 9354 // Set the insert point after the last scalarized instruction. This 9355 // ensures the insertelement sequence will directly follow the scalar 9356 // definitions. 9357 auto OldIP = Builder.saveIP(); 9358 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9359 Builder.SetInsertPoint(&*NewIP); 9360 9361 // However, if we are vectorizing, we need to construct the vector values. 9362 // If the value is known to be uniform after vectorization, we can just 9363 // broadcast the scalar value corresponding to lane zero for each unroll 9364 // iteration. Otherwise, we construct the vector values using 9365 // insertelement instructions. Since the resulting vectors are stored in 9366 // State, we will only generate the insertelements once. 9367 Value *VectorValue = nullptr; 9368 if (IsUniform) { 9369 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9370 set(Def, VectorValue, Part); 9371 } else { 9372 // Initialize packing with insertelements to start from undef. 
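// For illustration only, for VF=4 the non-uniform case below emits roughly:
//   %v0 = insertelement <4 x i32> poison, i32 %s0, i32 0
//   %v1 = insertelement <4 x i32> %v0,    i32 %s1, i32 1
//   ...
// and the last insertelement becomes the per-part vector value for Def.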
9373 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9374 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9375 set(Def, Undef, Part); 9376 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9377 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9378 VectorValue = get(Def, Part); 9379 } 9380 Builder.restoreIP(OldIP); 9381 return VectorValue; 9382 } 9383 9384 // Process the loop in the VPlan-native vectorization path. This path builds 9385 // VPlan upfront in the vectorization pipeline, which allows to apply 9386 // VPlan-to-VPlan transformations from the very beginning without modifying the 9387 // input LLVM IR. 9388 static bool processLoopInVPlanNativePath( 9389 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9390 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9391 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9392 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9393 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9394 9395 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9396 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9397 return false; 9398 } 9399 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9400 Function *F = L->getHeader()->getParent(); 9401 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9402 9403 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9404 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9405 9406 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9407 &Hints, IAI); 9408 // Use the planner for outer loop vectorization. 9409 // TODO: CM is not used at this point inside the planner. Turn CM into an 9410 // optional argument if we don't need it in the future. 9411 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9412 9413 // Get user vectorization factor. 9414 ElementCount UserVF = Hints.getWidth(); 9415 9416 // Plan how to best vectorize, return the best VF and its cost. 9417 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9418 9419 // If we are stress testing VPlan builds, do not attempt to generate vector 9420 // code. Masked vector code generation support will follow soon. 9421 // Also, do not attempt to vectorize if no vector code will be produced. 9422 if (VPlanBuildStressTest || EnableVPlanPredication || 9423 VectorizationFactor::Disabled() == VF) 9424 return false; 9425 9426 LVP.setBestPlan(VF.Width, 1); 9427 9428 { 9429 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9430 F->getParent()->getDataLayout()); 9431 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9432 &CM, BFI, PSI, Checks); 9433 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9434 << L->getHeader()->getParent()->getName() << "\"\n"); 9435 LVP.executePlan(LB, DT); 9436 } 9437 9438 // Mark the loop as already vectorized to avoid vectorizing again. 9439 Hints.setAlreadyVectorized(); 9440 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9441 return true; 9442 } 9443 9444 // Emit a remark if there are stores to floats that required a floating point 9445 // extension. If the vectorized loop was generated with floating point there 9446 // will be a performance penalty from the conversion overhead and the change in 9447 // the vector width. 
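// For illustration only, a loop such as
//   for (...) FloatDst[i] = HalfSrc[i];   // fpext half -> float per element
// vectorizes, but each <N x half> value must be extended to <N x float>,
// halving the effective vector width; the traversal below finds the fpext and
// reports it.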
9448 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9449 SmallVector<Instruction *, 4> Worklist; 9450 for (BasicBlock *BB : L->getBlocks()) { 9451 for (Instruction &Inst : *BB) { 9452 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9453 if (S->getValueOperand()->getType()->isFloatTy()) 9454 Worklist.push_back(S); 9455 } 9456 } 9457 } 9458 9459 // Traverse the floating point stores upwards searching, for floating point 9460 // conversions. 9461 SmallPtrSet<const Instruction *, 4> Visited; 9462 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9463 while (!Worklist.empty()) { 9464 auto *I = Worklist.pop_back_val(); 9465 if (!L->contains(I)) 9466 continue; 9467 if (!Visited.insert(I).second) 9468 continue; 9469 9470 // Emit a remark if the floating point store required a floating 9471 // point conversion. 9472 // TODO: More work could be done to identify the root cause such as a 9473 // constant or a function return type and point the user to it. 9474 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9475 ORE->emit([&]() { 9476 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9477 I->getDebugLoc(), L->getHeader()) 9478 << "floating point conversion changes vector width. " 9479 << "Mixed floating point precision requires an up/down " 9480 << "cast that will negatively impact performance."; 9481 }); 9482 9483 for (Use &Op : I->operands()) 9484 if (auto *OpI = dyn_cast<Instruction>(Op)) 9485 Worklist.push_back(OpI); 9486 } 9487 } 9488 9489 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9490 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9491 !EnableLoopInterleaving), 9492 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9493 !EnableLoopVectorization) {} 9494 9495 bool LoopVectorizePass::processLoop(Loop *L) { 9496 assert((EnableVPlanNativePath || L->isInnermost()) && 9497 "VPlan-native path is not enabled. Only process inner loops."); 9498 9499 #ifndef NDEBUG 9500 const std::string DebugLocStr = getDebugLocString(L); 9501 #endif /* NDEBUG */ 9502 9503 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9504 << L->getHeader()->getParent()->getName() << "\" from " 9505 << DebugLocStr << "\n"); 9506 9507 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9508 9509 LLVM_DEBUG( 9510 dbgs() << "LV: Loop hints:" 9511 << " force=" 9512 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9513 ? "disabled" 9514 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9515 ? "enabled" 9516 : "?")) 9517 << " width=" << Hints.getWidth() 9518 << " unroll=" << Hints.getInterleave() << "\n"); 9519 9520 // Function containing loop 9521 Function *F = L->getHeader()->getParent(); 9522 9523 // Looking at the diagnostic output is the only way to determine if a loop 9524 // was vectorized (other than looking at the IR or machine code), so it 9525 // is important to generate an optimization remark for each loop. Most of 9526 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9527 // generated as OptimizationRemark and OptimizationRemarkMissed are 9528 // less verbose reporting vectorized loops and unvectorized loops that may 9529 // benefit from vectorization, respectively. 9530 9531 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9532 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9533 return false; 9534 } 9535 9536 PredicatedScalarEvolution PSE(*SE, *L); 9537 9538 // Check if it is legal to vectorize the loop. 
9539 LoopVectorizationRequirements Requirements(*ORE); 9540 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9541 &Requirements, &Hints, DB, AC, BFI, PSI); 9542 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9543 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9544 Hints.emitRemarkWithHints(); 9545 return false; 9546 } 9547 9548 // Check the function attributes and profiles to find out if this function 9549 // should be optimized for size. 9550 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9551 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9552 9553 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9554 // here. They may require CFG and instruction level transformations before 9555 // even evaluating whether vectorization is profitable. Since we cannot modify 9556 // the incoming IR, we need to build VPlan upfront in the vectorization 9557 // pipeline. 9558 if (!L->isInnermost()) 9559 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9560 ORE, BFI, PSI, Hints); 9561 9562 assert(L->isInnermost() && "Inner loop expected."); 9563 9564 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9565 // count by optimizing for size, to minimize overheads. 9566 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9567 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9568 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9569 << "This loop is worth vectorizing only if no scalar " 9570 << "iteration overheads are incurred."); 9571 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9572 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9573 else { 9574 LLVM_DEBUG(dbgs() << "\n"); 9575 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9576 } 9577 } 9578 9579 // Check the function attributes to see if implicit floats are allowed. 9580 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9581 // an integer loop and the vector instructions selected are purely integer 9582 // vector instructions? 9583 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9584 reportVectorizationFailure( 9585 "Can't vectorize when the NoImplicitFloat attribute is used", 9586 "loop not vectorized due to NoImplicitFloat attribute", 9587 "NoImplicitFloat", ORE, L); 9588 Hints.emitRemarkWithHints(); 9589 return false; 9590 } 9591 9592 // Check if the target supports potentially unsafe FP vectorization. 9593 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9594 // for the target we're vectorizing for, to make sure none of the 9595 // additional fp-math flags can help. 
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!Requirements.canVectorizeFPMath(Hints)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get the user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
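      // Conceptually (illustrative sketch only), interleaving by IC == 2
      // rewrites
      //
      //   int sum = 0;
      //   for (int i = 0; i < n; ++i)
      //     sum += A[i];
      //
      // into, roughly,
      //
      //   int sum0 = 0, sum1 = 0;
      //   int i = 0;
      //   for (; i + 1 < n; i += 2) {
      //     sum0 += A[i];       // two independent accumulator chains
      //     sum1 += A[i + 1];
      //   }
      //   for (; i < n; ++i)    // scalar remainder for odd n
      //     sum0 += A[i];
      //   int sum = sum0 + sum1;
      //
      // which exposes instruction-level parallelism without widening to
      // vector types.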
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *profitable* to vectorize the loop, then
      // do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar
        // epilogue to be vectorized by executing the plan (potentially with a
        // different factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // The second pass vectorizes the epilogue and adjusts the control
        // flow edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
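    // The marking is recorded in the loop's metadata; conceptually the scalar
    // remainder loop ends up carrying a hint along the lines of
    //   !"llvm.loop.isvectorized", i32 1
    // (sketch only, not the exact metadata layout), which a later run of the
    // vectorizer treats as "already done".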
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
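// Note (illustrative usage, not part of the pass): with the new pass manager
// the vectorizer can be exercised in isolation with, e.g.,
//
//   opt -passes=loop-vectorize -S input.ll
//
// and the remarks emitted above can be surfaced with
// -pass-remarks=loop-vectorize, -pass-remarks-missed=loop-vectorize and
// -pass-remarks-analysis=loop-vectorize.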