//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
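//
// As an illustration (a conceptual sketch, not tied to any particular
// target), a scalar loop such as
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
// is rewritten for a vectorization factor of 4 so that each 'wide' iteration
// processes four elements at once, with a scalar epilogue handling the
// remaining n % 4 iterations:
//   for (i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + <42, 42, 42, 42>;  // one wide iteration
//   for (; i < n; ++i)                         // scalar remainder loop
//     a[i] = b[i] + 42;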
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and lists the available options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path.
// It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
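// For example (hypothetical numbers), a loop whose exact trip count SCEV
// cannot compute, but whose profile branch weights imply roughly 100
// iterations, returns 100 from the profile-based step above; the SCEV
// upper-bound estimate is only consulted when no profile data is available.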

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPValue *StartV, VPValue *Def,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step.
  /// If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
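// As an illustrative note (assuming an integer step, as asserted above): with
// Step == 2 and a fixed VF of 4, createStepForVF returns the constant 8,
// whereas with a scalable VF of <vscale x 4> it returns the runtime quantity
// vscale * 8, materialized through IRBuilder::CreateVScale. getRuntimeVF below
// follows the same pattern for the VF itself.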

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons.
/// In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
1347 CM_Interleave, 1348 CM_GatherScatter, 1349 CM_Scalarize 1350 }; 1351 1352 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1353 /// instruction \p I and vector width \p VF. 1354 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1355 InstructionCost Cost) { 1356 assert(VF.isVector() && "Expected VF >=2"); 1357 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1358 } 1359 1360 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1361 /// interleaving group \p Grp and vector width \p VF. 1362 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1363 ElementCount VF, InstWidening W, 1364 InstructionCost Cost) { 1365 assert(VF.isVector() && "Expected VF >=2"); 1366 // Broadcast this decision to all instructions inside the group. 1367 // But the cost will be assigned to one instruction only. 1368 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1369 if (auto *I = Grp->getMember(i)) { 1370 if (Grp->getInsertPos() == I) 1371 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1372 else 1373 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1374 } 1375 } 1376 } 1377 1378 /// Return the cost model decision for the given instruction \p I and vector 1379 /// width \p VF. Return CM_Unknown if this instruction did not pass 1380 /// through the cost modeling. 1381 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1382 assert(VF.isVector() && "Expected VF to be a vector VF"); 1383 // Cost model is not run in the VPlan-native path - return conservative 1384 // result until this changes. 1385 if (EnableVPlanNativePath) 1386 return CM_GatherScatter; 1387 1388 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1389 auto Itr = WideningDecisions.find(InstOnVF); 1390 if (Itr == WideningDecisions.end()) 1391 return CM_Unknown; 1392 return Itr->second.first; 1393 } 1394 1395 /// Return the vectorization cost for the given instruction \p I and vector 1396 /// width \p VF. 1397 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1398 assert(VF.isVector() && "Expected VF >=2"); 1399 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1400 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1401 "The cost is not calculated"); 1402 return WideningDecisions[InstOnVF].second; 1403 } 1404 1405 /// Return true if instruction \p I is an optimizable truncate whose operand 1406 /// is an induction variable. Such a truncate will be removed by adding a new 1407 /// induction variable with the destination type. 1408 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1409 // If the instruction is not a truncate, return false. 1410 auto *Trunc = dyn_cast<TruncInst>(I); 1411 if (!Trunc) 1412 return false; 1413 1414 // Get the source and destination types of the truncate. 1415 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1416 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1417 1418 // If the truncate is free for the given types, return false. Replacing a 1419 // free truncate with an induction variable would add an induction variable 1420 // update instruction to each iteration of the loop. We exclude from this 1421 // check the primary induction variable since it will need an update 1422 // instruction regardless.
1423 Value *Op = Trunc->getOperand(0); 1424 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1425 return false; 1426 1427 // If the truncated value is not an induction variable, return false. 1428 return Legal->isInductionPhi(Op); 1429 } 1430 1431 /// Collects the instructions to scalarize for each predicated instruction in 1432 /// the loop. 1433 void collectInstsToScalarize(ElementCount VF); 1434 1435 /// Collect Uniform and Scalar values for the given \p VF. 1436 /// The sets depend on CM decision for Load/Store instructions 1437 /// that may be vectorized as interleave, gather-scatter or scalarized. 1438 void collectUniformsAndScalars(ElementCount VF) { 1439 // Do the analysis once. 1440 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1441 return; 1442 setCostBasedWideningDecision(VF); 1443 collectLoopUniforms(VF); 1444 collectLoopScalars(VF); 1445 } 1446 1447 /// Returns true if the target machine supports masked store operation 1448 /// for the given \p DataType and kind of access to \p Ptr. 1449 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1450 return Legal->isConsecutivePtr(Ptr) && 1451 TTI.isLegalMaskedStore(DataType, Alignment); 1452 } 1453 1454 /// Returns true if the target machine supports masked load operation 1455 /// for the given \p DataType and kind of access to \p Ptr. 1456 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1457 return Legal->isConsecutivePtr(Ptr) && 1458 TTI.isLegalMaskedLoad(DataType, Alignment); 1459 } 1460 1461 /// Returns true if the target machine supports masked scatter operation 1462 /// for the given \p DataType. 1463 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { 1464 return TTI.isLegalMaskedScatter(DataType, Alignment); 1465 } 1466 1467 /// Returns true if the target machine supports masked gather operation 1468 /// for the given \p DataType. 1469 bool isLegalMaskedGather(Type *DataType, Align Alignment) const { 1470 return TTI.isLegalMaskedGather(DataType, Alignment); 1471 } 1472 1473 /// Returns true if the target machine can represent \p V as a masked gather 1474 /// or scatter operation. 1475 bool isLegalGatherOrScatter(Value *V) { 1476 bool LI = isa<LoadInst>(V); 1477 bool SI = isa<StoreInst>(V); 1478 if (!LI && !SI) 1479 return false; 1480 auto *Ty = getMemInstValueType(V); 1481 Align Align = getLoadStoreAlignment(V); 1482 return (LI && isLegalMaskedGather(Ty, Align)) || 1483 (SI && isLegalMaskedScatter(Ty, Align)); 1484 } 1485 1486 /// Returns true if the target machine supports all of the reduction 1487 /// variables found for the given VF. 1488 bool canVectorizeReductions(ElementCount VF) { 1489 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1490 RecurrenceDescriptor RdxDesc = Reduction.second; 1491 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1492 })); 1493 } 1494 1495 /// Returns true if \p I is an instruction that will be scalarized with 1496 /// predication. Such instructions include conditional stores and 1497 /// instructions that may divide by zero. 1498 /// If a non-zero VF has been calculated, we check if I will be scalarized 1499 /// with predication for that VF. 1500 bool 1501 isScalarWithPredication(Instruction *I, 1502 ElementCount VF = ElementCount::getFixed(1)) const; 1503 1504 // Returns true if \p I is an instruction that will be predicated either 1505 // through scalar predication or masked load/store or masked gather/scatter.
1506 // Superset of instructions that return true for isScalarWithPredication. 1507 bool isPredicatedInst(Instruction *I) { 1508 if (!blockNeedsPredication(I->getParent())) 1509 return false; 1510 // Loads and stores that need some form of masked operation are predicated 1511 // instructions. 1512 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1513 return Legal->isMaskRequired(I); 1514 return isScalarWithPredication(I); 1515 } 1516 1517 /// Returns true if \p I is a memory instruction with consecutive memory 1518 /// access that can be widened. 1519 bool 1520 memoryInstructionCanBeWidened(Instruction *I, 1521 ElementCount VF = ElementCount::getFixed(1)); 1522 1523 /// Returns true if \p I is a memory instruction in an interleaved-group 1524 /// of memory accesses that can be vectorized with wide vector loads/stores 1525 /// and shuffles. 1526 bool 1527 interleavedAccessCanBeWidened(Instruction *I, 1528 ElementCount VF = ElementCount::getFixed(1)); 1529 1530 /// Check if \p Instr belongs to any interleaved access group. 1531 bool isAccessInterleaved(Instruction *Instr) { 1532 return InterleaveInfo.isInterleaved(Instr); 1533 } 1534 1535 /// Get the interleaved access group that \p Instr belongs to. 1536 const InterleaveGroup<Instruction> * 1537 getInterleavedAccessGroup(Instruction *Instr) { 1538 return InterleaveInfo.getInterleaveGroup(Instr); 1539 } 1540 1541 /// Returns true if we're required to use a scalar epilogue for at least 1542 /// the final iteration of the original loop. 1543 bool requiresScalarEpilogue() const { 1544 if (!isScalarEpilogueAllowed()) 1545 return false; 1546 // If we might exit from anywhere but the latch, must run the exiting 1547 // iteration in scalar form. 1548 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1549 return true; 1550 return InterleaveInfo.requiresScalarEpilogue(); 1551 } 1552 1553 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1554 /// loop hint annotation. 1555 bool isScalarEpilogueAllowed() const { 1556 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1557 } 1558 1559 /// Returns true if all loop blocks should be masked to fold tail loop. 1560 bool foldTailByMasking() const { return FoldTailByMasking; } 1561 1562 bool blockNeedsPredication(BasicBlock *BB) const { 1563 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1564 } 1565 1566 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1567 /// nodes to the chain of instructions representing the reductions. Uses a 1568 /// MapVector to ensure deterministic iteration order. 1569 using ReductionChainMap = 1570 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1571 1572 /// Return the chain of instructions representing an inloop reduction. 1573 const ReductionChainMap &getInLoopReductionChains() const { 1574 return InLoopReductionChains; 1575 } 1576 1577 /// Returns true if the Phi is part of an inloop reduction. 1578 bool isInLoopReduction(PHINode *Phi) const { 1579 return InLoopReductionChains.count(Phi); 1580 } 1581 1582 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1583 /// with factor VF. Return the cost of the instruction, including 1584 /// scalarization overhead if it's needed. 1585 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1586 1587 /// Estimate cost of a call instruction CI if it were vectorized with factor 1588 /// VF. Return the cost of the instruction, including scalarization overhead 1589 /// if it's needed. 
The flag NeedToScalarize shows if the call needs to be 1590 /// scalarized - 1591 /// i.e. either vector version isn't available, or is too expensive. 1592 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1593 bool &NeedToScalarize) const; 1594 1595 /// Invalidates decisions already taken by the cost model. 1596 void invalidateCostModelingDecisions() { 1597 WideningDecisions.clear(); 1598 Uniforms.clear(); 1599 Scalars.clear(); 1600 } 1601 1602 private: 1603 unsigned NumPredStores = 0; 1604 1605 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1606 /// than zero. One is returned if vectorization should best be avoided due 1607 /// to cost. 1608 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1609 ElementCount UserVF); 1610 1611 /// The vectorization cost is a combination of the cost itself and a boolean 1612 /// indicating whether any of the contributing operations will actually 1613 /// operate on 1614 /// vector values after type legalization in the backend. If this latter value 1615 /// is 1616 /// false, then all operations will be scalarized (i.e. no vectorization has 1617 /// actually taken place). 1618 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1619 1620 /// Returns the expected execution cost. The unit of the cost does 1621 /// not matter because we use the 'cost' units to compare different 1622 /// vector widths. The cost that is returned is *not* normalized by 1623 /// the factor width. 1624 VectorizationCostTy expectedCost(ElementCount VF); 1625 1626 /// Returns the execution time cost of an instruction for a given vector 1627 /// width. Vector width of one means scalar. 1628 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1629 1630 /// The cost-computation logic from getInstructionCost which provides 1631 /// the vector type as an output parameter. 1632 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1633 Type *&VectorTy); 1634 1635 /// Return the cost of instructions in an inloop reduction pattern, if I is 1636 /// part of that pattern. 1637 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, 1638 Type *VectorTy, 1639 TTI::TargetCostKind CostKind); 1640 1641 /// Calculate vectorization cost of memory instruction \p I. 1642 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1643 1644 /// The cost computation for scalarized memory instruction. 1645 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1646 1647 /// The cost computation for interleaving group of memory instructions. 1648 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1649 1650 /// The cost computation for Gather/Scatter instruction. 1651 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1652 1653 /// The cost computation for widening instruction \p I with consecutive 1654 /// memory access. 1655 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1656 1657 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1658 /// Load: scalar load + broadcast. 1659 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1660 /// element) 1661 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1662 1663 /// Estimate the overhead of scalarizing an instruction. This is a 1664 /// convenience wrapper for the type-based getScalarizationOverhead API. 
1665 InstructionCost getScalarizationOverhead(Instruction *I, 1666 ElementCount VF) const; 1667 1668 /// Returns whether the instruction is a load or store and will be emitted 1669 /// as a vector operation. 1670 bool isConsecutiveLoadOrStore(Instruction *I); 1671 1672 /// Returns true if an artificially high cost for emulated masked memrefs 1673 /// should be used. 1674 bool useEmulatedMaskMemRefHack(Instruction *I); 1675 1676 /// Map of scalar integer values to the smallest bitwidth they can be legally 1677 /// represented as. The vector equivalents of these values should be truncated 1678 /// to this type. 1679 MapVector<Instruction *, uint64_t> MinBWs; 1680 1681 /// A type representing the costs for instructions if they were to be 1682 /// scalarized rather than vectorized. The entries are Instruction-Cost 1683 /// pairs. 1684 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1685 1686 /// A set containing all BasicBlocks that are known to be present after 1687 /// vectorization as predicated blocks. 1688 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1689 1690 /// Records whether it is allowed to have the original scalar loop execute at 1691 /// least once. This may be needed as a fallback loop in case runtime 1692 /// aliasing/dependence checks fail, or to handle the tail/remainder 1693 /// iterations when the trip count is unknown or doesn't divide by the VF, 1694 /// or as a peel-loop to handle gaps in interleave-groups. 1695 /// Under optsize and when the trip count is very small we don't allow any 1696 /// iterations to execute in the scalar loop. 1697 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1698 1699 /// All blocks of the loop are to be masked to fold the tail of scalar 1700 /// iterations. bool FoldTailByMasking = false; 1701 1702 /// A map holding scalar costs for different vectorization factors. The 1703 /// presence of a cost for an instruction in the mapping indicates that the 1704 /// instruction will be scalarized when vectorizing with the associated 1705 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1706 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1707 1708 /// Holds the instructions known to be uniform after vectorization. 1709 /// The data is collected per VF. 1710 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1711 1712 /// Holds the instructions known to be scalar after vectorization. 1713 /// The data is collected per VF. 1714 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1715 1716 /// Holds the instructions (address computations) that are forced to be 1717 /// scalarized. 1718 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1719 1720 /// PHINodes of the reductions that should be expanded in-loop along with 1721 /// their associated chains of reduction operations, in program order from top 1722 /// (PHI) to bottom. 1723 ReductionChainMap InLoopReductionChains; 1724 1725 /// A map of inloop reduction operations and their immediate chain operand. 1726 /// FIXME: This can be removed once reductions can be costed correctly in 1727 /// vplan. This was added to allow quick lookup to the inloop operations, 1728 /// without having to loop through InLoopReductionChains. 1729 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1730 1731 /// Returns the expected difference in cost from scalarizing the expression 1732 /// feeding a predicated instruction \p PredInst.
The instructions to 1733 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1734 /// non-negative return value implies the expression will be scalarized. 1735 /// Currently, only single-use chains are considered for scalarization. 1736 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1737 ElementCount VF); 1738 1739 /// Collect the instructions that are uniform after vectorization. An 1740 /// instruction is uniform if we represent it with a single scalar value in 1741 /// the vectorized loop corresponding to each vector iteration. Examples of 1742 /// uniform instructions include pointer operands of consecutive or 1743 /// interleaved memory accesses. Note that although uniformity implies an 1744 /// instruction will be scalar, the reverse is not true. In general, a 1745 /// scalarized instruction will be represented by VF scalar values in the 1746 /// vectorized loop, each corresponding to an iteration of the original 1747 /// scalar loop. 1748 void collectLoopUniforms(ElementCount VF); 1749 1750 /// Collect the instructions that are scalar after vectorization. An 1751 /// instruction is scalar if it is known to be uniform or will be scalarized 1752 /// during vectorization. Non-uniform scalarized instructions will be 1753 /// represented by VF values in the vectorized loop, each corresponding to an 1754 /// iteration of the original scalar loop. 1755 void collectLoopScalars(ElementCount VF); 1756 1757 /// Keeps cost model vectorization decision and cost for instructions. 1758 /// Right now it is used for memory instructions only. 1759 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1760 std::pair<InstWidening, InstructionCost>>; 1761 1762 DecisionList WideningDecisions; 1763 1764 /// Returns true if \p V is expected to be vectorized and it needs to be 1765 /// extracted. 1766 bool needsExtract(Value *V, ElementCount VF) const { 1767 Instruction *I = dyn_cast<Instruction>(V); 1768 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1769 TheLoop->isLoopInvariant(I)) 1770 return false; 1771 1772 // Assume we can vectorize V (and hence we need extraction) if the 1773 // scalars are not computed yet. This can happen, because it is called 1774 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1775 // the scalars are collected. That should be a safe assumption in most 1776 // cases, because we check if the operands have vectorizable types 1777 // beforehand in LoopVectorizationLegality. 1778 return Scalars.find(VF) == Scalars.end() || 1779 !isScalarAfterVectorization(I, VF); 1780 }; 1781 1782 /// Returns a range containing only operands needing to be extracted. 1783 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1784 ElementCount VF) const { 1785 return SmallVector<Value *, 4>(make_filter_range( 1786 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1787 } 1788 1789 /// Determines if we have the infrastructure to vectorize loop \p L and its 1790 /// epilogue, assuming the main loop is vectorized by \p VF. 1791 bool isCandidateForEpilogueVectorization(const Loop &L, 1792 const ElementCount VF) const; 1793 1794 /// Returns true if epilogue vectorization is considered profitable, and 1795 /// false otherwise. 1796 /// \p VF is the vectorization factor chosen for the original loop. 1797 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1798 1799 public: 1800 /// The loop that we evaluate. 
1801 Loop *TheLoop; 1802 1803 /// Predicated scalar evolution analysis. 1804 PredicatedScalarEvolution &PSE; 1805 1806 /// Loop Info analysis. 1807 LoopInfo *LI; 1808 1809 /// Vectorization legality. 1810 LoopVectorizationLegality *Legal; 1811 1812 /// Vector target information. 1813 const TargetTransformInfo &TTI; 1814 1815 /// Target Library Info. 1816 const TargetLibraryInfo *TLI; 1817 1818 /// Demanded bits analysis. 1819 DemandedBits *DB; 1820 1821 /// Assumption cache. 1822 AssumptionCache *AC; 1823 1824 /// Interface to emit optimization remarks. 1825 OptimizationRemarkEmitter *ORE; 1826 1827 const Function *TheFunction; 1828 1829 /// Loop Vectorize Hint. 1830 const LoopVectorizeHints *Hints; 1831 1832 /// The interleave access information contains groups of interleaved accesses 1833 /// with the same stride and close to each other. 1834 InterleavedAccessInfo &InterleaveInfo; 1835 1836 /// Values to ignore in the cost model. 1837 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1838 1839 /// Values to ignore in the cost model when VF > 1. 1840 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1841 1842 /// Profitable vector factors. 1843 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1844 }; 1845 } // end namespace llvm 1846 1847 /// Helper struct to manage generating runtime checks for vectorization. 1848 /// 1849 /// The runtime checks are created up-front in temporary blocks to allow better 1850 /// cost estimation, and are un-linked from the existing IR. After deciding to 1851 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1852 /// temporary blocks are completely removed. 1853 class GeneratedRTChecks { 1854 /// Basic block which contains the generated SCEV checks, if any. 1855 BasicBlock *SCEVCheckBlock = nullptr; 1856 1857 /// The value representing the result of the generated SCEV checks. If it is 1858 /// nullptr, either no SCEV checks have been generated or they have been used. 1859 Value *SCEVCheckCond = nullptr; 1860 1861 /// Basic block which contains the generated memory runtime checks, if any. 1862 BasicBlock *MemCheckBlock = nullptr; 1863 1864 /// The value representing the result of the generated memory runtime checks. 1865 /// If it is nullptr, either no memory runtime checks have been generated or 1866 /// they have been used. 1867 Instruction *MemRuntimeCheckCond = nullptr; 1868 1869 DominatorTree *DT; 1870 LoopInfo *LI; 1871 1872 SCEVExpander SCEVExp; 1873 SCEVExpander MemCheckExp; 1874 1875 public: 1876 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1877 const DataLayout &DL) 1878 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1879 MemCheckExp(SE, DL, "scev.check") {} 1880 1881 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1882 /// accurately estimate the cost of the runtime checks. The blocks are 1883 /// un-linked from the IR and added back during vector code generation. If 1884 /// there is no vector code generation, the check blocks are removed 1885 /// completely. 1886 void Create(Loop *L, const LoopAccessInfo &LAI, 1887 const SCEVUnionPredicate &UnionPred) { 1888 1889 BasicBlock *LoopHeader = L->getHeader(); 1890 BasicBlock *Preheader = L->getLoopPreheader(); 1891 1892 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1893 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1894 // may be used by SCEVExpander. The blocks will be un-linked from their 1895 // predecessors and removed from LI & DT at the end of the function.
1896 if (!UnionPred.isAlwaysTrue()) { 1897 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1898 nullptr, "vector.scevcheck"); 1899 1900 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1901 &UnionPred, SCEVCheckBlock->getTerminator()); 1902 } 1903 1904 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1905 if (RtPtrChecking.Need) { 1906 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1907 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1908 "vector.memcheck"); 1909 1910 std::tie(std::ignore, MemRuntimeCheckCond) = 1911 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1912 RtPtrChecking.getChecks(), MemCheckExp); 1913 assert(MemRuntimeCheckCond && 1914 "no RT checks generated although RtPtrChecking " 1915 "claimed checks are required"); 1916 } 1917 1918 if (!MemCheckBlock && !SCEVCheckBlock) 1919 return; 1920 1921 // Unhook the temporary block with the checks, update various places 1922 // accordingly. 1923 if (SCEVCheckBlock) 1924 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1925 if (MemCheckBlock) 1926 MemCheckBlock->replaceAllUsesWith(Preheader); 1927 1928 if (SCEVCheckBlock) { 1929 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1930 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1931 Preheader->getTerminator()->eraseFromParent(); 1932 } 1933 if (MemCheckBlock) { 1934 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1935 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1936 Preheader->getTerminator()->eraseFromParent(); 1937 } 1938 1939 DT->changeImmediateDominator(LoopHeader, Preheader); 1940 if (MemCheckBlock) { 1941 DT->eraseNode(MemCheckBlock); 1942 LI->removeBlock(MemCheckBlock); 1943 } 1944 if (SCEVCheckBlock) { 1945 DT->eraseNode(SCEVCheckBlock); 1946 LI->removeBlock(SCEVCheckBlock); 1947 } 1948 } 1949 1950 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1951 /// unused. 1952 ~GeneratedRTChecks() { 1953 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1954 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1955 if (!SCEVCheckCond) 1956 SCEVCleaner.markResultUsed(); 1957 1958 if (!MemRuntimeCheckCond) 1959 MemCheckCleaner.markResultUsed(); 1960 1961 if (MemRuntimeCheckCond) { 1962 auto &SE = *MemCheckExp.getSE(); 1963 // Memory runtime check generation creates compares that use expanded 1964 // values. Remove them before running the SCEVExpanderCleaners. 1965 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1966 if (MemCheckExp.isInsertedInstruction(&I)) 1967 continue; 1968 SE.forgetValue(&I); 1969 SE.eraseValueFromMap(&I); 1970 I.eraseFromParent(); 1971 } 1972 } 1973 MemCheckCleaner.cleanup(); 1974 SCEVCleaner.cleanup(); 1975 1976 if (SCEVCheckCond) 1977 SCEVCheckBlock->eraseFromParent(); 1978 if (MemRuntimeCheckCond) 1979 MemCheckBlock->eraseFromParent(); 1980 } 1981 1982 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1983 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1984 /// depending on the generated condition. 
1985 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 1986 BasicBlock *LoopVectorPreHeader, 1987 BasicBlock *LoopExitBlock) { 1988 if (!SCEVCheckCond) 1989 return nullptr; 1990 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 1991 if (C->isZero()) 1992 return nullptr; 1993 1994 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 1995 1996 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 1997 // Create new preheader for vector loop. 1998 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 1999 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2000 2001 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2002 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2003 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2004 SCEVCheckBlock); 2005 2006 DT->addNewBlock(SCEVCheckBlock, Pred); 2007 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2008 2009 ReplaceInstWithInst( 2010 SCEVCheckBlock->getTerminator(), 2011 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2012 // Mark the check as used, to prevent it from being removed during cleanup. 2013 SCEVCheckCond = nullptr; 2014 return SCEVCheckBlock; 2015 } 2016 2017 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2018 /// the branches to branch to the vector preheader or \p Bypass, depending on 2019 /// the generated condition. 2020 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2021 BasicBlock *LoopVectorPreHeader) { 2022 // Check if we generated code that checks at runtime whether arrays overlap. 2023 if (!MemRuntimeCheckCond) 2024 return nullptr; 2025 2026 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2027 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2028 MemCheckBlock); 2029 2030 DT->addNewBlock(MemCheckBlock, Pred); 2031 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2032 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2033 2034 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2035 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2036 2037 ReplaceInstWithInst( 2038 MemCheckBlock->getTerminator(), 2039 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2040 MemCheckBlock->getTerminator()->setDebugLoc( 2041 Pred->getTerminator()->getDebugLoc()); 2042 2043 // Mark the check as used, to prevent it from being removed during cleanup. 2044 MemRuntimeCheckCond = nullptr; 2045 return MemCheckBlock; 2046 } 2047 }; 2048 2049 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2050 // vectorization. The loop needs to be annotated with #pragma omp simd 2051 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the 2052 // vector length information is not provided, vectorization is not considered 2053 // explicit. Interleave hints are not allowed either. These limitations will be 2054 // relaxed in the future. 2055 // Please note that we are currently forced to abuse the pragma 'clang 2056 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2057 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2058 // provides *explicit vectorization hints* (LV can bypass legal checks and 2059 // assume that vectorization is legal). However, both hints are implemented 2060 // using the same metadata (llvm.loop.vectorize, processed by 2061 // LoopVectorizeHints). This will be fixed in the future when the native IR 2062 // representation for pragma 'omp simd' is introduced.
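// For illustration only (this snippet is not part of the pass), an outer loop
// annotated for explicit vectorization as described above might look like:
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // outer loop taken by the VPlan-native path
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
//
// which clang lowers to llvm.loop.vectorize.enable and
// llvm.loop.vectorize.width metadata that LoopVectorizeHints reads back here.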
2063 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2064 OptimizationRemarkEmitter *ORE) { 2065 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2066 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2067 2068 // Only outer loops with an explicit vectorization hint are supported. 2069 // Unannotated outer loops are ignored. 2070 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2071 return false; 2072 2073 Function *Fn = OuterLp->getHeader()->getParent(); 2074 if (!Hints.allowVectorization(Fn, OuterLp, 2075 true /*VectorizeOnlyWhenForced*/)) { 2076 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2077 return false; 2078 } 2079 2080 if (Hints.getInterleave() > 1) { 2081 // TODO: Interleave support is future work. 2082 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2083 "outer loops.\n"); 2084 Hints.emitRemarkWithHints(); 2085 return false; 2086 } 2087 2088 return true; 2089 } 2090 2091 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2092 OptimizationRemarkEmitter *ORE, 2093 SmallVectorImpl<Loop *> &V) { 2094 // Collect inner loops and outer loops without irreducible control flow. For 2095 // now, only collect outer loops that have explicit vectorization hints. If we 2096 // are stress testing the VPlan H-CFG construction, we collect the outermost 2097 // loop of every loop nest. 2098 if (L.isInnermost() || VPlanBuildStressTest || 2099 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2100 LoopBlocksRPO RPOT(&L); 2101 RPOT.perform(LI); 2102 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2103 V.push_back(&L); 2104 // TODO: Collect inner loops inside marked outer loops in case 2105 // vectorization fails for the outer loop. Do not invoke 2106 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2107 // already known to be reducible. We can use an inherited attribute for 2108 // that. 2109 return; 2110 } 2111 } 2112 for (Loop *InnerL : L) 2113 collectSupportedLoops(*InnerL, LI, ORE, V); 2114 } 2115 2116 namespace { 2117 2118 /// The LoopVectorize Pass. 2119 struct LoopVectorize : public FunctionPass { 2120 /// Pass identification, replacement for typeid 2121 static char ID; 2122 2123 LoopVectorizePass Impl; 2124 2125 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2126 bool VectorizeOnlyWhenForced = false) 2127 : FunctionPass(ID), 2128 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2129 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2130 } 2131 2132 bool runOnFunction(Function &F) override { 2133 if (skipFunction(F)) 2134 return false; 2135 2136 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2137 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2138 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2139 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2140 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2141 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2142 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2143 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2144 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2145 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2146 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2147 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2148 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2149 2150 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2151 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2152 2153 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2154 GetLAA, *ORE, PSI).MadeAnyChange; 2155 } 2156 2157 void getAnalysisUsage(AnalysisUsage &AU) const override { 2158 AU.addRequired<AssumptionCacheTracker>(); 2159 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2160 AU.addRequired<DominatorTreeWrapperPass>(); 2161 AU.addRequired<LoopInfoWrapperPass>(); 2162 AU.addRequired<ScalarEvolutionWrapperPass>(); 2163 AU.addRequired<TargetTransformInfoWrapperPass>(); 2164 AU.addRequired<AAResultsWrapperPass>(); 2165 AU.addRequired<LoopAccessLegacyAnalysis>(); 2166 AU.addRequired<DemandedBitsWrapperPass>(); 2167 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2168 AU.addRequired<InjectTLIMappingsLegacy>(); 2169 2170 // We currently do not preserve loopinfo/dominator analyses with outer loop 2171 // vectorization. Until this is addressed, mark these analyses as preserved 2172 // only for non-VPlan-native path. 2173 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2174 if (!EnableVPlanNativePath) { 2175 AU.addPreserved<LoopInfoWrapperPass>(); 2176 AU.addPreserved<DominatorTreeWrapperPass>(); 2177 } 2178 2179 AU.addPreserved<BasicAAWrapperPass>(); 2180 AU.addPreserved<GlobalsAAWrapperPass>(); 2181 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2182 } 2183 }; 2184 2185 } // end anonymous namespace 2186 2187 //===----------------------------------------------------------------------===// 2188 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2189 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2190 //===----------------------------------------------------------------------===// 2191 2192 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2193 // We need to place the broadcast of invariant variables outside the loop, 2194 // but only if it's proven safe to do so. Else, broadcast will be inside 2195 // vector loop body. 2196 Instruction *Instr = dyn_cast<Instruction>(V); 2197 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2198 (!Instr || 2199 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2200 // Place the code for broadcasting invariant variables in the new preheader. 2201 IRBuilder<>::InsertPointGuard Guard(Builder); 2202 if (SafeToHoist) 2203 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2204 2205 // Broadcast the scalar into all locations in the vector. 
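// As an illustrative sketch only (assuming a fixed VF of 4 and an i32 value),
// the splat created below is roughly equivalent to IR of the form:
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer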
2206 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2207 2208 return Shuf; 2209 } 2210 2211 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2212 const InductionDescriptor &II, Value *Step, Value *Start, 2213 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2214 VPTransformState &State) { 2215 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2216 "Expected either an induction phi-node or a truncate of it!"); 2217 2218 // Construct the initial value of the vector IV in the vector loop preheader 2219 auto CurrIP = Builder.saveIP(); 2220 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2221 if (isa<TruncInst>(EntryVal)) { 2222 assert(Start->getType()->isIntegerTy() && 2223 "Truncation requires an integer type"); 2224 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2225 Step = Builder.CreateTrunc(Step, TruncType); 2226 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2227 } 2228 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2229 Value *SteppedStart = 2230 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2231 2232 // We create vector phi nodes for both integer and floating-point induction 2233 // variables. Here, we determine the kind of arithmetic we will perform. 2234 Instruction::BinaryOps AddOp; 2235 Instruction::BinaryOps MulOp; 2236 if (Step->getType()->isIntegerTy()) { 2237 AddOp = Instruction::Add; 2238 MulOp = Instruction::Mul; 2239 } else { 2240 AddOp = II.getInductionOpcode(); 2241 MulOp = Instruction::FMul; 2242 } 2243 2244 // Multiply the vectorization factor by the step using integer or 2245 // floating-point arithmetic as appropriate. 2246 Value *ConstVF = 2247 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2248 Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF); 2249 2250 // Create a vector splat to use in the induction update. 2251 // 2252 // FIXME: If the step is non-constant, we create the vector splat with 2253 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2254 // handle a constant vector splat. 2255 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2256 Value *SplatVF = isa<Constant>(Mul) 2257 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2258 : Builder.CreateVectorSplat(VF, Mul); 2259 Builder.restoreIP(CurrIP); 2260 2261 // We may need to add the step a number of times, depending on the unroll 2262 // factor. The last of those goes into the PHI. 2263 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2264 &*LoopVectorBody->getFirstInsertionPt()); 2265 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2266 Instruction *LastInduction = VecInd; 2267 for (unsigned Part = 0; Part < UF; ++Part) { 2268 State.set(Def, LastInduction, Part); 2269 2270 if (isa<TruncInst>(EntryVal)) 2271 addMetadata(LastInduction, EntryVal); 2272 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2273 State, Part); 2274 2275 LastInduction = cast<Instruction>( 2276 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2277 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2278 } 2279 2280 // Move the last step to the end of the latch block. This ensures consistent 2281 // placement of all induction updates. 
2282 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2283 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2284 auto *ICmp = cast<Instruction>(Br->getCondition()); 2285 LastInduction->moveBefore(ICmp); 2286 LastInduction->setName("vec.ind.next"); 2287 2288 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2289 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2290 } 2291 2292 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2293 return Cost->isScalarAfterVectorization(I, VF) || 2294 Cost->isProfitableToScalarize(I, VF); 2295 } 2296 2297 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2298 if (shouldScalarizeInstruction(IV)) 2299 return true; 2300 auto isScalarInst = [&](User *U) -> bool { 2301 auto *I = cast<Instruction>(U); 2302 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2303 }; 2304 return llvm::any_of(IV->users(), isScalarInst); 2305 } 2306 2307 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2308 const InductionDescriptor &ID, const Instruction *EntryVal, 2309 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2310 unsigned Part, unsigned Lane) { 2311 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2312 "Expected either an induction phi-node or a truncate of it!"); 2313 2314 // This induction variable is not the phi from the original loop but the 2315 // newly-created IV based on the proof that casted Phi is equal to the 2316 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2317 // re-uses the same InductionDescriptor that original IV uses but we don't 2318 // have to do any recording in this case - that is done when original IV is 2319 // processed. 2320 if (isa<TruncInst>(EntryVal)) 2321 return; 2322 2323 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2324 if (Casts.empty()) 2325 return; 2326 // Only the first Cast instruction in the Casts vector is of interest. 2327 // The rest of the Casts (if exist) have no uses outside the 2328 // induction update chain itself. 2329 if (Lane < UINT_MAX) 2330 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2331 else 2332 State.set(CastDef, VectorLoopVal, Part); 2333 } 2334 2335 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2336 TruncInst *Trunc, VPValue *Def, 2337 VPValue *CastDef, 2338 VPTransformState &State) { 2339 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2340 "Primary induction variable must have an integer type"); 2341 2342 auto II = Legal->getInductionVars().find(IV); 2343 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2344 2345 auto ID = II->second; 2346 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2347 2348 // The value from the original loop to which we are mapping the new induction 2349 // variable. 2350 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2351 2352 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2353 2354 // Generate code for the induction step. 
Note that induction steps are 2355 // required to be loop-invariant 2356 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2357 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2358 "Induction step should be loop invariant"); 2359 if (PSE.getSE()->isSCEVable(IV->getType())) { 2360 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2361 return Exp.expandCodeFor(Step, Step->getType(), 2362 LoopVectorPreHeader->getTerminator()); 2363 } 2364 return cast<SCEVUnknown>(Step)->getValue(); 2365 }; 2366 2367 // The scalar value to broadcast. This is derived from the canonical 2368 // induction variable. If a truncation type is given, truncate the canonical 2369 // induction variable and step. Otherwise, derive these values from the 2370 // induction descriptor. 2371 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2372 Value *ScalarIV = Induction; 2373 if (IV != OldInduction) { 2374 ScalarIV = IV->getType()->isIntegerTy() 2375 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2376 : Builder.CreateCast(Instruction::SIToFP, Induction, 2377 IV->getType()); 2378 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2379 ScalarIV->setName("offset.idx"); 2380 } 2381 if (Trunc) { 2382 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2383 assert(Step->getType()->isIntegerTy() && 2384 "Truncation requires an integer step"); 2385 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2386 Step = Builder.CreateTrunc(Step, TruncType); 2387 } 2388 return ScalarIV; 2389 }; 2390 2391 // Create the vector values from the scalar IV, in the absence of creating a 2392 // vector IV. 2393 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2394 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2395 for (unsigned Part = 0; Part < UF; ++Part) { 2396 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2397 Value *EntryPart = 2398 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2399 ID.getInductionOpcode()); 2400 State.set(Def, EntryPart, Part); 2401 if (Trunc) 2402 addMetadata(EntryPart, Trunc); 2403 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2404 State, Part); 2405 } 2406 }; 2407 2408 // Fast-math-flags propagate from the original induction instruction. 2409 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2410 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2411 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2412 2413 // Now do the actual transformations, and start with creating the step value. 2414 Value *Step = CreateStepValue(ID.getStep()); 2415 if (VF.isZero() || VF.isScalar()) { 2416 Value *ScalarIV = CreateScalarIV(Step); 2417 CreateSplatIV(ScalarIV, Step); 2418 return; 2419 } 2420 2421 // Determine if we want a scalar version of the induction variable. This is 2422 // true if the induction variable itself is not widened, or if it has at 2423 // least one user in the loop that is not widened. 2424 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2425 if (!NeedsScalarIV) { 2426 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2427 State); 2428 return; 2429 } 2430 2431 // Try to create a new independent vector induction variable. If we can't 2432 // create the phi node, we will splat the scalar induction variable in each 2433 // loop iteration. 
2434 if (!shouldScalarizeInstruction(EntryVal)) { 2435 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2436 State); 2437 Value *ScalarIV = CreateScalarIV(Step); 2438 // Create scalar steps that can be used by instructions we will later 2439 // scalarize. Note that the addition of the scalar steps will not increase 2440 // the number of instructions in the loop in the common case prior to 2441 // InstCombine. We will be trading one vector extract for each scalar step. 2442 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2443 return; 2444 } 2445 2446 // All IV users are scalar instructions, so only emit a scalar IV, not a 2447 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2448 // predicate used by the masked loads/stores. 2449 Value *ScalarIV = CreateScalarIV(Step); 2450 if (!Cost->isScalarEpilogueAllowed()) 2451 CreateSplatIV(ScalarIV, Step); 2452 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2453 } 2454 2455 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2456 Instruction::BinaryOps BinOp) { 2457 // Create and check the types. 2458 assert(isa<FixedVectorType>(Val->getType()) && 2459 "Creation of scalable step vector not yet supported"); 2460 auto *ValVTy = cast<VectorType>(Val->getType()); 2461 ElementCount VLen = ValVTy->getElementCount(); 2462 2463 Type *STy = Val->getType()->getScalarType(); 2464 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2465 "Induction Step must be an integer or FP"); 2466 assert(Step->getType() == STy && "Step has wrong type"); 2467 2468 SmallVector<Constant *, 8> Indices; 2469 2470 // Create a vector of consecutive numbers from zero to VF. 2471 VectorType *InitVecValVTy = ValVTy; 2472 Type *InitVecValSTy = STy; 2473 if (STy->isFloatingPointTy()) { 2474 InitVecValSTy = 2475 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2476 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2477 } 2478 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2479 2480 // Add on StartIdx 2481 Value *StartIdxSplat = Builder.CreateVectorSplat( 2482 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2483 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2484 2485 if (STy->isIntegerTy()) { 2486 Step = Builder.CreateVectorSplat(VLen, Step); 2487 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2488 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2489 // which can be found from the original scalar operations. 2490 Step = Builder.CreateMul(InitVec, Step); 2491 return Builder.CreateAdd(Val, Step, "induction"); 2492 } 2493 2494 // Floating point induction. 2495 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2496 "Binary Opcode should be specified for FP induction"); 2497 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2498 Step = Builder.CreateVectorSplat(VLen, Step); 2499 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2500 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2501 } 2502 2503 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2504 Instruction *EntryVal, 2505 const InductionDescriptor &ID, 2506 VPValue *Def, VPValue *CastDef, 2507 VPTransformState &State) { 2508 // We shouldn't have to build scalar steps if we aren't vectorizing. 2509 assert(VF.isVector() && "VF should be greater than one"); 2510 // Get the value type and ensure it and the step have the same integer type. 
2511 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2512 assert(ScalarIVTy == Step->getType() && 2513 "Val and Step should have the same type"); 2514 2515 // We build scalar steps for both integer and floating-point induction 2516 // variables. Here, we determine the kind of arithmetic we will perform. 2517 Instruction::BinaryOps AddOp; 2518 Instruction::BinaryOps MulOp; 2519 if (ScalarIVTy->isIntegerTy()) { 2520 AddOp = Instruction::Add; 2521 MulOp = Instruction::Mul; 2522 } else { 2523 AddOp = ID.getInductionOpcode(); 2524 MulOp = Instruction::FMul; 2525 } 2526 2527 // Determine the number of scalars we need to generate for each unroll 2528 // iteration. If EntryVal is uniform, we only need to generate the first 2529 // lane. Otherwise, we generate all VF values. 2530 unsigned Lanes = 2531 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2532 ? 1 2533 : VF.getKnownMinValue(); 2534 assert((!VF.isScalable() || Lanes == 1) && 2535 "Should never scalarize a scalable vector"); 2536 // Compute the scalar steps and save the results in State. 2537 for (unsigned Part = 0; Part < UF; ++Part) { 2538 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2539 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2540 ScalarIVTy->getScalarSizeInBits()); 2541 Value *StartIdx = 2542 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2543 if (ScalarIVTy->isFloatingPointTy()) 2544 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2545 StartIdx = Builder.CreateBinOp( 2546 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2547 // The step returned by `createStepForVF` is a runtime-evaluated value 2548 // when VF is scalable. Otherwise, it should be folded into a Constant. 2549 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2550 "Expected StartIdx to be folded to a constant when VF is not " 2551 "scalable"); 2552 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2553 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2554 State.set(Def, Add, VPIteration(Part, Lane)); 2555 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2556 Part, Lane); 2557 } 2558 } 2559 } 2560 2561 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2562 const VPIteration &Instance, 2563 VPTransformState &State) { 2564 Value *ScalarInst = State.get(Def, Instance); 2565 Value *VectorValue = State.get(Def, Instance.Part); 2566 VectorValue = Builder.CreateInsertElement( 2567 VectorValue, ScalarInst, 2568 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2569 State.set(Def, VectorValue, Instance.Part); 2570 } 2571 2572 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2573 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2574 return Builder.CreateVectorReverse(Vec, "reverse"); 2575 } 2576 2577 // Return whether we allow using masked interleave-groups (for dealing with 2578 // strided loads/stores that reside in predicated blocks, or for dealing 2579 // with gaps). 2580 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2581 // If an override option has been passed in for interleaved accesses, use it. 2582 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2583 return EnableMaskedInterleavedMemAccesses; 2584 2585 return TTI.enableMaskedInterleavedAccessVectorization(); 2586 } 2587 2588 // Try to vectorize the interleave group that \p Instr belongs to. 2589 // 2590 // E.g. 
Translate the following interleaved load group (factor = 3): 2591 // for (i = 0; i < N; i+=3) { 2592 // R = Pic[i]; // Member of index 0 2593 // G = Pic[i+1]; // Member of index 1 2594 // B = Pic[i+2]; // Member of index 2 2595 // ... // do something to R, G, B 2596 // } 2597 // To: 2598 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2599 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2600 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2601 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2602 // 2603 // Or translate the following interleaved store group (factor = 3): 2604 // for (i = 0; i < N; i+=3) { 2605 // ... do something to R, G, B 2606 // Pic[i] = R; // Member of index 0 2607 // Pic[i+1] = G; // Member of index 1 2608 // Pic[i+2] = B; // Member of index 2 2609 // } 2610 // To: 2611 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2612 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2613 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2614 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2615 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2616 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2617 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2618 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2619 VPValue *BlockInMask) { 2620 Instruction *Instr = Group->getInsertPos(); 2621 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2622 2623 // Prepare for the vector type of the interleaved load/store. 2624 Type *ScalarTy = getMemInstValueType(Instr); 2625 unsigned InterleaveFactor = Group->getFactor(); 2626 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2627 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2628 2629 // Prepare for the new pointers. 2630 SmallVector<Value *, 2> AddrParts; 2631 unsigned Index = Group->getIndex(Instr); 2632 2633 // TODO: extend the masked interleaved-group support to reversed access. 2634 assert((!BlockInMask || !Group->isReverse()) && 2635 "Reversed masked interleave-group not supported."); 2636 2637 // If the group is reverse, adjust the index to refer to the last vector lane 2638 // instead of the first. We adjust the index from the first vector lane, 2639 // rather than directly getting the pointer for lane VF - 1, because the 2640 // pointer operand of the interleaved access is supposed to be uniform. For 2641 // uniform instructions, we're only required to generate a value for the 2642 // first vector lane in each unroll iteration. 2643 assert(!VF.isScalable() && 2644 "scalable vector reverse operation is not implemented"); 2645 if (Group->isReverse()) 2646 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2647 2648 for (unsigned Part = 0; Part < UF; Part++) { 2649 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2650 setDebugLocFromInst(Builder, AddrPart); 2651 2652 // Note that the current instruction could have any member index; we need to 2653 // adjust the address to the member of index 0. 2654 // 2655 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2656 // b = A[i]; // Member of index 0 2657 // The current pointer points to A[i+1]; adjust it to A[i]. 2658 // 2659 // E.g. A[i+1] = a; // Member of index 1 2660 // A[i] = b; // Member of index 0 2661 // A[i+2] = c; // Member of index 2 (Current instruction) 2662 // The current pointer points to A[i+2]; adjust it to A[i].
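// (Illustrative note: with a group member index of 2, the GEP created below
// uses the offset -Index == -2, which moves the pointer from A[i+2] back to
// the group's first member A[i].)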
2663 2664 bool InBounds = false; 2665 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2666 InBounds = gep->isInBounds(); 2667 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2668 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2669 2670 // Cast to the vector pointer type. 2671 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2672 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2673 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2674 } 2675 2676 setDebugLocFromInst(Builder, Instr); 2677 Value *PoisonVec = PoisonValue::get(VecTy); 2678 2679 Value *MaskForGaps = nullptr; 2680 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2681 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2682 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2683 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2684 } 2685 2686 // Vectorize the interleaved load group. 2687 if (isa<LoadInst>(Instr)) { 2688 // For each unroll part, create a wide load for the group. 2689 SmallVector<Value *, 2> NewLoads; 2690 for (unsigned Part = 0; Part < UF; Part++) { 2691 Instruction *NewLoad; 2692 if (BlockInMask || MaskForGaps) { 2693 assert(useMaskedInterleavedAccesses(*TTI) && 2694 "masked interleaved groups are not allowed."); 2695 Value *GroupMask = MaskForGaps; 2696 if (BlockInMask) { 2697 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2698 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2699 Value *ShuffledMask = Builder.CreateShuffleVector( 2700 BlockInMaskPart, 2701 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2702 "interleaved.mask"); 2703 GroupMask = MaskForGaps 2704 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2705 MaskForGaps) 2706 : ShuffledMask; 2707 } 2708 NewLoad = 2709 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2710 GroupMask, PoisonVec, "wide.masked.vec"); 2711 } 2712 else 2713 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2714 Group->getAlign(), "wide.vec"); 2715 Group->addMetadata(NewLoad); 2716 NewLoads.push_back(NewLoad); 2717 } 2718 2719 // For each member in the group, shuffle out the appropriate data from the 2720 // wide loads. 2721 unsigned J = 0; 2722 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2723 Instruction *Member = Group->getMember(I); 2724 2725 // Skip the gaps in the group. 2726 if (!Member) 2727 continue; 2728 2729 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2730 auto StrideMask = 2731 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2732 for (unsigned Part = 0; Part < UF; Part++) { 2733 Value *StridedVec = Builder.CreateShuffleVector( 2734 NewLoads[Part], StrideMask, "strided.vec"); 2735 2736 // If this member has different type, cast the result type. 2737 if (Member->getType() != ScalarTy) { 2738 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2739 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2740 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2741 } 2742 2743 if (Group->isReverse()) 2744 StridedVec = reverseVector(StridedVec); 2745 2746 State.set(VPDefs[J], StridedVec, Part); 2747 } 2748 ++J; 2749 } 2750 return; 2751 } 2752 2753 // The sub vector type for current instruction. 
2754 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2755 auto *SubVT = VectorType::get(ScalarTy, VF); 2756 2757 // Vectorize the interleaved store group. 2758 for (unsigned Part = 0; Part < UF; Part++) { 2759 // Collect the stored vector from each member. 2760 SmallVector<Value *, 4> StoredVecs; 2761 for (unsigned i = 0; i < InterleaveFactor; i++) { 2762 // Interleaved store group doesn't allow a gap, so each index has a member 2763 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2764 2765 Value *StoredVec = State.get(StoredValues[i], Part); 2766 2767 if (Group->isReverse()) 2768 StoredVec = reverseVector(StoredVec); 2769 2770 // If this member has different type, cast it to a unified type. 2771 2772 if (StoredVec->getType() != SubVT) 2773 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2774 2775 StoredVecs.push_back(StoredVec); 2776 } 2777 2778 // Concatenate all vectors into a wide vector. 2779 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2780 2781 // Interleave the elements in the wide vector. 2782 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2783 Value *IVec = Builder.CreateShuffleVector( 2784 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2785 "interleaved.vec"); 2786 2787 Instruction *NewStoreInstr; 2788 if (BlockInMask) { 2789 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2790 Value *ShuffledMask = Builder.CreateShuffleVector( 2791 BlockInMaskPart, 2792 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2793 "interleaved.mask"); 2794 NewStoreInstr = Builder.CreateMaskedStore( 2795 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2796 } 2797 else 2798 NewStoreInstr = 2799 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2800 2801 Group->addMetadata(NewStoreInstr); 2802 } 2803 } 2804 2805 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2806 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2807 VPValue *StoredValue, VPValue *BlockInMask) { 2808 // Attempt to issue a wide load. 2809 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2810 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2811 2812 assert((LI || SI) && "Invalid Load/Store instruction"); 2813 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2814 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2815 2816 LoopVectorizationCostModel::InstWidening Decision = 2817 Cost->getWideningDecision(Instr, VF); 2818 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2819 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2820 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2821 "CM decision is not to widen the memory instruction"); 2822 2823 Type *ScalarDataTy = getMemInstValueType(Instr); 2824 2825 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2826 const Align Alignment = getLoadStoreAlignment(Instr); 2827 2828 // Determine if the pointer operand of the access is either consecutive or 2829 // reverse consecutive. 2830 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2831 bool ConsecutiveStride = 2832 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2833 bool CreateGatherScatter = 2834 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2835 2836 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2837 // gather/scatter. Otherwise Decision should have been to Scalarize. 
2838 assert((ConsecutiveStride || CreateGatherScatter) && 2839 "The instruction should be scalarized"); 2840 (void)ConsecutiveStride; 2841 2842 VectorParts BlockInMaskParts(UF); 2843 bool isMaskRequired = BlockInMask; 2844 if (isMaskRequired) 2845 for (unsigned Part = 0; Part < UF; ++Part) 2846 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2847 2848 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2849 // Calculate the pointer for the specific unroll-part. 2850 GetElementPtrInst *PartPtr = nullptr; 2851 2852 bool InBounds = false; 2853 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2854 InBounds = gep->isInBounds(); 2855 if (Reverse) { 2856 // If the address is consecutive but reversed, then the 2857 // wide store needs to start at the last vector element. 2858 // RunTimeVF = VScale * VF.getKnownMinValue() 2859 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2860 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2861 // NumElt = -Part * RunTimeVF 2862 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2863 // LastLane = 1 - RunTimeVF 2864 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2865 PartPtr = 2866 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2867 PartPtr->setIsInBounds(InBounds); 2868 PartPtr = cast<GetElementPtrInst>( 2869 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2870 PartPtr->setIsInBounds(InBounds); 2871 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2872 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2873 } else { 2874 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2875 PartPtr = cast<GetElementPtrInst>( 2876 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2877 PartPtr->setIsInBounds(InBounds); 2878 } 2879 2880 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2881 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2882 }; 2883 2884 // Handle Stores: 2885 if (SI) { 2886 setDebugLocFromInst(Builder, SI); 2887 2888 for (unsigned Part = 0; Part < UF; ++Part) { 2889 Instruction *NewSI = nullptr; 2890 Value *StoredVal = State.get(StoredValue, Part); 2891 if (CreateGatherScatter) { 2892 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2893 Value *VectorGep = State.get(Addr, Part); 2894 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2895 MaskPart); 2896 } else { 2897 if (Reverse) { 2898 // If we store to reverse consecutive memory locations, then we need 2899 // to reverse the order of elements in the stored value. 2900 StoredVal = reverseVector(StoredVal); 2901 // We don't want to update the value in the map as it might be used in 2902 // another expression. So don't call resetVectorValue(StoredVal). 2903 } 2904 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2905 if (isMaskRequired) 2906 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2907 BlockInMaskParts[Part]); 2908 else 2909 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2910 } 2911 addMetadata(NewSI, SI); 2912 } 2913 return; 2914 } 2915 2916 // Handle loads. 2917 assert(LI && "Must have a load instruction"); 2918 setDebugLocFromInst(Builder, LI); 2919 for (unsigned Part = 0; Part < UF; ++Part) { 2920 Value *NewLI; 2921 if (CreateGatherScatter) { 2922 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2923 Value *VectorGep = State.get(Addr, Part); 2924 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2925 nullptr, "wide.masked.gather"); 2926 addMetadata(NewLI, LI); 2927 } else { 2928 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2929 if (isMaskRequired) 2930 NewLI = Builder.CreateMaskedLoad( 2931 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2932 "wide.masked.load"); 2933 else 2934 NewLI = 2935 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2936 2937 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2938 addMetadata(NewLI, LI); 2939 if (Reverse) 2940 NewLI = reverseVector(NewLI); 2941 } 2942 2943 State.set(Def, NewLI, Part); 2944 } 2945 } 2946 2947 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 2948 VPUser &User, 2949 const VPIteration &Instance, 2950 bool IfPredicateInstr, 2951 VPTransformState &State) { 2952 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2953 2954 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2955 // the first lane and part. 2956 if (isa<NoAliasScopeDeclInst>(Instr)) 2957 if (!Instance.isFirstIteration()) 2958 return; 2959 2960 setDebugLocFromInst(Builder, Instr); 2961 2962 // Does this instruction return a value ? 2963 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2964 2965 Instruction *Cloned = Instr->clone(); 2966 if (!IsVoidRetTy) 2967 Cloned->setName(Instr->getName() + ".cloned"); 2968 2969 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2970 Builder.GetInsertPoint()); 2971 // Replace the operands of the cloned instructions with their scalar 2972 // equivalents in the new loop. 2973 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2974 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2975 auto InputInstance = Instance; 2976 if (!Operand || !OrigLoop->contains(Operand) || 2977 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2978 InputInstance.Lane = VPLane::getFirstLane(); 2979 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2980 Cloned->setOperand(op, NewOp); 2981 } 2982 addNewMetadata(Cloned, Instr); 2983 2984 // Place the cloned scalar in the new loop. 2985 Builder.Insert(Cloned); 2986 2987 State.set(Def, Cloned, Instance); 2988 2989 // If we just cloned a new assumption, add it the assumption cache. 2990 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2991 if (II->getIntrinsicID() == Intrinsic::assume) 2992 AC->registerAssumption(II); 2993 2994 // End if-block. 2995 if (IfPredicateInstr) 2996 PredicatedInstructions.push_back(Cloned); 2997 } 2998 2999 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3000 Value *End, Value *Step, 3001 Instruction *DL) { 3002 BasicBlock *Header = L->getHeader(); 3003 BasicBlock *Latch = L->getLoopLatch(); 3004 // As we're just creating this loop, it's possible no latch exists 3005 // yet. If so, use the header as this will be a single block loop. 3006 if (!Latch) 3007 Latch = Header; 3008 3009 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3010 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3011 setDebugLocFromInst(Builder, OldInst); 3012 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3013 3014 Builder.SetInsertPoint(Latch->getTerminator()); 3015 setDebugLocFromInst(Builder, OldInst); 3016 3017 // Create i+1 and fill the PHINode. 
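  // As a rough sketch (types and block names illustrative), for Start = 0 this
  // builds:
  //   %index = phi i64 [ 0, %preheader ], [ %index.next, %latch ]
  //   %index.next = add i64 %index, %step
  //   %cmp = icmp eq i64 %index.next, %end
  //   br i1 %cmp, label %exit, label %header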
3018 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3019 Induction->addIncoming(Start, L->getLoopPreheader()); 3020 Induction->addIncoming(Next, Latch); 3021 // Create the compare. 3022 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3023 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3024 3025 // Now we have two terminators. Remove the old one from the block. 3026 Latch->getTerminator()->eraseFromParent(); 3027 3028 return Induction; 3029 } 3030 3031 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3032 if (TripCount) 3033 return TripCount; 3034 3035 assert(L && "Create Trip Count for null loop."); 3036 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3037 // Find the loop boundaries. 3038 ScalarEvolution *SE = PSE.getSE(); 3039 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3040 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3041 "Invalid loop count"); 3042 3043 Type *IdxTy = Legal->getWidestInductionType(); 3044 assert(IdxTy && "No type for induction"); 3045 3046 // The exit count might have the type of i64 while the phi is i32. This can 3047 // happen if we have an induction variable that is sign extended before the 3048 // compare. The only way that we get a backedge taken count is that the 3049 // induction variable was signed and as such will not overflow. In such a case 3050 // truncation is legal. 3051 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3052 IdxTy->getPrimitiveSizeInBits()) 3053 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3054 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3055 3056 // Get the total trip count from the count by adding 1. 3057 const SCEV *ExitCount = SE->getAddExpr( 3058 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3059 3060 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3061 3062 // Expand the trip count and place the new instructions in the preheader. 3063 // Notice that the pre-header does not change, only the loop body. 3064 SCEVExpander Exp(*SE, DL, "induction"); 3065 3066 // Count holds the overall loop count (N). 3067 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3068 L->getLoopPreheader()->getTerminator()); 3069 3070 if (TripCount->getType()->isPointerTy()) 3071 TripCount = 3072 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3073 L->getLoopPreheader()->getTerminator()); 3074 3075 return TripCount; 3076 } 3077 3078 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3079 if (VectorTripCount) 3080 return VectorTripCount; 3081 3082 Value *TC = getOrCreateTripCount(L); 3083 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3084 3085 Type *Ty = TC->getType(); 3086 // This is where we can make the step a runtime constant. 3087 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3088 3089 // If the tail is to be folded by masking, round the number of iterations N 3090 // up to a multiple of Step instead of rounding down. This is done by first 3091 // adding Step-1 and then rounding down. Note that it's ok if this addition 3092 // overflows: the vector induction variable will eventually wrap to zero given 3093 // that it starts at zero and its Step is a power of two; the loop will then 3094 // exit, with the last early-exit vector comparison also producing all-true. 
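  // As a concrete example (numbers illustrative): with VF * UF = 4 and N = 10,
  // folding the tail rounds N up to 13, so the URem below yields 1 and the
  // vector trip count becomes 12; without tail folding the remainder is 2 and
  // the vector trip count is 8, leaving 2 iterations for the scalar loop.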
3095 if (Cost->foldTailByMasking()) { 3096 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3097 "VF*UF must be a power of 2 when folding tail by masking"); 3098 assert(!VF.isScalable() && 3099 "Tail folding not yet supported for scalable vectors"); 3100 TC = Builder.CreateAdd( 3101 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3102 } 3103 3104 // Now we need to generate the expression for the part of the loop that the 3105 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3106 // iterations are not required for correctness, or N - Step, otherwise. Step 3107 // is equal to the vectorization factor (number of SIMD elements) times the 3108 // unroll factor (number of SIMD instructions). 3109 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3110 3111 // There are two cases where we need to ensure (at least) the last iteration 3112 // runs in the scalar remainder loop. Thus, if the step evenly divides 3113 // the trip count, we set the remainder to be equal to the step. If the step 3114 // does not evenly divide the trip count, no adjustment is necessary since 3115 // there will already be scalar iterations. Note that the minimum iterations 3116 // check ensures that N >= Step. The cases are: 3117 // 1) If there is a non-reversed interleaved group that may speculatively 3118 // access memory out-of-bounds. 3119 // 2) If any instruction may follow a conditionally taken exit. That is, if 3120 // the loop contains multiple exiting blocks, or a single exiting block 3121 // which is not the latch. 3122 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3123 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3124 R = Builder.CreateSelect(IsZero, Step, R); 3125 } 3126 3127 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3128 3129 return VectorTripCount; 3130 } 3131 3132 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3133 const DataLayout &DL) { 3134 // Verify that V is a vector type with same number of elements as DstVTy. 3135 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3136 unsigned VF = DstFVTy->getNumElements(); 3137 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3138 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3139 Type *SrcElemTy = SrcVecTy->getElementType(); 3140 Type *DstElemTy = DstFVTy->getElementType(); 3141 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3142 "Vector elements must have same size"); 3143 3144 // Do a direct cast if element types are castable. 3145 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3146 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3147 } 3148 // V cannot be directly casted to desired vector type. 3149 // May happen when V is a floating point vector but DstVTy is a vector of 3150 // pointers or vice-versa. Handle this using a two-step bitcast using an 3151 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
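  // For example, on a target with 64-bit pointers, <4 x double> is converted
  // to <4 x i8*> as <4 x double> -> <4 x i64> (bitcast) followed by
  // <4 x i64> -> <4 x i8*> (inttoptr); the opposite direction uses ptrtoint
  // followed by a bitcast.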
3152 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3153 "Only one type should be a pointer type"); 3154 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3155 "Only one type should be a floating point type"); 3156 Type *IntTy = 3157 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3158 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3159 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3160 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3161 } 3162 3163 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3164 BasicBlock *Bypass) { 3165 Value *Count = getOrCreateTripCount(L); 3166 // Reuse existing vector loop preheader for TC checks. 3167 // Note that new preheader block is generated for vector loop. 3168 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3169 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3170 3171 // Generate code to check if the loop's trip count is less than VF * UF, or 3172 // equal to it in case a scalar epilogue is required; this implies that the 3173 // vector trip count is zero. This check also covers the case where adding one 3174 // to the backedge-taken count overflowed leading to an incorrect trip count 3175 // of zero. In this case we will also jump to the scalar loop. 3176 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3177 : ICmpInst::ICMP_ULT; 3178 3179 // If tail is to be folded, vector loop takes care of all iterations. 3180 Value *CheckMinIters = Builder.getFalse(); 3181 if (!Cost->foldTailByMasking()) { 3182 Value *Step = 3183 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3184 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3185 } 3186 // Create new preheader for vector loop. 3187 LoopVectorPreHeader = 3188 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3189 "vector.ph"); 3190 3191 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3192 DT->getNode(Bypass)->getIDom()) && 3193 "TC check is expected to dominate Bypass"); 3194 3195 // Update dominator for Bypass & LoopExit. 3196 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3197 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3198 3199 ReplaceInstWithInst( 3200 TCCheckBlock->getTerminator(), 3201 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3202 LoopBypassBlocks.push_back(TCCheckBlock); 3203 } 3204 3205 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3206 3207 BasicBlock *const SCEVCheckBlock = 3208 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3209 if (!SCEVCheckBlock) 3210 return nullptr; 3211 3212 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3213 (OptForSizeBasedOnProfile && 3214 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3215 "Cannot SCEV check stride or overflow when optimizing for size"); 3216 3217 3218 // Update dominator only if this is first RT check. 3219 if (LoopBypassBlocks.empty()) { 3220 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3221 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3222 } 3223 3224 LoopBypassBlocks.push_back(SCEVCheckBlock); 3225 AddedSafetyChecks = true; 3226 return SCEVCheckBlock; 3227 } 3228 3229 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3230 BasicBlock *Bypass) { 3231 // VPlan-native path does not do any analysis for runtime checks currently. 
3232 if (EnableVPlanNativePath) 3233 return nullptr; 3234 3235 BasicBlock *const MemCheckBlock = 3236 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3237 3238 // Check if we generated code that checks in runtime if arrays overlap. We put 3239 // the checks into a separate block to make the more common case of few 3240 // elements faster. 3241 if (!MemCheckBlock) 3242 return nullptr; 3243 3244 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3245 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3246 "Cannot emit memory checks when optimizing for size, unless forced " 3247 "to vectorize."); 3248 ORE->emit([&]() { 3249 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3250 L->getStartLoc(), L->getHeader()) 3251 << "Code-size may be reduced by not forcing " 3252 "vectorization, or by source-code modifications " 3253 "eliminating the need for runtime checks " 3254 "(e.g., adding 'restrict')."; 3255 }); 3256 } 3257 3258 LoopBypassBlocks.push_back(MemCheckBlock); 3259 3260 AddedSafetyChecks = true; 3261 3262 // We currently don't use LoopVersioning for the actual loop cloning but we 3263 // still use it to add the noalias metadata. 3264 LVer = std::make_unique<LoopVersioning>( 3265 *Legal->getLAI(), 3266 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3267 DT, PSE.getSE()); 3268 LVer->prepareNoAliasMetadata(); 3269 return MemCheckBlock; 3270 } 3271 3272 Value *InnerLoopVectorizer::emitTransformedIndex( 3273 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3274 const InductionDescriptor &ID) const { 3275 3276 SCEVExpander Exp(*SE, DL, "induction"); 3277 auto Step = ID.getStep(); 3278 auto StartValue = ID.getStartValue(); 3279 assert(Index->getType() == Step->getType() && 3280 "Index type does not match StepValue type"); 3281 3282 // Note: the IR at this point is broken. We cannot use SE to create any new 3283 // SCEV and then expand it, hoping that SCEV's simplification will give us 3284 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3285 // lead to various SCEV crashes. So all we can do is to use builder and rely 3286 // on InstCombine for future simplifications. Here we handle some trivial 3287 // cases only. 3288 auto CreateAdd = [&B](Value *X, Value *Y) { 3289 assert(X->getType() == Y->getType() && "Types don't match!"); 3290 if (auto *CX = dyn_cast<ConstantInt>(X)) 3291 if (CX->isZero()) 3292 return Y; 3293 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3294 if (CY->isZero()) 3295 return X; 3296 return B.CreateAdd(X, Y); 3297 }; 3298 3299 auto CreateMul = [&B](Value *X, Value *Y) { 3300 assert(X->getType() == Y->getType() && "Types don't match!"); 3301 if (auto *CX = dyn_cast<ConstantInt>(X)) 3302 if (CX->isOne()) 3303 return Y; 3304 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3305 if (CY->isOne()) 3306 return X; 3307 return B.CreateMul(X, Y); 3308 }; 3309 3310 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3311 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3312 // the DomTree is not kept up-to-date for additional blocks generated in the 3313 // vector loop. By using the header as insertion point, we guarantee that the 3314 // expanded instructions dominate all their uses. 
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  LoopExitBlock = OrigLoop->getUniqueExitBlock();
  assert(LoopExitBlock && "Must have an exit block");
  assert(LoopVectorPreHeader && "Invalid loop structure");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  // Set up branch from middle block to the exit and scalar preheader blocks.
  // completeLoopSkeleton will update the condition to use an iteration check,
  // if required to decide whether to execute the remainder.
  BranchInst *BrInst =
      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
3391 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3392 3393 // Create and register the new vector loop. 3394 Loop *Lp = LI->AllocateLoop(); 3395 Loop *ParentLoop = OrigLoop->getParentLoop(); 3396 3397 // Insert the new loop into the loop nest and register the new basic blocks 3398 // before calling any utilities such as SCEV that require valid LoopInfo. 3399 if (ParentLoop) { 3400 ParentLoop->addChildLoop(Lp); 3401 } else { 3402 LI->addTopLevelLoop(Lp); 3403 } 3404 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3405 return Lp; 3406 } 3407 3408 void InnerLoopVectorizer::createInductionResumeValues( 3409 Loop *L, Value *VectorTripCount, 3410 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3411 assert(VectorTripCount && L && "Expected valid arguments"); 3412 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3413 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3414 "Inconsistent information about additional bypass."); 3415 // We are going to resume the execution of the scalar loop. 3416 // Go over all of the induction variables that we found and fix the 3417 // PHIs that are left in the scalar version of the loop. 3418 // The starting values of PHI nodes depend on the counter of the last 3419 // iteration in the vectorized loop. 3420 // If we come from a bypass edge then we need to start from the original 3421 // start value. 3422 for (auto &InductionEntry : Legal->getInductionVars()) { 3423 PHINode *OrigPhi = InductionEntry.first; 3424 InductionDescriptor II = InductionEntry.second; 3425 3426 // Create phi nodes to merge from the backedge-taken check block. 3427 PHINode *BCResumeVal = 3428 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3429 LoopScalarPreHeader->getTerminator()); 3430 // Copy original phi DL over to the new one. 3431 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3432 Value *&EndValue = IVEndValues[OrigPhi]; 3433 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3434 if (OrigPhi == OldInduction) { 3435 // We know what the end value is. 3436 EndValue = VectorTripCount; 3437 } else { 3438 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3439 3440 // Fast-math-flags propagate from the original induction instruction. 3441 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3442 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3443 3444 Type *StepType = II.getStep()->getType(); 3445 Instruction::CastOps CastOp = 3446 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3447 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3448 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3449 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3450 EndValue->setName("ind.end"); 3451 3452 // Compute the end value for the additional bypass (if applicable). 3453 if (AdditionalBypass.first) { 3454 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3455 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3456 StepType, true); 3457 CRD = 3458 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3459 EndValueFromAdditionalBypass = 3460 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3461 EndValueFromAdditionalBypass->setName("ind.end"); 3462 } 3463 } 3464 // The new PHI merges the original incoming value, in case of a bypass, 3465 // or the value at the end of the vectorized loop. 
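    // For the primary induction variable (start 0, step 1) this produces,
    // roughly, a phi of the form
    //   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %bypass ]
    // (block names illustrative), so the scalar remainder loop resumes exactly
    // where the vector loop stopped.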
3466 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3467 3468 // Fix the scalar body counter (PHI node). 3469 // The old induction's phi node in the scalar body needs the truncated 3470 // value. 3471 for (BasicBlock *BB : LoopBypassBlocks) 3472 BCResumeVal->addIncoming(II.getStartValue(), BB); 3473 3474 if (AdditionalBypass.first) 3475 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3476 EndValueFromAdditionalBypass); 3477 3478 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3479 } 3480 } 3481 3482 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3483 MDNode *OrigLoopID) { 3484 assert(L && "Expected valid loop."); 3485 3486 // The trip counts should be cached by now. 3487 Value *Count = getOrCreateTripCount(L); 3488 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3489 3490 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3491 3492 // Add a check in the middle block to see if we have completed 3493 // all of the iterations in the first vector loop. 3494 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3495 // If tail is to be folded, we know we don't need to run the remainder. 3496 if (!Cost->foldTailByMasking()) { 3497 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3498 Count, VectorTripCount, "cmp.n", 3499 LoopMiddleBlock->getTerminator()); 3500 3501 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3502 // of the corresponding compare because they may have ended up with 3503 // different line numbers and we want to avoid awkward line stepping while 3504 // debugging. Eg. if the compare has got a line number inside the loop. 3505 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3506 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3507 } 3508 3509 // Get ready to start creating new instructions into the vectorized body. 3510 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3511 "Inconsistent vector loop preheader"); 3512 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3513 3514 Optional<MDNode *> VectorizedLoopID = 3515 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3516 LLVMLoopVectorizeFollowupVectorized}); 3517 if (VectorizedLoopID.hasValue()) { 3518 L->setLoopID(VectorizedLoopID.getValue()); 3519 3520 // Do not setAlreadyVectorized if loop attributes have been defined 3521 // explicitly. 3522 return LoopVectorPreHeader; 3523 } 3524 3525 // Keep all loop hints from the original loop on the vector loop (we'll 3526 // replace the vectorizer-specific hints below). 3527 if (MDNode *LID = OrigLoop->getLoopID()) 3528 L->setLoopID(LID); 3529 3530 LoopVectorizeHints Hints(L, true, *ORE); 3531 Hints.setAlreadyVectorized(); 3532 3533 #ifdef EXPENSIVE_CHECKS 3534 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3535 LI->verify(*DT); 3536 #endif 3537 3538 return LoopVectorPreHeader; 3539 } 3540 3541 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3542 /* 3543 In this function we generate a new loop. The new loop will contain 3544 the vectorized instructions while the old loop will continue to run the 3545 scalar remainder. 3546 3547 [ ] <-- loop iteration number check. 3548 / | 3549 / v 3550 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3551 | / | 3552 | / v 3553 || [ ] <-- vector pre header. 3554 |/ | 3555 | v 3556 | [ ] \ 3557 | [ ]_| <-- vector loop. 3558 | | 3559 | v 3560 | -[ ] <--- middle-block. 
3561 | / | 3562 | / v 3563 -|- >[ ] <--- new preheader. 3564 | | 3565 | v 3566 | [ ] \ 3567 | [ ]_| <-- old scalar loop to handle remainder. 3568 \ | 3569 \ v 3570 >[ ] <-- exit block. 3571 ... 3572 */ 3573 3574 // Get the metadata of the original loop before it gets modified. 3575 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3576 3577 // Create an empty vector loop, and prepare basic blocks for the runtime 3578 // checks. 3579 Loop *Lp = createVectorLoopSkeleton(""); 3580 3581 // Now, compare the new count to zero. If it is zero skip the vector loop and 3582 // jump to the scalar loop. This check also covers the case where the 3583 // backedge-taken count is uint##_max: adding one to it will overflow leading 3584 // to an incorrect trip count of zero. In this (rare) case we will also jump 3585 // to the scalar loop. 3586 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3587 3588 // Generate the code to check any assumptions that we've made for SCEV 3589 // expressions. 3590 emitSCEVChecks(Lp, LoopScalarPreHeader); 3591 3592 // Generate the code that checks in runtime if arrays overlap. We put the 3593 // checks into a separate block to make the more common case of few elements 3594 // faster. 3595 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3596 3597 // Some loops have a single integer induction variable, while other loops 3598 // don't. One example is c++ iterators that often have multiple pointer 3599 // induction variables. In the code below we also support a case where we 3600 // don't have a single induction variable. 3601 // 3602 // We try to obtain an induction variable from the original loop as hard 3603 // as possible. However if we don't find one that: 3604 // - is an integer 3605 // - counts from zero, stepping by one 3606 // - is the size of the widest induction variable type 3607 // then we create a new one. 3608 OldInduction = Legal->getPrimaryInduction(); 3609 Type *IdxTy = Legal->getWidestInductionType(); 3610 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3611 // The loop step is equal to the vectorization factor (num of SIMD elements) 3612 // times the unroll factor (num of SIMD instructions). 3613 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3614 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3615 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3616 Induction = 3617 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3618 getDebugLocFromInstOrOperands(OldInduction)); 3619 3620 // Emit phis for the new starting index of the scalar loop. 3621 createInductionResumeValues(Lp, CountRoundDown); 3622 3623 return completeLoopSkeleton(Lp, OrigLoopID); 3624 } 3625 3626 // Fix up external users of the induction variable. At this point, we are 3627 // in LCSSA form, with all external PHIs that use the IV having one input value, 3628 // coming from the remainder loop. We need those PHIs to also have a correct 3629 // value for the IV when arriving directly from the middle block. 3630 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3631 const InductionDescriptor &II, 3632 Value *CountRoundDown, Value *EndValue, 3633 BasicBlock *MiddleBlock) { 3634 // There are two kinds of external IV usages - those that use the value 3635 // computed in the last iteration (the PHI) and those that use the penultimate 3636 // value (the value that feeds into the phi from the loop latch). 3637 // We allow both, but they, obviously, have different values. 

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {

struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform common subexpression elimination (CSE) of induction variable
/// instructions.
3729 static void cse(BasicBlock *BB) { 3730 // Perform simple cse. 3731 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3732 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3733 Instruction *In = &*I++; 3734 3735 if (!CSEDenseMapInfo::canHandle(In)) 3736 continue; 3737 3738 // Check if we can replace this instruction with any of the 3739 // visited instructions. 3740 if (Instruction *V = CSEMap.lookup(In)) { 3741 In->replaceAllUsesWith(V); 3742 In->eraseFromParent(); 3743 continue; 3744 } 3745 3746 CSEMap[In] = In; 3747 } 3748 } 3749 3750 InstructionCost 3751 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3752 bool &NeedToScalarize) const { 3753 Function *F = CI->getCalledFunction(); 3754 Type *ScalarRetTy = CI->getType(); 3755 SmallVector<Type *, 4> Tys, ScalarTys; 3756 for (auto &ArgOp : CI->arg_operands()) 3757 ScalarTys.push_back(ArgOp->getType()); 3758 3759 // Estimate cost of scalarized vector call. The source operands are assumed 3760 // to be vectors, so we need to extract individual elements from there, 3761 // execute VF scalar calls, and then gather the result into the vector return 3762 // value. 3763 InstructionCost ScalarCallCost = 3764 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3765 if (VF.isScalar()) 3766 return ScalarCallCost; 3767 3768 // Compute corresponding vector type for return value and arguments. 3769 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3770 for (Type *ScalarTy : ScalarTys) 3771 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3772 3773 // Compute costs of unpacking argument values for the scalar calls and 3774 // packing the return values to a vector. 3775 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3776 3777 InstructionCost Cost = 3778 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3779 3780 // If we can't emit a vector call for this function, then the currently found 3781 // cost is the cost we need to return. 3782 NeedToScalarize = true; 3783 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3784 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3785 3786 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3787 return Cost; 3788 3789 // If the corresponding vector cost is cheaper, return its cost. 
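  // For instance (costs illustrative): with VF = 4, a scalar call cost of 10
  // and a scalarization overhead of 6, the scalarized estimate above is
  // 10 * 4 + 6 = 46; if the target provides a vector variant with cost 20, the
  // comparison below selects it and clears NeedToScalarize.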
3790 InstructionCost VectorCallCost = 3791 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3792 if (VectorCallCost < Cost) { 3793 NeedToScalarize = false; 3794 Cost = VectorCallCost; 3795 } 3796 return Cost; 3797 } 3798 3799 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3800 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3801 return Elt; 3802 return VectorType::get(Elt, VF); 3803 } 3804 3805 InstructionCost 3806 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3807 ElementCount VF) const { 3808 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3809 assert(ID && "Expected intrinsic call!"); 3810 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3811 FastMathFlags FMF; 3812 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3813 FMF = FPMO->getFastMathFlags(); 3814 3815 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3816 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3817 SmallVector<Type *> ParamTys; 3818 std::transform(FTy->param_begin(), FTy->param_end(), 3819 std::back_inserter(ParamTys), 3820 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3821 3822 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3823 dyn_cast<IntrinsicInst>(CI)); 3824 return TTI.getIntrinsicInstrCost(CostAttrs, 3825 TargetTransformInfo::TCK_RecipThroughput); 3826 } 3827 3828 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3829 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3830 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3831 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3832 } 3833 3834 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3835 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3836 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3837 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3838 } 3839 3840 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3841 // For every instruction `I` in MinBWs, truncate the operands, create a 3842 // truncated version of `I` and reextend its result. InstCombine runs 3843 // later and will remove any ext/trunc pairs. 3844 SmallPtrSet<Value *, 4> Erased; 3845 for (const auto &KV : Cost->getMinimalBitwidths()) { 3846 // If the value wasn't vectorized, we must maintain the original scalar 3847 // type. The absence of the value from State indicates that it 3848 // wasn't vectorized. 3849 VPValue *Def = State.Plan->getVPValue(KV.first); 3850 if (!State.hasAnyVectorValue(Def)) 3851 continue; 3852 for (unsigned Part = 0; Part < UF; ++Part) { 3853 Value *I = State.get(Def, Part); 3854 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3855 continue; 3856 Type *OriginalTy = I->getType(); 3857 Type *ScalarTruncatedTy = 3858 IntegerType::get(OriginalTy->getContext(), KV.second); 3859 auto *TruncatedTy = FixedVectorType::get( 3860 ScalarTruncatedTy, 3861 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3862 if (TruncatedTy == OriginalTy) 3863 continue; 3864 3865 IRBuilder<> B(cast<Instruction>(I)); 3866 auto ShrinkOperand = [&](Value *V) -> Value * { 3867 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3868 if (ZI->getSrcTy() == TruncatedTy) 3869 return ZI->getOperand(0); 3870 return B.CreateZExtOrTrunc(V, TruncatedTy); 3871 }; 3872 3873 // The actual instruction modification depends on the instruction type, 3874 // unfortunately. 
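      // For example, an i32 add whose minimal bit width is 8 is conceptually
      // rewritten as (VF elements per vector):
      //   %a.tr = trunc <VF x i32> %a to <VF x i8>
      //   %b.tr = trunc <VF x i32> %b to <VF x i8>
      //   %add  = add <VF x i8> %a.tr, %b.tr
      //   %res  = zext <VF x i8> %add to <VF x i32>
      // and InstCombine later cleans up redundant ext/trunc pairs.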
3875 Value *NewI = nullptr; 3876 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3877 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3878 ShrinkOperand(BO->getOperand(1))); 3879 3880 // Any wrapping introduced by shrinking this operation shouldn't be 3881 // considered undefined behavior. So, we can't unconditionally copy 3882 // arithmetic wrapping flags to NewI. 3883 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3884 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3885 NewI = 3886 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3887 ShrinkOperand(CI->getOperand(1))); 3888 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3889 NewI = B.CreateSelect(SI->getCondition(), 3890 ShrinkOperand(SI->getTrueValue()), 3891 ShrinkOperand(SI->getFalseValue())); 3892 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3893 switch (CI->getOpcode()) { 3894 default: 3895 llvm_unreachable("Unhandled cast!"); 3896 case Instruction::Trunc: 3897 NewI = ShrinkOperand(CI->getOperand(0)); 3898 break; 3899 case Instruction::SExt: 3900 NewI = B.CreateSExtOrTrunc( 3901 CI->getOperand(0), 3902 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3903 break; 3904 case Instruction::ZExt: 3905 NewI = B.CreateZExtOrTrunc( 3906 CI->getOperand(0), 3907 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3908 break; 3909 } 3910 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3911 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3912 ->getNumElements(); 3913 auto *O0 = B.CreateZExtOrTrunc( 3914 SI->getOperand(0), 3915 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3916 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3917 ->getNumElements(); 3918 auto *O1 = B.CreateZExtOrTrunc( 3919 SI->getOperand(1), 3920 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3921 3922 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3923 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3924 // Don't do anything with the operands, just extend the result. 3925 continue; 3926 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3927 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3928 ->getNumElements(); 3929 auto *O0 = B.CreateZExtOrTrunc( 3930 IE->getOperand(0), 3931 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3932 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3933 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3934 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3935 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3936 ->getNumElements(); 3937 auto *O0 = B.CreateZExtOrTrunc( 3938 EE->getOperand(0), 3939 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3940 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3941 } else { 3942 // If we don't know what to do, be conservative and don't do anything. 3943 continue; 3944 } 3945 3946 // Lastly, extend the result. 3947 NewI->takeName(cast<Instruction>(I)); 3948 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3949 I->replaceAllUsesWith(Res); 3950 cast<Instruction>(I)->eraseFromParent(); 3951 Erased.insert(I); 3952 State.reset(Def, Res, Part); 3953 } 3954 } 3955 3956 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3957 for (const auto &KV : Cost->getMinimalBitwidths()) { 3958 // If the value wasn't vectorized, we must maintain the original scalar 3959 // type. The absence of the value from State indicates that it 3960 // wasn't vectorized. 
    VPValue *Def = State.Plan->getVPValue(KV.first);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs(State);
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Fix-up external users of the induction variables.
  for (auto &Entry : Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs(State);
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored, assigning
  // all the weight to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
4038 if (Legal->isFirstOrderRecurrence(&Phi)) 4039 fixFirstOrderRecurrence(&Phi, State); 4040 else if (Legal->isReductionVariable(&Phi)) 4041 fixReduction(&Phi, State); 4042 } 4043 } 4044 4045 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4046 VPTransformState &State) { 4047 // This is the second phase of vectorizing first-order recurrences. An 4048 // overview of the transformation is described below. Suppose we have the 4049 // following loop. 4050 // 4051 // for (int i = 0; i < n; ++i) 4052 // b[i] = a[i] - a[i - 1]; 4053 // 4054 // There is a first-order recurrence on "a". For this loop, the shorthand 4055 // scalar IR looks like: 4056 // 4057 // scalar.ph: 4058 // s_init = a[-1] 4059 // br scalar.body 4060 // 4061 // scalar.body: 4062 // i = phi [0, scalar.ph], [i+1, scalar.body] 4063 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4064 // s2 = a[i] 4065 // b[i] = s2 - s1 4066 // br cond, scalar.body, ... 4067 // 4068 // In this example, s1 is a recurrence because it's value depends on the 4069 // previous iteration. In the first phase of vectorization, we created a 4070 // temporary value for s1. We now complete the vectorization and produce the 4071 // shorthand vector IR shown below (for VF = 4, UF = 1). 4072 // 4073 // vector.ph: 4074 // v_init = vector(..., ..., ..., a[-1]) 4075 // br vector.body 4076 // 4077 // vector.body 4078 // i = phi [0, vector.ph], [i+4, vector.body] 4079 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4080 // v2 = a[i, i+1, i+2, i+3]; 4081 // v3 = vector(v1(3), v2(0, 1, 2)) 4082 // b[i, i+1, i+2, i+3] = v2 - v3 4083 // br cond, vector.body, middle.block 4084 // 4085 // middle.block: 4086 // x = v2(3) 4087 // br scalar.ph 4088 // 4089 // scalar.ph: 4090 // s_init = phi [x, middle.block], [a[-1], otherwise] 4091 // br scalar.body 4092 // 4093 // After execution completes the vector loop, we extract the next value of 4094 // the recurrence (x) to use as the initial value in the scalar loop. 4095 4096 // Get the original loop preheader and single loop latch. 4097 auto *Preheader = OrigLoop->getLoopPreheader(); 4098 auto *Latch = OrigLoop->getLoopLatch(); 4099 4100 // Get the initial and previous values of the scalar recurrence. 4101 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4102 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4103 4104 // Create a vector from the initial value. 4105 auto *VectorInit = ScalarInit; 4106 if (VF.isVector()) { 4107 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4108 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4109 VectorInit = Builder.CreateInsertElement( 4110 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4111 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4112 } 4113 4114 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4115 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4116 // We constructed a temporary phi node in the first phase of vectorization. 4117 // This phi node will eventually be deleted. 4118 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4119 4120 // Create a phi node for the new recurrence. The current value will either be 4121 // the initial value inserted into a vector or loop-varying vector value. 4122 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4123 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4124 4125 // Get the vectorized previous value of the last part UF - 1. 
It appears last 4126 // among all unrolled iterations, due to the order of their construction. 4127 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4128 4129 // Find and set the insertion point after the previous value if it is an 4130 // instruction. 4131 BasicBlock::iterator InsertPt; 4132 // Note that the previous value may have been constant-folded so it is not 4133 // guaranteed to be an instruction in the vector loop. 4134 // FIXME: Loop invariant values do not form recurrences. We should deal with 4135 // them earlier. 4136 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4137 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4138 else { 4139 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4140 if (isa<PHINode>(PreviousLastPart)) 4141 // If the previous value is a phi node, we should insert after all the phi 4142 // nodes in the block containing the PHI to avoid breaking basic block 4143 // verification. Note that the basic block may be different to 4144 // LoopVectorBody, in case we predicate the loop. 4145 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4146 else 4147 InsertPt = ++PreviousInst->getIterator(); 4148 } 4149 Builder.SetInsertPoint(&*InsertPt); 4150 4151 // We will construct a vector for the recurrence by combining the values for 4152 // the current and previous iterations. This is the required shuffle mask. 4153 assert(!VF.isScalable()); 4154 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4155 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4156 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4157 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4158 4159 // The vector from which to take the initial value for the current iteration 4160 // (actual or unrolled). Initially, this is the vector phi node. 4161 Value *Incoming = VecPhi; 4162 4163 // Shuffle the current and previous vector and update the vector parts. 4164 for (unsigned Part = 0; Part < UF; ++Part) { 4165 Value *PreviousPart = State.get(PreviousDef, Part); 4166 Value *PhiPart = State.get(PhiDef, Part); 4167 auto *Shuffle = 4168 VF.isVector() 4169 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4170 : Incoming; 4171 PhiPart->replaceAllUsesWith(Shuffle); 4172 cast<Instruction>(PhiPart)->eraseFromParent(); 4173 State.reset(PhiDef, Shuffle, Part); 4174 Incoming = PreviousPart; 4175 } 4176 4177 // Fix the latch value of the new recurrence in the vector loop. 4178 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4179 4180 // Extract the last vector element in the middle block. This will be the 4181 // initial value for the recurrence when jumping to the scalar loop. 4182 auto *ExtractForScalar = Incoming; 4183 if (VF.isVector()) { 4184 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4185 ExtractForScalar = Builder.CreateExtractElement( 4186 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4187 "vector.recur.extract"); 4188 } 4189 // Extract the second last element in the middle block if the 4190 // Phi is used outside the loop. We need to extract the phi itself 4191 // and not the last element (the phi update in the current iteration). This 4192 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4193 // when the scalar loop is not run at all. 
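  // For example, with VF = 4 and the loop sketched above, the last unrolled
  // part of the vectorized "previous" value holds <a[n-4], a[n-3], a[n-2],
  // a[n-1]> (a sketch that assumes the vector loop covers all n iterations):
  // lane VF-1 seeds the scalar loop, while lane VF-2 is the value the phi
  // itself had in the final vector iteration, which is what a user of the phi
  // outside the loop must observe.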
4194   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4195   if (VF.isVector())
4196     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4197         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4198         "vector.recur.extract.for.phi");
4199   // When the loop is unrolled without vectorizing, initialize
4200   // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
4201   // value of `Incoming`. This is analogous to the vectorized case above:
4202   // extracting the second-to-last element when VF > 1.
4203   else if (UF > 1)
4204     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4205 
4206   // Fix the initial value of the original recurrence in the scalar loop.
4207   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4208   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4209   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4210     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4211     Start->addIncoming(Incoming, BB);
4212   }
4213 
4214   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4215   Phi->setName("scalar.recur");
4216 
4217   // Finally, fix users of the recurrence outside the loop. The users will need
4218   // either the last value of the scalar recurrence or the last value of the
4219   // vector recurrence we extracted in the middle block. Since the loop is in
4220   // LCSSA form, we just need to find all the phi nodes for the original scalar
4221   // recurrence in the exit block, and then add an edge for the middle block.
4222   // Note that LCSSA does not imply single entry when the original scalar loop
4223   // had multiple exiting edges (as we always run the last iteration in the
4224   // scalar epilogue); in that case, the exiting path through middle will be
4225   // dynamically dead and the value picked for the phi doesn't matter.
4226   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4227     if (any_of(LCSSAPhi.incoming_values(),
4228                [Phi](Value *V) { return V == Phi; }))
4229       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4230 }
4231 
4232 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
4233   // Get its reduction variable descriptor.
4234   assert(Legal->isReductionVariable(Phi) &&
4235          "Unable to find the reduction variable");
4236   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4237 
4238   RecurKind RK = RdxDesc.getRecurrenceKind();
4239   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4240   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4241   setDebugLocFromInst(Builder, ReductionStartValue);
4242   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4243 
4244   VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4245   // This is the vector-clone of the value that leaves the loop.
4246   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4247 
4248   // Wrap flags are in general invalid after vectorization, so clear them.
4249   clearReductionWrapFlags(RdxDesc, State);
4250 
4251   // Fix the vector-loop phi.
4252 
4253   // Reductions do not have to start at zero. They can start with
4254   // any loop-invariant value.
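  // For example (a shorthand sketch for VF = 4, UF = 1, not the exact IR):
  //   s = init; for (i = 0; i < n; ++i) s += a[i];
  // is vectorized as a phi seeded with <init, 0, 0, 0> in the preheader, a
  // wide add in the vector body, and a horizontal reduction of the final
  // vector value in the middle block (see below).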
4255 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4256 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4257 4258 for (unsigned Part = 0; Part < UF; ++Part) { 4259 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part); 4260 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part); 4261 cast<PHINode>(VecRdxPhi) 4262 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4263 } 4264 4265 // Before each round, move the insertion point right between 4266 // the PHIs and the values we are going to write. 4267 // This allows us to write both PHINodes and the extractelement 4268 // instructions. 4269 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4270 4271 setDebugLocFromInst(Builder, LoopExitInst); 4272 4273 Type *PhiTy = Phi->getType(); 4274 // If tail is folded by masking, the vector value to leave the loop should be 4275 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4276 // instead of the former. For an inloop reduction the reduction will already 4277 // be predicated, and does not need to be handled here. 4278 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4279 for (unsigned Part = 0; Part < UF; ++Part) { 4280 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4281 Value *Sel = nullptr; 4282 for (User *U : VecLoopExitInst->users()) { 4283 if (isa<SelectInst>(U)) { 4284 assert(!Sel && "Reduction exit feeding two selects"); 4285 Sel = U; 4286 } else 4287 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4288 } 4289 assert(Sel && "Reduction exit feeds no select"); 4290 State.reset(LoopExitInstDef, Sel, Part); 4291 4292 // If the target can create a predicated operator for the reduction at no 4293 // extra cost in the loop (for example a predicated vadd), it can be 4294 // cheaper for the select to remain in the loop than be sunk out of it, 4295 // and so use the select value for the phi instead of the old 4296 // LoopExitValue. 4297 if (PreferPredicatedReductionSelect || 4298 TTI->preferPredicatedReductionSelect( 4299 RdxDesc.getOpcode(), PhiTy, 4300 TargetTransformInfo::ReductionFlags())) { 4301 auto *VecRdxPhi = 4302 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part)); 4303 VecRdxPhi->setIncomingValueForBlock( 4304 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4305 } 4306 } 4307 } 4308 4309 // If the vector reduction can be performed in a smaller type, we truncate 4310 // then extend the loop exit value to enable InstCombine to evaluate the 4311 // entire expression in the smaller type. 4312 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4313 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4314 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4315 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4316 Builder.SetInsertPoint( 4317 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4318 VectorParts RdxParts(UF); 4319 for (unsigned Part = 0; Part < UF; ++Part) { 4320 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4321 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4322 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 4323 : Builder.CreateZExt(Trunc, VecTy); 4324 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4325 UI != RdxParts[Part]->user_end();) 4326 if (*UI != Trunc) { 4327 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4328 RdxParts[Part] = Extnd; 4329 } else { 4330 ++UI; 4331 } 4332 } 4333 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4334 for (unsigned Part = 0; Part < UF; ++Part) { 4335 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4336 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4337 } 4338 } 4339 4340 // Reduce all of the unrolled parts into a single vector. 4341 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4342 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4343 4344 // The middle block terminator has already been assigned a DebugLoc here (the 4345 // OrigLoop's single latch terminator). We want the whole middle block to 4346 // appear to execute on this line because: (a) it is all compiler generated, 4347 // (b) these instructions are always executed after evaluating the latch 4348 // conditional branch, and (c) other passes may add new predecessors which 4349 // terminate on this line. This is the easiest way to ensure we don't 4350 // accidentally cause an extra step back into the loop while debugging. 4351 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4352 { 4353 // Floating-point operations should have some FMF to enable the reduction. 4354 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4355 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4356 for (unsigned Part = 1; Part < UF; ++Part) { 4357 Value *RdxPart = State.get(LoopExitInstDef, Part); 4358 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4359 ReducedPartRdx = Builder.CreateBinOp( 4360 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4361 } else { 4362 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4363 } 4364 } 4365 } 4366 4367 // Create the reduction after the loop. Note that inloop reductions create the 4368 // target reduction in the loop using a Reduction recipe. 4369 if (VF.isVector() && !IsInLoopReductionPhi) { 4370 ReducedPartRdx = 4371 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4372 // If the reduction can be performed in a smaller type, we need to extend 4373 // the reduction to the wider type before we branch to the original loop. 4374 if (PhiTy != RdxDesc.getRecurrenceType()) 4375 ReducedPartRdx = RdxDesc.isSigned() 4376 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4377 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4378 } 4379 4380 // Create a phi node that merges control-flow from the backedge-taken check 4381 // block and the middle block. 4382 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4383 LoopScalarPreHeader->getTerminator()); 4384 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4385 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4386 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4387 4388 // Now, we need to fix the users of the reduction variable 4389 // inside and outside of the scalar remainder loop. 4390 4391 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4392 // in the exit blocks. See comment on analogous loop in 4393 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
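  // For example (a sketch with hypothetical names): an exit-block phi such as
  //   %sum.lcssa = phi i32 [ %sum.next, %loop.latch ]
  // receives an additional incoming value [ ReducedPartRdx, %middle.block ],
  // so the reduced result is used whenever the vector loop's exit is taken.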
4394 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4395 if (any_of(LCSSAPhi.incoming_values(), 4396 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4397 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4398 4399 // Fix the scalar loop reduction variable with the incoming reduction sum 4400 // from the vector body and from the backedge value. 4401 int IncomingEdgeBlockIdx = 4402 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4403 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4404 // Pick the other block. 4405 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4406 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4407 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4408 } 4409 4410 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4411 VPTransformState &State) { 4412 RecurKind RK = RdxDesc.getRecurrenceKind(); 4413 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4414 return; 4415 4416 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4417 assert(LoopExitInstr && "null loop exit instruction"); 4418 SmallVector<Instruction *, 8> Worklist; 4419 SmallPtrSet<Instruction *, 8> Visited; 4420 Worklist.push_back(LoopExitInstr); 4421 Visited.insert(LoopExitInstr); 4422 4423 while (!Worklist.empty()) { 4424 Instruction *Cur = Worklist.pop_back_val(); 4425 if (isa<OverflowingBinaryOperator>(Cur)) 4426 for (unsigned Part = 0; Part < UF; ++Part) { 4427 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4428 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4429 } 4430 4431 for (User *U : Cur->users()) { 4432 Instruction *UI = cast<Instruction>(U); 4433 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4434 Visited.insert(UI).second) 4435 Worklist.push_back(UI); 4436 } 4437 } 4438 } 4439 4440 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4441 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4442 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4443 // Some phis were already hand updated by the reduction and recurrence 4444 // code above, leave them alone. 4445 continue; 4446 4447 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4448 // Non-instruction incoming values will have only one value. 4449 4450 VPLane Lane = VPLane::getFirstLane(); 4451 if (isa<Instruction>(IncomingValue) && 4452 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4453 VF)) 4454 Lane = VPLane::getLastLaneForVF(VF); 4455 4456 // Can be a loop invariant incoming value or the last scalar value to be 4457 // extracted from the vectorized loop. 4458 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4459 Value *lastIncomingValue = 4460 OrigLoop->isLoopInvariant(IncomingValue) 4461 ? IncomingValue 4462 : State.get(State.Plan->getVPValue(IncomingValue), 4463 VPIteration(UF - 1, Lane)); 4464 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4465 } 4466 } 4467 4468 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4469 // The basic block and loop containing the predicated instruction. 4470 auto *PredBB = PredInst->getParent(); 4471 auto *VectorLoop = LI->getLoopFor(PredBB); 4472 4473 // Initialize a worklist with the operands of the predicated instruction. 4474 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4475 4476 // Holds instructions that we need to analyze again. An instruction may be 4477 // reanalyzed if we don't yet know if we can sink it or not. 
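  // For example (a hypothetical sketch): a scalarized address computation used
  // only by a predicated store can be sunk into the store's block; one of its
  // own operands may not be sinkable on the current pass, but can become
  // sinkable once all of its users have been sunk, so it is queued here and
  // retried.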
4478 SmallVector<Instruction *, 8> InstsToReanalyze; 4479 4480 // Returns true if a given use occurs in the predicated block. Phi nodes use 4481 // their operands in their corresponding predecessor blocks. 4482 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4483 auto *I = cast<Instruction>(U.getUser()); 4484 BasicBlock *BB = I->getParent(); 4485 if (auto *Phi = dyn_cast<PHINode>(I)) 4486 BB = Phi->getIncomingBlock( 4487 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4488 return BB == PredBB; 4489 }; 4490 4491 // Iteratively sink the scalarized operands of the predicated instruction 4492 // into the block we created for it. When an instruction is sunk, it's 4493 // operands are then added to the worklist. The algorithm ends after one pass 4494 // through the worklist doesn't sink a single instruction. 4495 bool Changed; 4496 do { 4497 // Add the instructions that need to be reanalyzed to the worklist, and 4498 // reset the changed indicator. 4499 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4500 InstsToReanalyze.clear(); 4501 Changed = false; 4502 4503 while (!Worklist.empty()) { 4504 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4505 4506 // We can't sink an instruction if it is a phi node, is already in the 4507 // predicated block, is not in the loop, or may have side effects. 4508 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4509 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4510 continue; 4511 4512 // It's legal to sink the instruction if all its uses occur in the 4513 // predicated block. Otherwise, there's nothing to do yet, and we may 4514 // need to reanalyze the instruction. 4515 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4516 InstsToReanalyze.push_back(I); 4517 continue; 4518 } 4519 4520 // Move the instruction to the beginning of the predicated block, and add 4521 // it's operands to the worklist. 4522 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4523 Worklist.insert(I->op_begin(), I->op_end()); 4524 4525 // The sinking may have enabled other instructions to be sunk, so we will 4526 // need to iterate. 4527 Changed = true; 4528 } 4529 } while (Changed); 4530 } 4531 4532 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4533 for (PHINode *OrigPhi : OrigPHIsToFix) { 4534 VPWidenPHIRecipe *VPPhi = 4535 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4536 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4537 // Make sure the builder has a valid insert point. 4538 Builder.SetInsertPoint(NewPhi); 4539 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4540 VPValue *Inc = VPPhi->getIncomingValue(i); 4541 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4542 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4543 } 4544 } 4545 } 4546 4547 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4548 VPUser &Operands, unsigned UF, 4549 ElementCount VF, bool IsPtrLoopInvariant, 4550 SmallBitVector &IsIndexLoopInvariant, 4551 VPTransformState &State) { 4552 // Construct a vector GEP by widening the operands of the scalar GEP as 4553 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4554 // results in a vector of pointers when at least one operand of the GEP 4555 // is vector-typed. Thus, to keep the representation compact, we only use 4556 // vector-typed operands for loop-varying values. 
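  // For example (a sketch, VF = 4): for "getelementptr %base, %i" with a
  // loop-invariant %base and a loop-varying index %i, the pointer operand is
  // kept scalar and only the index is replaced by its widened <4 x i64>
  // value, yielding a single GEP that produces a vector of four pointers.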
4557 4558 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4559 // If we are vectorizing, but the GEP has only loop-invariant operands, 4560 // the GEP we build (by only using vector-typed operands for 4561 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4562 // produce a vector of pointers, we need to either arbitrarily pick an 4563 // operand to broadcast, or broadcast a clone of the original GEP. 4564 // Here, we broadcast a clone of the original. 4565 // 4566 // TODO: If at some point we decide to scalarize instructions having 4567 // loop-invariant operands, this special case will no longer be 4568 // required. We would add the scalarization decision to 4569 // collectLoopScalars() and teach getVectorValue() to broadcast 4570 // the lane-zero scalar value. 4571 auto *Clone = Builder.Insert(GEP->clone()); 4572 for (unsigned Part = 0; Part < UF; ++Part) { 4573 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4574 State.set(VPDef, EntryPart, Part); 4575 addMetadata(EntryPart, GEP); 4576 } 4577 } else { 4578 // If the GEP has at least one loop-varying operand, we are sure to 4579 // produce a vector of pointers. But if we are only unrolling, we want 4580 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4581 // produce with the code below will be scalar (if VF == 1) or vector 4582 // (otherwise). Note that for the unroll-only case, we still maintain 4583 // values in the vector mapping with initVector, as we do for other 4584 // instructions. 4585 for (unsigned Part = 0; Part < UF; ++Part) { 4586 // The pointer operand of the new GEP. If it's loop-invariant, we 4587 // won't broadcast it. 4588 auto *Ptr = IsPtrLoopInvariant 4589 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4590 : State.get(Operands.getOperand(0), Part); 4591 4592 // Collect all the indices for the new GEP. If any index is 4593 // loop-invariant, we won't broadcast it. 4594 SmallVector<Value *, 4> Indices; 4595 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4596 VPValue *Operand = Operands.getOperand(I); 4597 if (IsIndexLoopInvariant[I - 1]) 4598 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4599 else 4600 Indices.push_back(State.get(Operand, Part)); 4601 } 4602 4603 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4604 // but it should be a vector, otherwise. 4605 auto *NewGEP = 4606 GEP->isInBounds() 4607 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4608 Indices) 4609 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4610 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4611 "NewGEP is not a pointer vector"); 4612 State.set(VPDef, NewGEP, Part); 4613 addMetadata(NewGEP, GEP); 4614 } 4615 } 4616 } 4617 4618 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4619 RecurrenceDescriptor *RdxDesc, 4620 VPValue *StartVPV, VPValue *Def, 4621 VPTransformState &State) { 4622 PHINode *P = cast<PHINode>(PN); 4623 if (EnableVPlanNativePath) { 4624 // Currently we enter here in the VPlan-native path for non-induction 4625 // PHIs where all control flow is uniform. We simply widen these PHIs. 4626 // Create a vector phi with no operands - the vector phi operands will be 4627 // set at the end of vector code generation. 4628 Type *VecTy = (State.VF.isScalar()) 4629 ? 
PN->getType()
4630                      : VectorType::get(PN->getType(), State.VF);
4631     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4632     State.set(Def, VecPhi, 0);
4633     OrigPHIsToFix.push_back(P);
4634 
4635     return;
4636   }
4637 
4638   assert(PN->getParent() == OrigLoop->getHeader() &&
4639          "Non-header phis should have been handled elsewhere");
4640 
4641   Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4642   // In order to support recurrences we need to be able to vectorize Phi nodes.
4643   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4644   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4645   // this value when we vectorize all of the instructions that use the PHI.
4646   if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4647     Value *Iden = nullptr;
4648     bool ScalarPHI =
4649         (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4650     Type *VecTy =
4651         ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4652 
4653     if (RdxDesc) {
4654       assert(Legal->isReductionVariable(P) && StartV &&
4655              "RdxDesc should only be set for reduction variables; in that case "
4656              "a StartV is also required");
4657       RecurKind RK = RdxDesc->getRecurrenceKind();
4658       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4659         // MinMax reductions have the start value as their identity.
4660         if (ScalarPHI) {
4661           Iden = StartV;
4662         } else {
4663           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4664           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4665           StartV = Iden =
4666               Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4667         }
4668       } else {
4669         Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4670             RK, VecTy->getScalarType());
4671         Iden = IdenC;
4672 
4673         if (!ScalarPHI) {
4674           Iden = ConstantVector::getSplat(State.VF, IdenC);
4675           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4676           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4677           Constant *Zero = Builder.getInt32(0);
4678           StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4679         }
4680       }
4681     }
4682 
4683     for (unsigned Part = 0; Part < State.UF; ++Part) {
4684       // This is phase one of vectorizing PHIs.
4685       Value *EntryPart = PHINode::Create(
4686           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4687       State.set(Def, EntryPart, Part);
4688       if (StartV) {
4689         // Make sure to add the reduction start value only to the
4690         // first unroll part.
4691         Value *StartVal = (Part == 0) ? StartV : Iden;
4692         cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4693       }
4694     }
4695     return;
4696   }
4697 
4698   assert(!Legal->isReductionVariable(P) &&
4699          "reductions should be handled above");
4700 
4701   setDebugLocFromInst(Builder, P);
4702 
4703   // This PHINode must be an induction variable.
4704   // Make sure that we know about it.
4705   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4706 
4707   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4708   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4709 
4710   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4711   // which can be found from the original scalar operations.
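  // For example (a sketch, VF = 4, UF = 1): a pointer induction that is
  // scalarized below produces one "next.gep" per lane (base + i, base + i + 1,
  // ...), whereas the widened form keeps a single pointer phi and adds a
  // per-part offset vector such as <0, 1, 2, 3> * Step to it.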
4712 switch (II.getKind()) { 4713 case InductionDescriptor::IK_NoInduction: 4714 llvm_unreachable("Unknown induction"); 4715 case InductionDescriptor::IK_IntInduction: 4716 case InductionDescriptor::IK_FpInduction: 4717 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4718 case InductionDescriptor::IK_PtrInduction: { 4719 // Handle the pointer induction variable case. 4720 assert(P->getType()->isPointerTy() && "Unexpected type."); 4721 4722 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4723 // This is the normalized GEP that starts counting at zero. 4724 Value *PtrInd = 4725 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4726 // Determine the number of scalars we need to generate for each unroll 4727 // iteration. If the instruction is uniform, we only need to generate the 4728 // first lane. Otherwise, we generate all VF values. 4729 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF) 4730 ? 1 4731 : State.VF.getKnownMinValue(); 4732 for (unsigned Part = 0; Part < UF; ++Part) { 4733 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4734 Constant *Idx = ConstantInt::get( 4735 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue()); 4736 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4737 Value *SclrGep = 4738 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4739 SclrGep->setName("next.gep"); 4740 State.set(Def, SclrGep, VPIteration(Part, Lane)); 4741 } 4742 } 4743 return; 4744 } 4745 assert(isa<SCEVConstant>(II.getStep()) && 4746 "Induction step not a SCEV constant!"); 4747 Type *PhiType = II.getStep()->getType(); 4748 4749 // Build a pointer phi 4750 Value *ScalarStartValue = II.getStartValue(); 4751 Type *ScStValueType = ScalarStartValue->getType(); 4752 PHINode *NewPointerPhi = 4753 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4754 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4755 4756 // A pointer induction, performed by using a gep 4757 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4758 Instruction *InductionLoc = LoopLatch->getTerminator(); 4759 const SCEV *ScalarStep = II.getStep(); 4760 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4761 Value *ScalarStepValue = 4762 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4763 Value *InductionGEP = GetElementPtrInst::Create( 4764 ScStValueType->getPointerElementType(), NewPointerPhi, 4765 Builder.CreateMul( 4766 ScalarStepValue, 4767 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)), 4768 "ptr.ind", InductionLoc); 4769 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4770 4771 // Create UF many actual address geps that use the pointer 4772 // phi as base and a vectorized version of the step value 4773 // (<step*0, ..., step*N>) as offset. 4774 for (unsigned Part = 0; Part < State.UF; ++Part) { 4775 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4776 Value *StartOffset = 4777 ConstantInt::get(VecPhiType, Part * State.VF.getKnownMinValue()); 4778 // Create a vector of consecutive numbers from zero to VF. 
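      // For example (a sketch), with VF = 4 and Part = 1 this yields
      // <4, 4, 4, 4> + <0, 1, 2, 3> = <4, 5, 6, 7>.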
4779 StartOffset = 4780 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4781 4782 Value *GEP = Builder.CreateGEP( 4783 ScStValueType->getPointerElementType(), NewPointerPhi, 4784 Builder.CreateMul(StartOffset, 4785 Builder.CreateVectorSplat( 4786 State.VF.getKnownMinValue(), ScalarStepValue), 4787 "vector.gep")); 4788 State.set(Def, GEP, Part); 4789 } 4790 } 4791 } 4792 } 4793 4794 /// A helper function for checking whether an integer division-related 4795 /// instruction may divide by zero (in which case it must be predicated if 4796 /// executed conditionally in the scalar code). 4797 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4798 /// Non-zero divisors that are non compile-time constants will not be 4799 /// converted into multiplication, so we will still end up scalarizing 4800 /// the division, but can do so w/o predication. 4801 static bool mayDivideByZero(Instruction &I) { 4802 assert((I.getOpcode() == Instruction::UDiv || 4803 I.getOpcode() == Instruction::SDiv || 4804 I.getOpcode() == Instruction::URem || 4805 I.getOpcode() == Instruction::SRem) && 4806 "Unexpected instruction"); 4807 Value *Divisor = I.getOperand(1); 4808 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4809 return !CInt || CInt->isZero(); 4810 } 4811 4812 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4813 VPUser &User, 4814 VPTransformState &State) { 4815 switch (I.getOpcode()) { 4816 case Instruction::Call: 4817 case Instruction::Br: 4818 case Instruction::PHI: 4819 case Instruction::GetElementPtr: 4820 case Instruction::Select: 4821 llvm_unreachable("This instruction is handled by a different recipe."); 4822 case Instruction::UDiv: 4823 case Instruction::SDiv: 4824 case Instruction::SRem: 4825 case Instruction::URem: 4826 case Instruction::Add: 4827 case Instruction::FAdd: 4828 case Instruction::Sub: 4829 case Instruction::FSub: 4830 case Instruction::FNeg: 4831 case Instruction::Mul: 4832 case Instruction::FMul: 4833 case Instruction::FDiv: 4834 case Instruction::FRem: 4835 case Instruction::Shl: 4836 case Instruction::LShr: 4837 case Instruction::AShr: 4838 case Instruction::And: 4839 case Instruction::Or: 4840 case Instruction::Xor: { 4841 // Just widen unops and binops. 4842 setDebugLocFromInst(Builder, &I); 4843 4844 for (unsigned Part = 0; Part < UF; ++Part) { 4845 SmallVector<Value *, 2> Ops; 4846 for (VPValue *VPOp : User.operands()) 4847 Ops.push_back(State.get(VPOp, Part)); 4848 4849 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4850 4851 if (auto *VecOp = dyn_cast<Instruction>(V)) 4852 VecOp->copyIRFlags(&I); 4853 4854 // Use this vector value for all users of the original instruction. 4855 State.set(Def, V, Part); 4856 addMetadata(V, &I); 4857 } 4858 4859 break; 4860 } 4861 case Instruction::ICmp: 4862 case Instruction::FCmp: { 4863 // Widen compares. Generate vector compares. 4864 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4865 auto *Cmp = cast<CmpInst>(&I); 4866 setDebugLocFromInst(Builder, Cmp); 4867 for (unsigned Part = 0; Part < UF; ++Part) { 4868 Value *A = State.get(User.getOperand(0), Part); 4869 Value *B = State.get(User.getOperand(1), Part); 4870 Value *C = nullptr; 4871 if (FCmp) { 4872 // Propagate fast math flags. 
4873 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4874 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4875 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4876 } else { 4877 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4878 } 4879 State.set(Def, C, Part); 4880 addMetadata(C, &I); 4881 } 4882 4883 break; 4884 } 4885 4886 case Instruction::ZExt: 4887 case Instruction::SExt: 4888 case Instruction::FPToUI: 4889 case Instruction::FPToSI: 4890 case Instruction::FPExt: 4891 case Instruction::PtrToInt: 4892 case Instruction::IntToPtr: 4893 case Instruction::SIToFP: 4894 case Instruction::UIToFP: 4895 case Instruction::Trunc: 4896 case Instruction::FPTrunc: 4897 case Instruction::BitCast: { 4898 auto *CI = cast<CastInst>(&I); 4899 setDebugLocFromInst(Builder, CI); 4900 4901 /// Vectorize casts. 4902 Type *DestTy = 4903 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4904 4905 for (unsigned Part = 0; Part < UF; ++Part) { 4906 Value *A = State.get(User.getOperand(0), Part); 4907 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4908 State.set(Def, Cast, Part); 4909 addMetadata(Cast, &I); 4910 } 4911 break; 4912 } 4913 default: 4914 // This instruction is not vectorized by simple widening. 4915 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4916 llvm_unreachable("Unhandled instruction!"); 4917 } // end of switch. 4918 } 4919 4920 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4921 VPUser &ArgOperands, 4922 VPTransformState &State) { 4923 assert(!isa<DbgInfoIntrinsic>(I) && 4924 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4925 setDebugLocFromInst(Builder, &I); 4926 4927 Module *M = I.getParent()->getParent()->getParent(); 4928 auto *CI = cast<CallInst>(&I); 4929 4930 SmallVector<Type *, 4> Tys; 4931 for (Value *ArgOperand : CI->arg_operands()) 4932 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4933 4934 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4935 4936 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4937 // version of the instruction. 4938 // Is it beneficial to perform intrinsic call compared to lib call? 4939 bool NeedToScalarize = false; 4940 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4941 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4942 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4943 assert((UseVectorIntrinsic || !NeedToScalarize) && 4944 "Instruction should be scalarized elsewhere."); 4945 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4946 "Cannot have invalid costs while widening"); 4947 4948 for (unsigned Part = 0; Part < UF; ++Part) { 4949 SmallVector<Value *, 4> Args; 4950 for (auto &I : enumerate(ArgOperands.operands())) { 4951 // Some intrinsics have a scalar argument - don't replace it with a 4952 // vector. 4953 Value *Arg; 4954 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4955 Arg = State.get(I.value(), Part); 4956 else 4957 Arg = State.get(I.value(), VPIteration(0, 0)); 4958 Args.push_back(Arg); 4959 } 4960 4961 Function *VectorF; 4962 if (UseVectorIntrinsic) { 4963 // Use vector version of the intrinsic. 
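      // For example (a sketch): with VF = 4, a call to llvm.sqrt.f32 is
      // rewritten to call llvm.sqrt.v4f32; the overload is selected via the
      // widened return type computed below.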
4964 Type *TysForDecl[] = {CI->getType()}; 4965 if (VF.isVector()) 4966 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4967 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4968 assert(VectorF && "Can't retrieve vector intrinsic."); 4969 } else { 4970 // Use vector version of the function call. 4971 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4972 #ifndef NDEBUG 4973 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4974 "Can't create vector function."); 4975 #endif 4976 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4977 } 4978 SmallVector<OperandBundleDef, 1> OpBundles; 4979 CI->getOperandBundlesAsDefs(OpBundles); 4980 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4981 4982 if (isa<FPMathOperator>(V)) 4983 V->copyFastMathFlags(CI); 4984 4985 State.set(Def, V, Part); 4986 addMetadata(V, &I); 4987 } 4988 } 4989 4990 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4991 VPUser &Operands, 4992 bool InvariantCond, 4993 VPTransformState &State) { 4994 setDebugLocFromInst(Builder, &I); 4995 4996 // The condition can be loop invariant but still defined inside the 4997 // loop. This means that we can't just use the original 'cond' value. 4998 // We have to take the 'vectorized' value and pick the first lane. 4999 // Instcombine will make this a no-op. 5000 auto *InvarCond = InvariantCond 5001 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5002 : nullptr; 5003 5004 for (unsigned Part = 0; Part < UF; ++Part) { 5005 Value *Cond = 5006 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5007 Value *Op0 = State.get(Operands.getOperand(1), Part); 5008 Value *Op1 = State.get(Operands.getOperand(2), Part); 5009 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5010 State.set(VPDef, Sel, Part); 5011 addMetadata(Sel, &I); 5012 } 5013 } 5014 5015 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5016 // We should not collect Scalars more than once per VF. Right now, this 5017 // function is called from collectUniformsAndScalars(), which already does 5018 // this check. Collecting Scalars for VF=1 does not make any sense. 5019 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5020 "This function should not be visited twice for the same VF"); 5021 5022 SmallSetVector<Instruction *, 8> Worklist; 5023 5024 // These sets are used to seed the analysis with pointers used by memory 5025 // accesses that will remain scalar. 5026 SmallSetVector<Instruction *, 8> ScalarPtrs; 5027 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5028 auto *Latch = TheLoop->getLoopLatch(); 5029 5030 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5031 // The pointer operands of loads and stores will be scalar as long as the 5032 // memory access is not a gather or scatter operation. The value operand of a 5033 // store will remain scalar if the store is scalarized. 
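  // For example (a sketch): the pointer feeding a consecutive load that will
  // be widened into one wide load stays scalar, whereas a gather needs a
  // distinct address per lane, so its pointer operand must be vectorized.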
5034 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5035 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5036 assert(WideningDecision != CM_Unknown && 5037 "Widening decision should be ready at this moment"); 5038 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5039 if (Ptr == Store->getValueOperand()) 5040 return WideningDecision == CM_Scalarize; 5041 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5042 "Ptr is neither a value or pointer operand"); 5043 return WideningDecision != CM_GatherScatter; 5044 }; 5045 5046 // A helper that returns true if the given value is a bitcast or 5047 // getelementptr instruction contained in the loop. 5048 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5049 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5050 isa<GetElementPtrInst>(V)) && 5051 !TheLoop->isLoopInvariant(V); 5052 }; 5053 5054 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5055 if (!isa<PHINode>(Ptr) || 5056 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5057 return false; 5058 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5059 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5060 return false; 5061 return isScalarUse(MemAccess, Ptr); 5062 }; 5063 5064 // A helper that evaluates a memory access's use of a pointer. If the 5065 // pointer is actually the pointer induction of a loop, it is being 5066 // inserted into Worklist. If the use will be a scalar use, and the 5067 // pointer is only used by memory accesses, we place the pointer in 5068 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5069 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5070 if (isScalarPtrInduction(MemAccess, Ptr)) { 5071 Worklist.insert(cast<Instruction>(Ptr)); 5072 Instruction *Update = cast<Instruction>( 5073 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5074 Worklist.insert(Update); 5075 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5076 << "\n"); 5077 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5078 << "\n"); 5079 return; 5080 } 5081 // We only care about bitcast and getelementptr instructions contained in 5082 // the loop. 5083 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5084 return; 5085 5086 // If the pointer has already been identified as scalar (e.g., if it was 5087 // also identified as uniform), there's nothing to do. 5088 auto *I = cast<Instruction>(Ptr); 5089 if (Worklist.count(I)) 5090 return; 5091 5092 // If the use of the pointer will be a scalar use, and all users of the 5093 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5094 // place the pointer in PossibleNonScalarPtrs. 5095 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5096 return isa<LoadInst>(U) || isa<StoreInst>(U); 5097 })) 5098 ScalarPtrs.insert(I); 5099 else 5100 PossibleNonScalarPtrs.insert(I); 5101 }; 5102 5103 // We seed the scalars analysis with three classes of instructions: (1) 5104 // instructions marked uniform-after-vectorization and (2) bitcast, 5105 // getelementptr and (pointer) phi instructions used by memory accesses 5106 // requiring a scalar use. 5107 // 5108 // (1) Add to the worklist all instructions that have been identified as 5109 // uniform-after-vectorization. 
5110 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5111 5112 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5113 // memory accesses requiring a scalar use. The pointer operands of loads and 5114 // stores will be scalar as long as the memory accesses is not a gather or 5115 // scatter operation. The value operand of a store will remain scalar if the 5116 // store is scalarized. 5117 for (auto *BB : TheLoop->blocks()) 5118 for (auto &I : *BB) { 5119 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5120 evaluatePtrUse(Load, Load->getPointerOperand()); 5121 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5122 evaluatePtrUse(Store, Store->getPointerOperand()); 5123 evaluatePtrUse(Store, Store->getValueOperand()); 5124 } 5125 } 5126 for (auto *I : ScalarPtrs) 5127 if (!PossibleNonScalarPtrs.count(I)) { 5128 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5129 Worklist.insert(I); 5130 } 5131 5132 // Insert the forced scalars. 5133 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5134 // induction variable when the PHI user is scalarized. 5135 auto ForcedScalar = ForcedScalars.find(VF); 5136 if (ForcedScalar != ForcedScalars.end()) 5137 for (auto *I : ForcedScalar->second) 5138 Worklist.insert(I); 5139 5140 // Expand the worklist by looking through any bitcasts and getelementptr 5141 // instructions we've already identified as scalar. This is similar to the 5142 // expansion step in collectLoopUniforms(); however, here we're only 5143 // expanding to include additional bitcasts and getelementptr instructions. 5144 unsigned Idx = 0; 5145 while (Idx != Worklist.size()) { 5146 Instruction *Dst = Worklist[Idx++]; 5147 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5148 continue; 5149 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5150 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5151 auto *J = cast<Instruction>(U); 5152 return !TheLoop->contains(J) || Worklist.count(J) || 5153 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5154 isScalarUse(J, Src)); 5155 })) { 5156 Worklist.insert(Src); 5157 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5158 } 5159 } 5160 5161 // An induction variable will remain scalar if all users of the induction 5162 // variable and induction variable update remain scalar. 5163 for (auto &Induction : Legal->getInductionVars()) { 5164 auto *Ind = Induction.first; 5165 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5166 5167 // If tail-folding is applied, the primary induction variable will be used 5168 // to feed a vector compare. 5169 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5170 continue; 5171 5172 // Determine if all users of the induction variable are scalar after 5173 // vectorization. 5174 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5175 auto *I = cast<Instruction>(U); 5176 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5177 }); 5178 if (!ScalarInd) 5179 continue; 5180 5181 // Determine if all users of the induction variable update instruction are 5182 // scalar after vectorization. 5183 auto ScalarIndUpdate = 5184 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5185 auto *I = cast<Instruction>(U); 5186 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5187 }); 5188 if (!ScalarIndUpdate) 5189 continue; 5190 5191 // The induction variable and its update instruction will remain scalar. 
5192 Worklist.insert(Ind); 5193 Worklist.insert(IndUpdate); 5194 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5195 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5196 << "\n"); 5197 } 5198 5199 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5200 } 5201 5202 bool LoopVectorizationCostModel::isScalarWithPredication( 5203 Instruction *I, ElementCount VF) const { 5204 if (!blockNeedsPredication(I->getParent())) 5205 return false; 5206 switch(I->getOpcode()) { 5207 default: 5208 break; 5209 case Instruction::Load: 5210 case Instruction::Store: { 5211 if (!Legal->isMaskRequired(I)) 5212 return false; 5213 auto *Ptr = getLoadStorePointerOperand(I); 5214 auto *Ty = getMemInstValueType(I); 5215 // We have already decided how to vectorize this instruction, get that 5216 // result. 5217 if (VF.isVector()) { 5218 InstWidening WideningDecision = getWideningDecision(I, VF); 5219 assert(WideningDecision != CM_Unknown && 5220 "Widening decision should be ready at this moment"); 5221 return WideningDecision == CM_Scalarize; 5222 } 5223 const Align Alignment = getLoadStoreAlignment(I); 5224 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5225 isLegalMaskedGather(Ty, Alignment)) 5226 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5227 isLegalMaskedScatter(Ty, Alignment)); 5228 } 5229 case Instruction::UDiv: 5230 case Instruction::SDiv: 5231 case Instruction::SRem: 5232 case Instruction::URem: 5233 return mayDivideByZero(*I); 5234 } 5235 return false; 5236 } 5237 5238 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5239 Instruction *I, ElementCount VF) { 5240 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5241 assert(getWideningDecision(I, VF) == CM_Unknown && 5242 "Decision should not be set yet."); 5243 auto *Group = getInterleavedAccessGroup(I); 5244 assert(Group && "Must have a group."); 5245 5246 // If the instruction's allocated size doesn't equal it's type size, it 5247 // requires padding and will be scalarized. 5248 auto &DL = I->getModule()->getDataLayout(); 5249 auto *ScalarTy = getMemInstValueType(I); 5250 if (hasIrregularType(ScalarTy, DL)) 5251 return false; 5252 5253 // Check if masking is required. 5254 // A Group may need masking for one of two reasons: it resides in a block that 5255 // needs predication, or it was decided to use masking to deal with gaps. 5256 bool PredicatedAccessRequiresMasking = 5257 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5258 bool AccessWithGapsRequiresMasking = 5259 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5260 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5261 return true; 5262 5263 // If masked interleaving is required, we expect that the user/target had 5264 // enabled it, because otherwise it either wouldn't have been created or 5265 // it should have been invalidated by the CostModel. 5266 assert(useMaskedInterleavedAccesses(TTI) && 5267 "Masked interleave-groups for predicated accesses are not enabled."); 5268 5269 auto *Ty = getMemInstValueType(I); 5270 const Align Alignment = getLoadStoreAlignment(I); 5271 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5272 : TTI.isLegalMaskedStore(Ty, Alignment); 5273 } 5274 5275 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5276 Instruction *I, ElementCount VF) { 5277 // Get and ensure we have a valid memory instruction. 
5278 LoadInst *LI = dyn_cast<LoadInst>(I); 5279 StoreInst *SI = dyn_cast<StoreInst>(I); 5280 assert((LI || SI) && "Invalid memory instruction"); 5281 5282 auto *Ptr = getLoadStorePointerOperand(I); 5283 5284 // In order to be widened, the pointer should be consecutive, first of all. 5285 if (!Legal->isConsecutivePtr(Ptr)) 5286 return false; 5287 5288 // If the instruction is a store located in a predicated block, it will be 5289 // scalarized. 5290 if (isScalarWithPredication(I)) 5291 return false; 5292 5293 // If the instruction's allocated size doesn't equal it's type size, it 5294 // requires padding and will be scalarized. 5295 auto &DL = I->getModule()->getDataLayout(); 5296 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5297 if (hasIrregularType(ScalarTy, DL)) 5298 return false; 5299 5300 return true; 5301 } 5302 5303 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5304 // We should not collect Uniforms more than once per VF. Right now, 5305 // this function is called from collectUniformsAndScalars(), which 5306 // already does this check. Collecting Uniforms for VF=1 does not make any 5307 // sense. 5308 5309 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5310 "This function should not be visited twice for the same VF"); 5311 5312 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5313 // not analyze again. Uniforms.count(VF) will return 1. 5314 Uniforms[VF].clear(); 5315 5316 // We now know that the loop is vectorizable! 5317 // Collect instructions inside the loop that will remain uniform after 5318 // vectorization. 5319 5320 // Global values, params and instructions outside of current loop are out of 5321 // scope. 5322 auto isOutOfScope = [&](Value *V) -> bool { 5323 Instruction *I = dyn_cast<Instruction>(V); 5324 return (!I || !TheLoop->contains(I)); 5325 }; 5326 5327 SetVector<Instruction *> Worklist; 5328 BasicBlock *Latch = TheLoop->getLoopLatch(); 5329 5330 // Instructions that are scalar with predication must not be considered 5331 // uniform after vectorization, because that would create an erroneous 5332 // replicating region where only a single instance out of VF should be formed. 5333 // TODO: optimize such seldom cases if found important, see PR40816. 5334 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5335 if (isOutOfScope(I)) { 5336 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5337 << *I << "\n"); 5338 return; 5339 } 5340 if (isScalarWithPredication(I, VF)) { 5341 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5342 << *I << "\n"); 5343 return; 5344 } 5345 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5346 Worklist.insert(I); 5347 }; 5348 5349 // Start with the conditional branch. If the branch condition is an 5350 // instruction contained in the loop that is only used by the branch, it is 5351 // uniform. 5352 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5353 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5354 addToWorklistIfAllowed(Cmp); 5355 5356 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5357 InstWidening WideningDecision = getWideningDecision(I, VF); 5358 assert(WideningDecision != CM_Unknown && 5359 "Widening decision should be ready at this moment"); 5360 5361 // A uniform memory op is itself uniform. We exclude uniform stores 5362 // here as they demand the last lane, not the first one. 
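      // For example (a sketch): a load from a loop-invariant address yields
      // the same value in every lane and can be treated as uniform, whereas a
      // store to a loop-invariant address must keep the value of the final
      // iteration, i.e. the last lane, so it is deliberately left out here.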
5363 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5364 assert(WideningDecision == CM_Scalarize); 5365 return true; 5366 } 5367 5368 return (WideningDecision == CM_Widen || 5369 WideningDecision == CM_Widen_Reverse || 5370 WideningDecision == CM_Interleave); 5371 }; 5372 5373 5374 // Returns true if Ptr is the pointer operand of a memory access instruction 5375 // I, and I is known to not require scalarization. 5376 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5377 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5378 }; 5379 5380 // Holds a list of values which are known to have at least one uniform use. 5381 // Note that there may be other uses which aren't uniform. A "uniform use" 5382 // here is something which only demands lane 0 of the unrolled iterations; 5383 // it does not imply that all lanes produce the same value (e.g. this is not 5384 // the usual meaning of uniform) 5385 SmallPtrSet<Value *, 8> HasUniformUse; 5386 5387 // Scan the loop for instructions which are either a) known to have only 5388 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5389 for (auto *BB : TheLoop->blocks()) 5390 for (auto &I : *BB) { 5391 // If there's no pointer operand, there's nothing to do. 5392 auto *Ptr = getLoadStorePointerOperand(&I); 5393 if (!Ptr) 5394 continue; 5395 5396 // A uniform memory op is itself uniform. We exclude uniform stores 5397 // here as they demand the last lane, not the first one. 5398 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5399 addToWorklistIfAllowed(&I); 5400 5401 if (isUniformDecision(&I, VF)) { 5402 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5403 HasUniformUse.insert(Ptr); 5404 } 5405 } 5406 5407 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5408 // demanding) users. Since loops are assumed to be in LCSSA form, this 5409 // disallows uses outside the loop as well. 5410 for (auto *V : HasUniformUse) { 5411 if (isOutOfScope(V)) 5412 continue; 5413 auto *I = cast<Instruction>(V); 5414 auto UsersAreMemAccesses = 5415 llvm::all_of(I->users(), [&](User *U) -> bool { 5416 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5417 }); 5418 if (UsersAreMemAccesses) 5419 addToWorklistIfAllowed(I); 5420 } 5421 5422 // Expand Worklist in topological order: whenever a new instruction 5423 // is added , its users should be already inside Worklist. It ensures 5424 // a uniform instruction will only be used by uniform instructions. 5425 unsigned idx = 0; 5426 while (idx != Worklist.size()) { 5427 Instruction *I = Worklist[idx++]; 5428 5429 for (auto OV : I->operand_values()) { 5430 // isOutOfScope operands cannot be uniform instructions. 5431 if (isOutOfScope(OV)) 5432 continue; 5433 // First order recurrence Phi's should typically be considered 5434 // non-uniform. 5435 auto *OP = dyn_cast<PHINode>(OV); 5436 if (OP && Legal->isFirstOrderRecurrence(OP)) 5437 continue; 5438 // If all the users of the operand are uniform, then add the 5439 // operand into the uniform worklist. 5440 auto *OI = cast<Instruction>(OV); 5441 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5442 auto *J = cast<Instruction>(U); 5443 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5444 })) 5445 addToWorklistIfAllowed(OI); 5446 } 5447 } 5448 5449 // For an instruction to be added into Worklist above, all its users inside 5450 // the loop should also be in Worklist. 
However, this condition cannot be 5451 // true for phi nodes that form a cyclic dependence. We must process phi 5452 // nodes separately. An induction variable will remain uniform if all users 5453 // of the induction variable and induction variable update remain uniform. 5454 // The code below handles both pointer and non-pointer induction variables. 5455 for (auto &Induction : Legal->getInductionVars()) { 5456 auto *Ind = Induction.first; 5457 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5458 5459 // Determine if all users of the induction variable are uniform after 5460 // vectorization. 5461 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5462 auto *I = cast<Instruction>(U); 5463 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5464 isVectorizedMemAccessUse(I, Ind); 5465 }); 5466 if (!UniformInd) 5467 continue; 5468 5469 // Determine if all users of the induction variable update instruction are 5470 // uniform after vectorization. 5471 auto UniformIndUpdate = 5472 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5473 auto *I = cast<Instruction>(U); 5474 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5475 isVectorizedMemAccessUse(I, IndUpdate); 5476 }); 5477 if (!UniformIndUpdate) 5478 continue; 5479 5480 // The induction variable and its update instruction will remain uniform. 5481 addToWorklistIfAllowed(Ind); 5482 addToWorklistIfAllowed(IndUpdate); 5483 } 5484 5485 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5486 } 5487 5488 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5489 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5490 5491 if (Legal->getRuntimePointerChecking()->Need) { 5492 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5493 "runtime pointer checks needed. Enable vectorization of this " 5494 "loop with '#pragma clang loop vectorize(enable)' when " 5495 "compiling with -Os/-Oz", 5496 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5497 return true; 5498 } 5499 5500 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5501 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5502 "runtime SCEV checks needed. Enable vectorization of this " 5503 "loop with '#pragma clang loop vectorize(enable)' when " 5504 "compiling with -Os/-Oz", 5505 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5506 return true; 5507 } 5508 5509 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5510 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5511 reportVectorizationFailure("Runtime stride check for small trip count", 5512 "runtime stride == 1 checks needed. Enable vectorization of " 5513 "this loop without such check by compiling with -Os/-Oz", 5514 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5515 return true; 5516 } 5517 5518 return false; 5519 } 5520 5521 Optional<ElementCount> 5522 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5523 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5524 // TODO: It may by useful to do since it's still likely to be dynamically 5525 // uniform if the target can skip. 5526 reportVectorizationFailure( 5527 "Not inserting runtime ptr check for divergent target", 5528 "runtime pointer checks needed. 
Not enabled for divergent target", 5529 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5530 return None; 5531 } 5532 5533 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5534 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5535 if (TC == 1) { 5536 reportVectorizationFailure("Single iteration (non) loop", 5537 "loop trip count is one, irrelevant for vectorization", 5538 "SingleIterationLoop", ORE, TheLoop); 5539 return None; 5540 } 5541 5542 switch (ScalarEpilogueStatus) { 5543 case CM_ScalarEpilogueAllowed: 5544 return computeFeasibleMaxVF(TC, UserVF); 5545 case CM_ScalarEpilogueNotAllowedUsePredicate: 5546 LLVM_FALLTHROUGH; 5547 case CM_ScalarEpilogueNotNeededUsePredicate: 5548 LLVM_DEBUG( 5549 dbgs() << "LV: vector predicate hint/switch found.\n" 5550 << "LV: Not allowing scalar epilogue, creating predicated " 5551 << "vector loop.\n"); 5552 break; 5553 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5554 // fallthrough as a special case of OptForSize 5555 case CM_ScalarEpilogueNotAllowedOptSize: 5556 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5557 LLVM_DEBUG( 5558 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5559 else 5560 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5561 << "count.\n"); 5562 5563 // Bail if runtime checks are required, which are not good when optimising 5564 // for size. 5565 if (runtimeChecksRequired()) 5566 return None; 5567 5568 break; 5569 } 5570 5571 // The only loops we can vectorize without a scalar epilogue are loops with 5572 // a bottom-test and a single exiting block. We'd have to handle the fact 5573 // that not every instruction executes on the last iteration. This will 5574 // require a lane mask which varies through the vector loop body. (TODO) 5575 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5576 // If there was a tail-folding hint/switch, but we can't fold the tail by 5577 // masking, fallback to a vectorization with a scalar epilogue. 5578 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5579 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5580 "scalar epilogue instead.\n"); 5581 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5582 return computeFeasibleMaxVF(TC, UserVF); 5583 } 5584 return None; 5585 } 5586 5587 // Now try the tail folding 5588 5589 // Invalidate interleave groups that require an epilogue if we can't mask 5590 // the interleave-group. 5591 if (!useMaskedInterleavedAccesses(TTI)) { 5592 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5593 "No decisions should have been taken at this point"); 5594 // Note: There is no need to invalidate any cost modeling decisions here, as 5595 // none were taken so far. 5596 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5597 } 5598 5599 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); 5600 assert(!MaxVF.isScalable() && 5601 "Scalable vectors do not yet support tail folding"); 5602 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && 5603 "MaxVF must be a power of 2"); 5604 unsigned MaxVFtimesIC = 5605 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); 5606 // Avoid tail folding if the trip count is known to be a multiple of any VF we 5607 // choose.
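// For example, a loop with a known trip count of 64 and MaxVFtimesIC == 8 has
// a zero remainder below, so no scalar tail is needed for any power-of-2 VF up
// to MaxVF.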
5608 ScalarEvolution *SE = PSE.getSE(); 5609 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5610 const SCEV *ExitCount = SE->getAddExpr( 5611 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5612 const SCEV *Rem = SE->getURemExpr( 5613 SE->applyLoopGuards(ExitCount, TheLoop), 5614 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5615 if (Rem->isZero()) { 5616 // Accept MaxVF if we do not have a tail. 5617 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5618 return MaxVF; 5619 } 5620 5621 // If we don't know the precise trip count, or if the trip count that we 5622 // found modulo the vectorization factor is not zero, try to fold the tail 5623 // by masking. 5624 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5625 if (Legal->prepareToFoldTailByMasking()) { 5626 FoldTailByMasking = true; 5627 return MaxVF; 5628 } 5629 5630 // If there was a tail-folding hint/switch, but we can't fold the tail by 5631 // masking, fallback to a vectorization with a scalar epilogue. 5632 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5633 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5634 "scalar epilogue instead.\n"); 5635 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5636 return MaxVF; 5637 } 5638 5639 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5640 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5641 return None; 5642 } 5643 5644 if (TC == 0) { 5645 reportVectorizationFailure( 5646 "Unable to calculate the loop count due to complex control flow", 5647 "unable to calculate the loop count due to complex control flow", 5648 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5649 return None; 5650 } 5651 5652 reportVectorizationFailure( 5653 "Cannot optimize for size and vectorize at the same time.", 5654 "cannot optimize for size and vectorize at the same time. " 5655 "Enable vectorization of this loop with '#pragma clang loop " 5656 "vectorize(enable)' when compiling with -Os/-Oz", 5657 "NoTailLoopWithOptForSize", ORE, TheLoop); 5658 return None; 5659 } 5660 5661 ElementCount 5662 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5663 ElementCount UserVF) { 5664 bool IgnoreScalableUserVF = UserVF.isScalable() && 5665 !TTI.supportsScalableVectors() && 5666 !ForceTargetSupportsScalableVectors; 5667 if (IgnoreScalableUserVF) { 5668 LLVM_DEBUG( 5669 dbgs() << "LV: Ignoring VF=" << UserVF 5670 << " because target does not support scalable vectors.\n"); 5671 ORE->emit([&]() { 5672 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5673 TheLoop->getStartLoc(), 5674 TheLoop->getHeader()) 5675 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5676 << " because target does not support scalable vectors."; 5677 }); 5678 } 5679 5680 // Beyond this point two scenarios are handled. If UserVF isn't specified 5681 // then a suitable VF is chosen. If UserVF is specified and there are 5682 // dependencies, check if it's legal. However, if a UserVF is specified and 5683 // there are no dependencies, then there's nothing to do. 5684 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5685 if (!canVectorizeReductions(UserVF)) { 5686 reportVectorizationFailure( 5687 "LV: Scalable vectorization not supported for the reduction " 5688 "operations found in this loop. 
Using fixed-width " 5689 "vectorization instead.", 5690 "Scalable vectorization not supported for the reduction operations " 5691 "found in this loop. Using fixed-width vectorization instead.", 5692 "ScalableVFUnfeasible", ORE, TheLoop); 5693 return computeFeasibleMaxVF( 5694 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5695 } 5696 5697 if (Legal->isSafeForAnyVectorWidth()) 5698 return UserVF; 5699 } 5700 5701 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5702 unsigned SmallestType, WidestType; 5703 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5704 unsigned WidestRegister = 5705 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 5706 .getFixedSize(); 5707 5708 // Get the maximum safe dependence distance in bits computed by LAA. 5709 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5710 // the memory accesses that is most restrictive (involved in the smallest 5711 // dependence distance). 5712 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5713 5714 // If the user vectorization factor is legally unsafe, clamp it to a safe 5715 // value. Otherwise, return as is. 5716 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5717 unsigned MaxSafeElements = 5718 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5719 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5720 5721 if (UserVF.isScalable()) { 5722 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5723 5724 // Scale VF by vscale before checking if it's safe. 5725 MaxSafeVF = ElementCount::getScalable( 5726 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5727 5728 if (MaxSafeVF.isZero()) { 5729 // The dependence distance is too small to use scalable vectors, 5730 // fallback on fixed. 5731 LLVM_DEBUG( 5732 dbgs() 5733 << "LV: Max legal vector width too small, scalable vectorization " 5734 "unfeasible. Using fixed-width vectorization instead.\n"); 5735 ORE->emit([&]() { 5736 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5737 TheLoop->getStartLoc(), 5738 TheLoop->getHeader()) 5739 << "Max legal vector width too small, scalable vectorization " 5740 << "unfeasible. Using fixed-width vectorization instead."; 5741 }); 5742 return computeFeasibleMaxVF( 5743 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5744 } 5745 } 5746 5747 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5748 5749 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5750 return UserVF; 5751 5752 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5753 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5754 << ".\n"); 5755 ORE->emit([&]() { 5756 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5757 TheLoop->getStartLoc(), 5758 TheLoop->getHeader()) 5759 << "User-specified vectorization factor " 5760 << ore::NV("UserVectorizationFactor", UserVF) 5761 << " is unsafe, clamping to maximum safe vectorization factor " 5762 << ore::NV("VectorizationFactor", MaxSafeVF); 5763 }); 5764 return MaxSafeVF; 5765 } 5766 5767 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5768 5769 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5770 // Note that both WidestRegister and WidestType may not be a powers of 2. 
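// For example, a 256-bit widest register with a 64-bit widest element type
// gives PowerOf2Floor(256 / 64) == 4 lanes, while a hypothetical 96-bit
// register would be rounded down to PowerOf2Floor(96 / 64) == 1 lane.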
5771 auto MaxVectorSize = 5772 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); 5773 5774 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5775 << " / " << WidestType << " bits.\n"); 5776 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5777 << WidestRegister << " bits.\n"); 5778 5779 assert(MaxVectorSize.getFixedValue() <= WidestRegister && 5780 "Did not expect to pack so many elements" 5781 " into one vector!"); 5782 if (MaxVectorSize.getFixedValue() == 0) { 5783 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5784 return ElementCount::getFixed(1); 5785 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && 5786 isPowerOf2_32(ConstTripCount)) { 5787 // We need to clamp the VF to be the ConstTripCount. There is no point in 5788 // choosing a higher viable VF as done in the loop below. 5789 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5790 << ConstTripCount << "\n"); 5791 return ElementCount::getFixed(ConstTripCount); 5792 } 5793 5794 ElementCount MaxVF = MaxVectorSize; 5795 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5796 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5797 // Collect all viable vectorization factors larger than the default MaxVF 5798 // (i.e. MaxVectorSize). 5799 SmallVector<ElementCount, 8> VFs; 5800 auto MaxVectorSizeMaxBW = 5801 ElementCount::getFixed(WidestRegister / SmallestType); 5802 for (ElementCount VS = MaxVectorSize * 2; 5803 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) 5804 VFs.push_back(VS); 5805 5806 // For each VF calculate its register usage. 5807 auto RUs = calculateRegisterUsage(VFs); 5808 5809 // Select the largest VF which doesn't require more registers than existing 5810 // ones. 5811 for (int i = RUs.size() - 1; i >= 0; --i) { 5812 bool Selected = true; 5813 for (auto &pair : RUs[i].MaxLocalUsers) { 5814 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5815 if (pair.second > TargetNumRegisters) 5816 Selected = false; 5817 } 5818 if (Selected) { 5819 MaxVF = VFs[i]; 5820 break; 5821 } 5822 } 5823 if (ElementCount MinVF = 5824 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { 5825 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5826 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5827 << ") with target's minimum: " << MinVF << '\n'); 5828 MaxVF = MinVF; 5829 } 5830 } 5831 } 5832 return MaxVF; 5833 } 5834 5835 VectorizationFactor 5836 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5837 // FIXME: This can be fixed for scalable vectors later, because at this stage 5838 // the LoopVectorizer will only consider vectorizing a loop with scalable 5839 // vectors when the loop has a hint to enable vectorization for a given VF. 5840 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5841 5842 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5843 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5844 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5845 5846 auto Width = ElementCount::getFixed(1); 5847 const float ScalarCost = *ExpectedCost.getValue(); 5848 float Cost = ScalarCost; 5849 5850 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5851 if (ForceVectorization && MaxVF.isVector()) { 5852 // Ignore scalar width, because the user explicitly wants vectorization. 
5853 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5854 // evaluation. 5855 Cost = std::numeric_limits<float>::max(); 5856 } 5857 5858 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 5859 i *= 2) { 5860 // Notice that the vector loop needs to be executed less times, so 5861 // we need to divide the cost of the vector loops by the width of 5862 // the vector elements. 5863 VectorizationCostTy C = expectedCost(i); 5864 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5865 float VectorCost = *C.first.getValue() / (float)i.getFixedValue(); 5866 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5867 << " costs: " << (int)VectorCost << ".\n"); 5868 if (!C.second && !ForceVectorization) { 5869 LLVM_DEBUG( 5870 dbgs() << "LV: Not considering vector loop of width " << i 5871 << " because it will not generate any vector instructions.\n"); 5872 continue; 5873 } 5874 5875 // If profitable add it to ProfitableVF list. 5876 if (VectorCost < ScalarCost) { 5877 ProfitableVFs.push_back(VectorizationFactor( 5878 {i, (unsigned)VectorCost})); 5879 } 5880 5881 if (VectorCost < Cost) { 5882 Cost = VectorCost; 5883 Width = i; 5884 } 5885 } 5886 5887 if (!EnableCondStoresVectorization && NumPredStores) { 5888 reportVectorizationFailure("There are conditional stores.", 5889 "store that is conditionally executed prevents vectorization", 5890 "ConditionalStore", ORE, TheLoop); 5891 Width = ElementCount::getFixed(1); 5892 Cost = ScalarCost; 5893 } 5894 5895 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs() 5896 << "LV: Vectorization seems to be not beneficial, " 5897 << "but was forced by a user.\n"); 5898 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5899 VectorizationFactor Factor = {Width, 5900 (unsigned)(Width.getKnownMinValue() * Cost)}; 5901 return Factor; 5902 } 5903 5904 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5905 const Loop &L, ElementCount VF) const { 5906 // Cross iteration phis such as reductions need special handling and are 5907 // currently unsupported. 5908 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5909 return Legal->isFirstOrderRecurrence(&Phi) || 5910 Legal->isReductionVariable(&Phi); 5911 })) 5912 return false; 5913 5914 // Phis with uses outside of the loop require special handling and are 5915 // currently unsupported. 5916 for (auto &Entry : Legal->getInductionVars()) { 5917 // Look for uses of the value of the induction at the last iteration. 5918 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5919 for (User *U : PostInc->users()) 5920 if (!L.contains(cast<Instruction>(U))) 5921 return false; 5922 // Look for uses of penultimate value of the induction. 5923 for (User *U : Entry.first->users()) 5924 if (!L.contains(cast<Instruction>(U))) 5925 return false; 5926 } 5927 5928 // Induction variables that are widened require special handling that is 5929 // currently not supported. 
5930 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5931 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5932 this->isProfitableToScalarize(Entry.first, VF)); 5933 })) 5934 return false; 5935 5936 return true; 5937 } 5938 5939 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5940 const ElementCount VF) const { 5941 // FIXME: We need a much better cost-model to take different parameters such 5942 // as register pressure, code size increase and cost of extra branches into 5943 // account. For now we apply a very crude heuristic and only consider loops 5944 // with vectorization factors larger than a certain value. 5945 // We also consider epilogue vectorization unprofitable for targets that don't 5946 // consider interleaving beneficial (eg. MVE). 5947 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5948 return false; 5949 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5950 return true; 5951 return false; 5952 } 5953 5954 VectorizationFactor 5955 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5956 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5957 VectorizationFactor Result = VectorizationFactor::Disabled(); 5958 if (!EnableEpilogueVectorization) { 5959 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5960 return Result; 5961 } 5962 5963 if (!isScalarEpilogueAllowed()) { 5964 LLVM_DEBUG( 5965 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5966 "allowed.\n";); 5967 return Result; 5968 } 5969 5970 // FIXME: This can be fixed for scalable vectors later, because at this stage 5971 // the LoopVectorizer will only consider vectorizing a loop with scalable 5972 // vectors when the loop has a hint to enable vectorization for a given VF. 5973 if (MainLoopVF.isScalable()) { 5974 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5975 "yet supported.\n"); 5976 return Result; 5977 } 5978 5979 // Not really a cost consideration, but check for unsupported cases here to 5980 // simplify the logic. 
5981 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5982 LLVM_DEBUG( 5983 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5984 "not a supported candidate.\n";); 5985 return Result; 5986 } 5987 5988 if (EpilogueVectorizationForceVF > 1) { 5989 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5990 if (LVP.hasPlanWithVFs( 5991 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5992 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5993 else { 5994 LLVM_DEBUG( 5995 dbgs() 5996 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5997 return Result; 5998 } 5999 } 6000 6001 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6002 TheLoop->getHeader()->getParent()->hasMinSize()) { 6003 LLVM_DEBUG( 6004 dbgs() 6005 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6006 return Result; 6007 } 6008 6009 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6010 return Result; 6011 6012 for (auto &NextVF : ProfitableVFs) 6013 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6014 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 6015 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6016 Result = NextVF; 6017 6018 if (Result != VectorizationFactor::Disabled()) 6019 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6020 << Result.Width.getFixedValue() << "\n";); 6021 return Result; 6022 } 6023 6024 std::pair<unsigned, unsigned> 6025 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6026 unsigned MinWidth = -1U; 6027 unsigned MaxWidth = 8; 6028 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6029 6030 // For each block. 6031 for (BasicBlock *BB : TheLoop->blocks()) { 6032 // For each instruction in the loop. 6033 for (Instruction &I : BB->instructionsWithoutDebug()) { 6034 Type *T = I.getType(); 6035 6036 // Skip ignored values. 6037 if (ValuesToIgnore.count(&I)) 6038 continue; 6039 6040 // Only examine Loads, Stores and PHINodes. 6041 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6042 continue; 6043 6044 // Examine PHI nodes that are reduction variables. Update the type to 6045 // account for the recurrence type. 6046 if (auto *PN = dyn_cast<PHINode>(&I)) { 6047 if (!Legal->isReductionVariable(PN)) 6048 continue; 6049 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6050 if (PreferInLoopReductions || 6051 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6052 RdxDesc.getRecurrenceType(), 6053 TargetTransformInfo::ReductionFlags())) 6054 continue; 6055 T = RdxDesc.getRecurrenceType(); 6056 } 6057 6058 // Examine the stored values. 6059 if (auto *ST = dyn_cast<StoreInst>(&I)) 6060 T = ST->getValueOperand()->getType(); 6061 6062 // Ignore loaded pointer types and stored pointer types that are not 6063 // vectorizable. 6064 // 6065 // FIXME: The check here attempts to predict whether a load or store will 6066 // be vectorized. We only know this for certain after a VF has 6067 // been selected. Here, we assume that if an access can be 6068 // vectorized, it will be. We should also look at extending this 6069 // optimization to non-pointer types. 
6070 // 6071 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6072 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6073 continue; 6074 6075 MinWidth = std::min(MinWidth, 6076 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6077 MaxWidth = std::max(MaxWidth, 6078 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6079 } 6080 } 6081 6082 return {MinWidth, MaxWidth}; 6083 } 6084 6085 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6086 unsigned LoopCost) { 6087 // -- The interleave heuristics -- 6088 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6089 // There are many micro-architectural considerations that we can't predict 6090 // at this level. For example, frontend pressure (on decode or fetch) due to 6091 // code size, or the number and capabilities of the execution ports. 6092 // 6093 // We use the following heuristics to select the interleave count: 6094 // 1. If the code has reductions, then we interleave to break the cross 6095 // iteration dependency. 6096 // 2. If the loop is really small, then we interleave to reduce the loop 6097 // overhead. 6098 // 3. We don't interleave if we think that we will spill registers to memory 6099 // due to the increased register pressure. 6100 6101 if (!isScalarEpilogueAllowed()) 6102 return 1; 6103 6104 // We used the distance for the interleave count. 6105 if (Legal->getMaxSafeDepDistBytes() != -1U) 6106 return 1; 6107 6108 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6109 const bool HasReductions = !Legal->getReductionVars().empty(); 6110 // Do not interleave loops with a relatively small known or estimated trip 6111 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6112 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6113 // because with the above conditions interleaving can expose ILP and break 6114 // cross iteration dependences for reductions. 6115 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6116 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6117 return 1; 6118 6119 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6120 // We divide by these constants so assume that we have at least one 6121 // instruction that uses at least one register. 6122 for (auto& pair : R.MaxLocalUsers) { 6123 pair.second = std::max(pair.second, 1U); 6124 } 6125 6126 // We calculate the interleave count using the following formula. 6127 // Subtract the number of loop invariants from the number of available 6128 // registers. These registers are used by all of the interleaved instances. 6129 // Next, divide the remaining registers by the number of registers that is 6130 // required by the loop, in order to estimate how many parallel instances 6131 // fit without causing spills. All of this is rounded down if necessary to be 6132 // a power of two. We want power of two interleave count to simplify any 6133 // addressing operations or alignment considerations. 6134 // We also want power of two interleave counts to ensure that the induction 6135 // variable of the vector loop wraps to zero, when tail is folded by masking; 6136 // this currently happens when OptForSize, in which case IC is set to 1 above. 
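// Illustration with made-up numbers: a register class with 32 registers, 2 of
// them holding loop-invariant values and at most 6 values live at once yields
// PowerOf2Floor((32 - 2) / 6) == 4 interleaved instances.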
6137 unsigned IC = UINT_MAX; 6138 6139 for (auto& pair : R.MaxLocalUsers) { 6140 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6141 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6142 << " registers of " 6143 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6144 if (VF.isScalar()) { 6145 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6146 TargetNumRegisters = ForceTargetNumScalarRegs; 6147 } else { 6148 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6149 TargetNumRegisters = ForceTargetNumVectorRegs; 6150 } 6151 unsigned MaxLocalUsers = pair.second; 6152 unsigned LoopInvariantRegs = 0; 6153 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6154 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6155 6156 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6157 // Don't count the induction variable as interleaved. 6158 if (EnableIndVarRegisterHeur) { 6159 TmpIC = 6160 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6161 std::max(1U, (MaxLocalUsers - 1))); 6162 } 6163 6164 IC = std::min(IC, TmpIC); 6165 } 6166 6167 // Clamp the interleave ranges to reasonable counts. 6168 unsigned MaxInterleaveCount = 6169 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6170 6171 // Check if the user has overridden the max. 6172 if (VF.isScalar()) { 6173 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6174 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6175 } else { 6176 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6177 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6178 } 6179 6180 // If trip count is known or estimated compile time constant, limit the 6181 // interleave count to be less than the trip count divided by VF, provided it 6182 // is at least 1. 6183 // 6184 // For scalable vectors we can't know if interleaving is beneficial. It may 6185 // not be beneficial for small loops if none of the lanes in the second vector 6186 // iterations is enabled. However, for larger loops, there is likely to be a 6187 // similar benefit as for fixed-width vectors. For now, we choose to leave 6188 // the InterleaveCount as if vscale is '1', although if some information about 6189 // the vector is known (e.g. min vector size), we can make a better decision. 6190 if (BestKnownTC) { 6191 MaxInterleaveCount = 6192 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6193 // Make sure MaxInterleaveCount is greater than 0. 6194 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6195 } 6196 6197 assert(MaxInterleaveCount > 0 && 6198 "Maximum interleave count must be greater than 0"); 6199 6200 // Clamp the calculated IC to be between the 1 and the max interleave count 6201 // that the target and trip count allows. 6202 if (IC > MaxInterleaveCount) 6203 IC = MaxInterleaveCount; 6204 else 6205 // Make sure IC is greater than 0. 6206 IC = std::max(1u, IC); 6207 6208 assert(IC > 0 && "Interleave count must be greater than 0."); 6209 6210 // If we did not calculate the cost for VF (because the user selected the VF) 6211 // then we calculate the cost of VF here. 6212 if (LoopCost == 0) { 6213 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6214 LoopCost = *expectedCost(VF).first.getValue(); 6215 } 6216 6217 assert(LoopCost && "Non-zero loop cost expected"); 6218 6219 // Interleave if we vectorized this loop and there is a reduction that could 6220 // benefit from interleaving. 
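// Each interleaved copy of the loop keeps its own partial accumulator for the
// reduction; the partial results are combined after the loop, which shortens
// the cross-iteration dependence chain.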
6221 if (VF.isVector() && HasReductions) { 6222 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6223 return IC; 6224 } 6225 6226 // Note that if we've already vectorized the loop we will have done the 6227 // runtime check and so interleaving won't require further checks. 6228 bool InterleavingRequiresRuntimePointerCheck = 6229 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6230 6231 // We want to interleave small loops in order to reduce the loop overhead and 6232 // potentially expose ILP opportunities. 6233 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6234 << "LV: IC is " << IC << '\n' 6235 << "LV: VF is " << VF << '\n'); 6236 const bool AggressivelyInterleaveReductions = 6237 TTI.enableAggressiveInterleaving(HasReductions); 6238 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6239 // We assume that the cost overhead is 1 and we use the cost model 6240 // to estimate the cost of the loop and interleave until the cost of the 6241 // loop overhead is about 5% of the cost of the loop. 6242 unsigned SmallIC = 6243 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6244 6245 // Interleave until store/load ports (estimated by max interleave count) are 6246 // saturated. 6247 unsigned NumStores = Legal->getNumStores(); 6248 unsigned NumLoads = Legal->getNumLoads(); 6249 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6250 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6251 6252 // If we have a scalar reduction (vector reductions are already dealt with 6253 // by this point), we can increase the critical path length if the loop 6254 // we're interleaving is inside another loop. Limit, by default to 2, so the 6255 // critical path only gets increased by one reduction operation. 6256 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6257 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6258 SmallIC = std::min(SmallIC, F); 6259 StoresIC = std::min(StoresIC, F); 6260 LoadsIC = std::min(LoadsIC, F); 6261 } 6262 6263 if (EnableLoadStoreRuntimeInterleave && 6264 std::max(StoresIC, LoadsIC) > SmallIC) { 6265 LLVM_DEBUG( 6266 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6267 return std::max(StoresIC, LoadsIC); 6268 } 6269 6270 // If there are scalar reductions and TTI has enabled aggressive 6271 // interleaving for reductions, we will interleave to expose ILP. 6272 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6273 AggressivelyInterleaveReductions) { 6274 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6275 // Interleave no less than SmallIC but not as aggressive as the normal IC 6276 // to satisfy the rare situation when resources are too limited. 6277 return std::max(IC / 2, SmallIC); 6278 } else { 6279 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6280 return SmallIC; 6281 } 6282 } 6283 6284 // Interleave if this is a large loop (small loops are already dealt with by 6285 // this point) that could benefit from interleaving. 6286 if (AggressivelyInterleaveReductions) { 6287 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6288 return IC; 6289 } 6290 6291 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6292 return 1; 6293 } 6294 6295 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6296 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6297 // This function calculates the register usage by measuring the highest number 6298 // of values that are alive at a single location. 
Obviously, this is a very 6299 // rough estimation. We scan the loop in topological order and 6300 // assign a number to each instruction. We use RPO to ensure that defs are 6301 // met before their users. We assume that each instruction that has in-loop 6302 // users starts an interval. We record every time that an in-loop value is 6303 // used, so we have a list of the first and last occurrences of each 6304 // instruction. Next, we transpose this data structure into a multi map that 6305 // holds the list of intervals that *end* at a specific location. This multi 6306 // map allows us to perform a linear search. We scan the instructions linearly 6307 // and record each time that a new interval starts, by placing it in a set. 6308 // If we find this value in the multi-map then we remove it from the set. 6309 // The max register usage is the maximum size of the set. 6310 // We also search for instructions that are defined outside the loop, but are 6311 // used inside the loop. We need this number separately from the max-interval 6312 // usage number because when we unroll, loop-invariant values do not take 6313 // more registers. 6314 LoopBlocksDFS DFS(TheLoop); 6315 DFS.perform(LI); 6316 6317 RegisterUsage RU; 6318 6319 // Each 'key' in the map opens a new interval. The values 6320 // of the map are the index of the 'last seen' usage of the 6321 // instruction that is the key. 6322 using IntervalMap = DenseMap<Instruction *, unsigned>; 6323 6324 // Maps instruction to its index. 6325 SmallVector<Instruction *, 64> IdxToInstr; 6326 // Marks the end of each interval. 6327 IntervalMap EndPoint; 6328 // Saves the list of instruction indices that are used in the loop. 6329 SmallPtrSet<Instruction *, 8> Ends; 6330 // Saves the list of values that are used in the loop but are 6331 // defined outside the loop, such as arguments and constants. 6332 SmallPtrSet<Value *, 8> LoopInvariants; 6333 6334 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6335 for (Instruction &I : BB->instructionsWithoutDebug()) { 6336 IdxToInstr.push_back(&I); 6337 6338 // Save the end location of each USE. 6339 for (Value *U : I.operands()) { 6340 auto *Instr = dyn_cast<Instruction>(U); 6341 6342 // Ignore non-instruction values such as arguments, constants, etc. 6343 if (!Instr) 6344 continue; 6345 6346 // If this instruction is outside the loop then record it and continue. 6347 if (!TheLoop->contains(Instr)) { 6348 LoopInvariants.insert(Instr); 6349 continue; 6350 } 6351 6352 // Overwrite previous end points. 6353 EndPoint[Instr] = IdxToInstr.size(); 6354 Ends.insert(Instr); 6355 } 6356 } 6357 } 6358 6359 // Saves the list of intervals that end with the index in 'key'. 6360 using InstrList = SmallVector<Instruction *, 2>; 6361 DenseMap<unsigned, InstrList> TransposeEnds; 6362 6363 // Transpose the EndPoints to a list of values that end at each index. 6364 for (auto &Interval : EndPoint) 6365 TransposeEnds[Interval.second].push_back(Interval.first); 6366 6367 SmallPtrSet<Instruction *, 8> OpenIntervals; 6368 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6369 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6370 6371 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6372 6373 // A lambda that gets the register usage for the given type and VF.
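// Types that cannot legally be put in a vector (e.g. tokens) are treated as
// using no vector registers.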
6374 const auto &TTICapture = TTI; 6375 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6376 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6377 return 0U; 6378 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6379 }; 6380 6381 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6382 Instruction *I = IdxToInstr[i]; 6383 6384 // Remove all of the instructions that end at this location. 6385 InstrList &List = TransposeEnds[i]; 6386 for (Instruction *ToRemove : List) 6387 OpenIntervals.erase(ToRemove); 6388 6389 // Ignore instructions that are never used within the loop. 6390 if (!Ends.count(I)) 6391 continue; 6392 6393 // Skip ignored values. 6394 if (ValuesToIgnore.count(I)) 6395 continue; 6396 6397 // For each VF find the maximum usage of registers. 6398 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6399 // Count the number of live intervals. 6400 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6401 6402 if (VFs[j].isScalar()) { 6403 for (auto Inst : OpenIntervals) { 6404 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6405 if (RegUsage.find(ClassID) == RegUsage.end()) 6406 RegUsage[ClassID] = 1; 6407 else 6408 RegUsage[ClassID] += 1; 6409 } 6410 } else { 6411 collectUniformsAndScalars(VFs[j]); 6412 for (auto Inst : OpenIntervals) { 6413 // Skip ignored values for VF > 1. 6414 if (VecValuesToIgnore.count(Inst)) 6415 continue; 6416 if (isScalarAfterVectorization(Inst, VFs[j])) { 6417 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6418 if (RegUsage.find(ClassID) == RegUsage.end()) 6419 RegUsage[ClassID] = 1; 6420 else 6421 RegUsage[ClassID] += 1; 6422 } else { 6423 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6424 if (RegUsage.find(ClassID) == RegUsage.end()) 6425 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6426 else 6427 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6428 } 6429 } 6430 } 6431 6432 for (auto& pair : RegUsage) { 6433 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6434 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6435 else 6436 MaxUsages[j][pair.first] = pair.second; 6437 } 6438 } 6439 6440 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6441 << OpenIntervals.size() << '\n'); 6442 6443 // Add the current instruction to the list of open intervals. 6444 OpenIntervals.insert(I); 6445 } 6446 6447 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6448 SmallMapVector<unsigned, unsigned, 4> Invariant; 6449 6450 for (auto Inst : LoopInvariants) { 6451 unsigned Usage = 6452 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6453 unsigned ClassID = 6454 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6455 if (Invariant.find(ClassID) == Invariant.end()) 6456 Invariant[ClassID] = Usage; 6457 else 6458 Invariant[ClassID] += Usage; 6459 } 6460 6461 LLVM_DEBUG({ 6462 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6463 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6464 << " item\n"; 6465 for (const auto &pair : MaxUsages[i]) { 6466 dbgs() << "LV(REG): RegisterClass: " 6467 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6468 << " registers\n"; 6469 } 6470 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6471 << " item\n"; 6472 for (const auto &pair : Invariant) { 6473 dbgs() << "LV(REG): RegisterClass: " 6474 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6475 << " registers\n"; 6476 } 6477 }); 6478 6479 RU.LoopInvariantRegs = Invariant; 6480 RU.MaxLocalUsers = MaxUsages[i]; 6481 RUs[i] = RU; 6482 } 6483 6484 return RUs; 6485 } 6486 6487 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6488 // TODO: Cost model for emulated masked load/store is completely 6489 // broken. This hack guides the cost model to use an artificially 6490 // high enough value to practically disable vectorization with such 6491 // operations, except where previously deployed legality hack allowed 6492 // using very low cost values. This is to avoid regressions coming simply 6493 // from moving "masked load/store" check from legality to cost model. 6494 // Masked Load/Gather emulation was previously never allowed. 6495 // Limited number of Masked Store/Scatter emulation was allowed. 6496 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6497 return isa<LoadInst>(I) || 6498 (isa<StoreInst>(I) && 6499 NumPredStores > NumberOfStoresToPredicate); 6500 } 6501 6502 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6503 // If we aren't vectorizing the loop, or if we've already collected the 6504 // instructions to scalarize, there's nothing to do. Collection may already 6505 // have occurred if we have a user-selected VF and are now computing the 6506 // expected cost for interleaving. 6507 if (VF.isScalar() || VF.isZero() || 6508 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6509 return; 6510 6511 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6512 // not profitable to scalarize any instructions, the presence of VF in the 6513 // map will indicate that we've analyzed it already. 6514 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6515 6516 // Find all the instructions that are scalar with predication in the loop and 6517 // determine if it would be better to not if-convert the blocks they are in. 6518 // If so, we also record the instructions to scalarize. 6519 for (BasicBlock *BB : TheLoop->blocks()) { 6520 if (!blockNeedsPredication(BB)) 6521 continue; 6522 for (Instruction &I : *BB) 6523 if (isScalarWithPredication(&I)) { 6524 ScalarCostsTy ScalarCosts; 6525 // Do not apply discount logic if hacked cost is needed 6526 // for emulated masked memrefs. 6527 if (!useEmulatedMaskMemRefHack(&I) && 6528 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6529 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6530 // Remember that BB will remain after vectorization. 
6531 PredicatedBBsAfterVectorization.insert(BB); 6532 } 6533 } 6534 } 6535 6536 int LoopVectorizationCostModel::computePredInstDiscount( 6537 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6538 assert(!isUniformAfterVectorization(PredInst, VF) && 6539 "Instruction marked uniform-after-vectorization will be predicated"); 6540 6541 // Initialize the discount to zero, meaning that the scalar version and the 6542 // vector version cost the same. 6543 InstructionCost Discount = 0; 6544 6545 // Holds instructions to analyze. The instructions we visit are mapped in 6546 // ScalarCosts. Those instructions are the ones that would be scalarized if 6547 // we find that the scalar version costs less. 6548 SmallVector<Instruction *, 8> Worklist; 6549 6550 // Returns true if the given instruction can be scalarized. 6551 auto canBeScalarized = [&](Instruction *I) -> bool { 6552 // We only attempt to scalarize instructions forming a single-use chain 6553 // from the original predicated block that would otherwise be vectorized. 6554 // Although not strictly necessary, we give up on instructions we know will 6555 // already be scalar to avoid traversing chains that are unlikely to be 6556 // beneficial. 6557 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6558 isScalarAfterVectorization(I, VF)) 6559 return false; 6560 6561 // If the instruction is scalar with predication, it will be analyzed 6562 // separately. We ignore it within the context of PredInst. 6563 if (isScalarWithPredication(I)) 6564 return false; 6565 6566 // If any of the instruction's operands are uniform after vectorization, 6567 // the instruction cannot be scalarized. This prevents, for example, a 6568 // masked load from being scalarized. 6569 // 6570 // We assume we will only emit a value for lane zero of an instruction 6571 // marked uniform after vectorization, rather than VF identical values. 6572 // Thus, if we scalarize an instruction that uses a uniform, we would 6573 // create uses of values corresponding to the lanes we aren't emitting code 6574 // for. This behavior can be changed by allowing getScalarValue to clone 6575 // the lane zero values for uniforms rather than asserting. 6576 for (Use &U : I->operands()) 6577 if (auto *J = dyn_cast<Instruction>(U.get())) 6578 if (isUniformAfterVectorization(J, VF)) 6579 return false; 6580 6581 // Otherwise, we can scalarize the instruction. 6582 return true; 6583 }; 6584 6585 // Compute the expected cost discount from scalarizing the entire expression 6586 // feeding the predicated instruction. We currently only consider expressions 6587 // that are single-use instruction chains. 6588 Worklist.push_back(PredInst); 6589 while (!Worklist.empty()) { 6590 Instruction *I = Worklist.pop_back_val(); 6591 6592 // If we've already analyzed the instruction, there's nothing to do. 6593 if (ScalarCosts.find(I) != ScalarCosts.end()) 6594 continue; 6595 6596 // Compute the cost of the vector instruction. Note that this cost already 6597 // includes the scalarization overhead of the predicated instruction. 6598 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6599 6600 // Compute the cost of the scalarized instruction. This cost is the cost of 6601 // the instruction as if it wasn't if-converted and instead remained in the 6602 // predicated block. We will scale this cost by block probability after 6603 // computing the scalarization overhead. 
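// For example, with VF == 4 the scalar cost below starts out as four times
// the cost of one scalar copy of I and is later divided by
// getReciprocalPredBlockProb(), since the predicated block only executes for
// a fraction of the iterations.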
6604 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6605 InstructionCost ScalarCost = 6606 VF.getKnownMinValue() * 6607 getInstructionCost(I, ElementCount::getFixed(1)).first; 6608 6609 // Compute the scalarization overhead of needed insertelement instructions 6610 // and phi nodes. 6611 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6612 ScalarCost += TTI.getScalarizationOverhead( 6613 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6614 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6615 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6616 ScalarCost += 6617 VF.getKnownMinValue() * 6618 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6619 } 6620 6621 // Compute the scalarization overhead of needed extractelement 6622 // instructions. For each of the instruction's operands, if the operand can 6623 // be scalarized, add it to the worklist; otherwise, account for the 6624 // overhead. 6625 for (Use &U : I->operands()) 6626 if (auto *J = dyn_cast<Instruction>(U.get())) { 6627 assert(VectorType::isValidElementType(J->getType()) && 6628 "Instruction has non-scalar type"); 6629 if (canBeScalarized(J)) 6630 Worklist.push_back(J); 6631 else if (needsExtract(J, VF)) { 6632 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6633 ScalarCost += TTI.getScalarizationOverhead( 6634 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6635 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6636 } 6637 } 6638 6639 // Scale the total scalar cost by block probability. 6640 ScalarCost /= getReciprocalPredBlockProb(); 6641 6642 // Compute the discount. A non-negative discount means the vector version 6643 // of the instruction costs more, and scalarizing would be beneficial. 6644 Discount += VectorCost - ScalarCost; 6645 ScalarCosts[I] = ScalarCost; 6646 } 6647 6648 return *Discount.getValue(); 6649 } 6650 6651 LoopVectorizationCostModel::VectorizationCostTy 6652 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6653 VectorizationCostTy Cost; 6654 6655 // For each block. 6656 for (BasicBlock *BB : TheLoop->blocks()) { 6657 VectorizationCostTy BlockCost; 6658 6659 // For each instruction in the old loop. 6660 for (Instruction &I : BB->instructionsWithoutDebug()) { 6661 // Skip ignored values. 6662 if (ValuesToIgnore.count(&I) || 6663 (VF.isVector() && VecValuesToIgnore.count(&I))) 6664 continue; 6665 6666 VectorizationCostTy C = getInstructionCost(&I, VF); 6667 6668 // Check if we should override the cost. 6669 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6670 C.first = InstructionCost(ForceTargetInstructionCost); 6671 6672 BlockCost.first += C.first; 6673 BlockCost.second |= C.second; 6674 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6675 << " for VF " << VF << " For instruction: " << I 6676 << '\n'); 6677 } 6678 6679 // If we are vectorizing a predicated block, it will have been 6680 // if-converted. This means that the block's instructions (aside from 6681 // stores and instructions that may divide by zero) will now be 6682 // unconditionally executed. For the scalar case, we may not always execute 6683 // the predicated block, if it is an if-else block. Thus, scale the block's 6684 // cost by the probability of executing it. blockNeedsPredication from 6685 // Legal is used so as to not include all blocks in tail folded loops. 
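// For example, a predicated block assumed to execute half of the time
// contributes only half of its scalar cost to the estimate.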
6686 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6687 BlockCost.first /= getReciprocalPredBlockProb(); 6688 6689 Cost.first += BlockCost.first; 6690 Cost.second |= BlockCost.second; 6691 } 6692 6693 return Cost; 6694 } 6695 6696 /// Gets Address Access SCEV after verifying that the access pattern 6697 /// is loop invariant except the induction variable dependence. 6698 /// 6699 /// This SCEV can be sent to the Target in order to estimate the address 6700 /// calculation cost. 6701 static const SCEV *getAddressAccessSCEV( 6702 Value *Ptr, 6703 LoopVectorizationLegality *Legal, 6704 PredicatedScalarEvolution &PSE, 6705 const Loop *TheLoop) { 6706 6707 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6708 if (!Gep) 6709 return nullptr; 6710 6711 // We are looking for a gep with all loop invariant indices except for one 6712 // which should be an induction variable. 6713 auto SE = PSE.getSE(); 6714 unsigned NumOperands = Gep->getNumOperands(); 6715 for (unsigned i = 1; i < NumOperands; ++i) { 6716 Value *Opd = Gep->getOperand(i); 6717 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6718 !Legal->isInductionVariable(Opd)) 6719 return nullptr; 6720 } 6721 6722 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6723 return PSE.getSCEV(Ptr); 6724 } 6725 6726 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6727 return Legal->hasStride(I->getOperand(0)) || 6728 Legal->hasStride(I->getOperand(1)); 6729 } 6730 6731 InstructionCost 6732 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6733 ElementCount VF) { 6734 assert(VF.isVector() && 6735 "Scalarization cost of instruction implies vectorization."); 6736 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6737 Type *ValTy = getMemInstValueType(I); 6738 auto SE = PSE.getSE(); 6739 6740 unsigned AS = getLoadStoreAddressSpace(I); 6741 Value *Ptr = getLoadStorePointerOperand(I); 6742 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6743 6744 // Figure out whether the access is strided and get the stride value 6745 // if it's known in compile time 6746 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6747 6748 // Get the cost of the scalar memory instruction and address computation. 6749 InstructionCost Cost = 6750 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6751 6752 // Don't pass *I here, since it is scalar but will actually be part of a 6753 // vectorized loop where the user of it is a vectorized instruction. 6754 const Align Alignment = getLoadStoreAlignment(I); 6755 Cost += VF.getKnownMinValue() * 6756 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6757 AS, TTI::TCK_RecipThroughput); 6758 6759 // Get the overhead of the extractelement and insertelement instructions 6760 // we might create due to scalarization. 6761 Cost += getScalarizationOverhead(I, VF); 6762 6763 // If we have a predicated load/store, it will need extra i1 extracts and 6764 // conditional branches, but may not be executed for each vector lane. Scale 6765 // the cost by the probability of executing the predicated block. 
6766 if (isPredicatedInst(I)) { 6767 Cost /= getReciprocalPredBlockProb(); 6768 6769 // Add the cost of an i1 extract and a branch 6770 auto *Vec_i1Ty = 6771 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6772 Cost += TTI.getScalarizationOverhead( 6773 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6774 /*Insert=*/false, /*Extract=*/true); 6775 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6776 6777 if (useEmulatedMaskMemRefHack(I)) 6778 // Artificially setting to a high enough value to practically disable 6779 // vectorization with such operations. 6780 Cost = 3000000; 6781 } 6782 6783 return Cost; 6784 } 6785 6786 InstructionCost 6787 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6788 ElementCount VF) { 6789 Type *ValTy = getMemInstValueType(I); 6790 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6791 Value *Ptr = getLoadStorePointerOperand(I); 6792 unsigned AS = getLoadStoreAddressSpace(I); 6793 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6794 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6795 6796 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6797 "Stride should be 1 or -1 for consecutive memory access"); 6798 const Align Alignment = getLoadStoreAlignment(I); 6799 InstructionCost Cost = 0; 6800 if (Legal->isMaskRequired(I)) 6801 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6802 CostKind); 6803 else 6804 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6805 CostKind, I); 6806 6807 bool Reverse = ConsecutiveStride < 0; 6808 if (Reverse) 6809 Cost += 6810 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6811 return Cost; 6812 } 6813 6814 InstructionCost 6815 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6816 ElementCount VF) { 6817 assert(Legal->isUniformMemOp(*I)); 6818 6819 Type *ValTy = getMemInstValueType(I); 6820 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6821 const Align Alignment = getLoadStoreAlignment(I); 6822 unsigned AS = getLoadStoreAddressSpace(I); 6823 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6824 if (isa<LoadInst>(I)) { 6825 return TTI.getAddressComputationCost(ValTy) + 6826 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6827 CostKind) + 6828 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6829 } 6830 StoreInst *SI = cast<StoreInst>(I); 6831 6832 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6833 return TTI.getAddressComputationCost(ValTy) + 6834 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6835 CostKind) + 6836 (isLoopInvariantStoreValue 6837 ? 
0 6838 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6839 VF.getKnownMinValue() - 1)); 6840 } 6841 6842 InstructionCost 6843 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6844 ElementCount VF) { 6845 Type *ValTy = getMemInstValueType(I); 6846 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6847 const Align Alignment = getLoadStoreAlignment(I); 6848 const Value *Ptr = getLoadStorePointerOperand(I); 6849 6850 return TTI.getAddressComputationCost(VectorTy) + 6851 TTI.getGatherScatterOpCost( 6852 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6853 TargetTransformInfo::TCK_RecipThroughput, I); 6854 } 6855 6856 InstructionCost 6857 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6858 ElementCount VF) { 6859 // TODO: Once we have support for interleaving with scalable vectors 6860 // we can calculate the cost properly here. 6861 if (VF.isScalable()) 6862 return InstructionCost::getInvalid(); 6863 6864 Type *ValTy = getMemInstValueType(I); 6865 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6866 unsigned AS = getLoadStoreAddressSpace(I); 6867 6868 auto Group = getInterleavedAccessGroup(I); 6869 assert(Group && "Fail to get an interleaved access group."); 6870 6871 unsigned InterleaveFactor = Group->getFactor(); 6872 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6873 6874 // Holds the indices of existing members in an interleaved load group. 6875 // An interleaved store group doesn't need this as it doesn't allow gaps. 6876 SmallVector<unsigned, 4> Indices; 6877 if (isa<LoadInst>(I)) { 6878 for (unsigned i = 0; i < InterleaveFactor; i++) 6879 if (Group->getMember(i)) 6880 Indices.push_back(i); 6881 } 6882 6883 // Calculate the cost of the whole interleaved group. 6884 bool UseMaskForGaps = 6885 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6886 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6887 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6888 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6889 6890 if (Group->isReverse()) { 6891 // TODO: Add support for reversed masked interleaved access. 6892 assert(!Legal->isMaskRequired(I) && 6893 "Reverse masked interleaved access not supported."); 6894 Cost += 6895 Group->getNumMembers() * 6896 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6897 } 6898 return Cost; 6899 } 6900 6901 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 6902 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6903 // Early exit for no inloop reductions 6904 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6905 return InstructionCost::getInvalid(); 6906 auto *VectorTy = cast<VectorType>(Ty); 6907 6908 // We are looking for a pattern of, and finding the minimal acceptable cost: 6909 // reduce(mul(ext(A), ext(B))) or 6910 // reduce(mul(A, B)) or 6911 // reduce(ext(A)) or 6912 // reduce(A). 6913 // The basic idea is that we walk down the tree to do that, finding the root 6914 // reduction instruction in InLoopReductionImmediateChains. From there we find 6915 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6916 // of the components. If the reduction cost is lower then we return it for the 6917 // reduction instruction and 0 for the other instructions in the pattern. If 6918 // it is not we return an invalid cost specifying the orignal cost method 6919 // should be used. 
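// For example (an illustrative source-level sketch), a loop body such as
//   sum += (int)a[i] * (int)b[i];   // a and b narrower than the accumulator
// appears as add(reduction-phi, mul(sext(A), sext(B))); some targets can
// lower that as a single extending multiply-accumulate reduction, which is
// cheaper than costing the extends, the multiply and the add separately.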
6920 Instruction *RetI = I; 6921 if ((RetI->getOpcode() == Instruction::SExt || 6922 RetI->getOpcode() == Instruction::ZExt)) { 6923 if (!RetI->hasOneUser()) 6924 return InstructionCost::getInvalid(); 6925 RetI = RetI->user_back(); 6926 } 6927 if (RetI->getOpcode() == Instruction::Mul && 6928 RetI->user_back()->getOpcode() == Instruction::Add) { 6929 if (!RetI->hasOneUser()) 6930 return InstructionCost::getInvalid(); 6931 RetI = RetI->user_back(); 6932 } 6933 6934 // Test if the found instruction is a reduction, and if not return an invalid 6935 // cost specifying the parent to use the original cost modelling. 6936 if (!InLoopReductionImmediateChains.count(RetI)) 6937 return InstructionCost::getInvalid(); 6938 6939 // Find the reduction this chain is a part of and calculate the basic cost of 6940 // the reduction on its own. 6941 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6942 Instruction *ReductionPhi = LastChain; 6943 while (!isa<PHINode>(ReductionPhi)) 6944 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6945 6946 RecurrenceDescriptor RdxDesc = 6947 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6948 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6949 VectorTy, false, CostKind); 6950 6951 // Get the operand that was not the reduction chain and match it to one of the 6952 // patterns, returning the better cost if it is found. 6953 Instruction *RedOp = RetI->getOperand(1) == LastChain 6954 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6955 : dyn_cast<Instruction>(RetI->getOperand(1)); 6956 6957 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6958 6959 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6960 !TheLoop->isLoopInvariant(RedOp)) { 6961 bool IsUnsigned = isa<ZExtInst>(RedOp); 6962 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6963 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6964 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6965 CostKind); 6966 6967 unsigned ExtCost = 6968 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6969 TTI::CastContextHint::None, CostKind, RedOp); 6970 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6971 return I == RetI ? *RedCost.getValue() : 0; 6972 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 6973 Instruction *Mul = RedOp; 6974 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 6975 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 6976 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 6977 Op0->getOpcode() == Op1->getOpcode() && 6978 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6979 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6980 bool IsUnsigned = isa<ZExtInst>(Op0); 6981 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6982 // reduce(mul(ext, ext)) 6983 unsigned ExtCost = 6984 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 6985 TTI::CastContextHint::None, CostKind, Op0); 6986 InstructionCost MulCost = 6987 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6988 6989 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6990 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6991 CostKind); 6992 6993 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 6994 return I == RetI ? 
*RedCost.getValue() : 0; 6995 } else { 6996 InstructionCost MulCost = 6997 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6998 6999 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7000 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7001 CostKind); 7002 7003 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7004 return I == RetI ? *RedCost.getValue() : 0; 7005 } 7006 } 7007 7008 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7009 } 7010 7011 InstructionCost 7012 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7013 ElementCount VF) { 7014 // Calculate scalar cost only. Vectorization cost should be ready at this 7015 // moment. 7016 if (VF.isScalar()) { 7017 Type *ValTy = getMemInstValueType(I); 7018 const Align Alignment = getLoadStoreAlignment(I); 7019 unsigned AS = getLoadStoreAddressSpace(I); 7020 7021 return TTI.getAddressComputationCost(ValTy) + 7022 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7023 TTI::TCK_RecipThroughput, I); 7024 } 7025 return getWideningCost(I, VF); 7026 } 7027 7028 LoopVectorizationCostModel::VectorizationCostTy 7029 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7030 ElementCount VF) { 7031 // If we know that this instruction will remain uniform, check the cost of 7032 // the scalar version. 7033 if (isUniformAfterVectorization(I, VF)) 7034 VF = ElementCount::getFixed(1); 7035 7036 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7037 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7038 7039 // Forced scalars do not have any scalarization overhead. 7040 auto ForcedScalar = ForcedScalars.find(VF); 7041 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7042 auto InstSet = ForcedScalar->second; 7043 if (InstSet.count(I)) 7044 return VectorizationCostTy( 7045 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7046 VF.getKnownMinValue()), 7047 false); 7048 } 7049 7050 Type *VectorTy; 7051 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7052 7053 bool TypeNotScalarized = 7054 VF.isVector() && VectorTy->isVectorTy() && 7055 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7056 return VectorizationCostTy(C, TypeNotScalarized); 7057 } 7058 7059 InstructionCost 7060 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7061 ElementCount VF) const { 7062 7063 if (VF.isScalable()) 7064 return InstructionCost::getInvalid(); 7065 7066 if (VF.isScalar()) 7067 return 0; 7068 7069 InstructionCost Cost = 0; 7070 Type *RetTy = ToVectorTy(I->getType(), VF); 7071 if (!RetTy->isVoidTy() && 7072 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7073 Cost += TTI.getScalarizationOverhead( 7074 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7075 true, false); 7076 7077 // Some targets keep addresses scalar. 7078 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7079 return Cost; 7080 7081 // Some targets support efficient element stores. 7082 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7083 return Cost; 7084 7085 // Collect operands to consider. 7086 CallInst *CI = dyn_cast<CallInst>(I); 7087 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7088 7089 // Skip operands that do not require extraction/scalarization and do not incur 7090 // any overhead. 
7091 SmallVector<Type *> Tys; 7092 for (auto *V : filterExtractingOperands(Ops, VF)) 7093 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7094 return Cost + TTI.getOperandsScalarizationOverhead( 7095 filterExtractingOperands(Ops, VF), Tys); 7096 } 7097 7098 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7099 if (VF.isScalar()) 7100 return; 7101 NumPredStores = 0; 7102 for (BasicBlock *BB : TheLoop->blocks()) { 7103 // For each instruction in the old loop. 7104 for (Instruction &I : *BB) { 7105 Value *Ptr = getLoadStorePointerOperand(&I); 7106 if (!Ptr) 7107 continue; 7108 7109 // TODO: We should generate better code and update the cost model for 7110 // predicated uniform stores. Today they are treated as any other 7111 // predicated store (see added test cases in 7112 // invariant-store-vectorization.ll). 7113 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7114 NumPredStores++; 7115 7116 if (Legal->isUniformMemOp(I)) { 7117 // TODO: Avoid replicating loads and stores instead of 7118 // relying on instcombine to remove them. 7119 // Load: Scalar load + broadcast 7120 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7121 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7122 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7123 continue; 7124 } 7125 7126 // We assume that widening is the best solution when possible. 7127 if (memoryInstructionCanBeWidened(&I, VF)) { 7128 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7129 int ConsecutiveStride = 7130 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7131 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7132 "Expected consecutive stride."); 7133 InstWidening Decision = 7134 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7135 setWideningDecision(&I, VF, Decision, Cost); 7136 continue; 7137 } 7138 7139 // Choose between Interleaving, Gather/Scatter or Scalarization. 7140 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7141 unsigned NumAccesses = 1; 7142 if (isAccessInterleaved(&I)) { 7143 auto Group = getInterleavedAccessGroup(&I); 7144 assert(Group && "Fail to get an interleaved access group."); 7145 7146 // Make one decision for the whole group. 7147 if (getWideningDecision(&I, VF) != CM_Unknown) 7148 continue; 7149 7150 NumAccesses = Group->getNumMembers(); 7151 if (interleavedAccessCanBeWidened(&I, VF)) 7152 InterleaveCost = getInterleaveGroupCost(&I, VF); 7153 } 7154 7155 InstructionCost GatherScatterCost = 7156 isLegalGatherOrScatter(&I) 7157 ? getGatherScatterCost(&I, VF) * NumAccesses 7158 : InstructionCost::getInvalid(); 7159 7160 InstructionCost ScalarizationCost = 7161 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses 7162 : InstructionCost::getInvalid(); 7163 7164 // Choose better solution for the current VF, 7165 // write down this decision and use it during vectorization. 7166 InstructionCost Cost; 7167 InstWidening Decision; 7168 if (InterleaveCost <= GatherScatterCost && 7169 InterleaveCost < ScalarizationCost) { 7170 Decision = CM_Interleave; 7171 Cost = InterleaveCost; 7172 } else if (GatherScatterCost < ScalarizationCost) { 7173 Decision = CM_GatherScatter; 7174 Cost = GatherScatterCost; 7175 } else { 7176 assert(!VF.isScalable() && 7177 "We cannot yet scalarise for scalable vectors"); 7178 Decision = CM_Scalarize; 7179 Cost = ScalarizationCost; 7180 } 7181 // If the instructions belongs to an interleave group, the whole group 7182 // receives the same decision. 
The whole group receives the cost, but 7183 // the cost will actually be assigned to one instruction. 7184 if (auto Group = getInterleavedAccessGroup(&I)) 7185 setWideningDecision(Group, VF, Decision, Cost); 7186 else 7187 setWideningDecision(&I, VF, Decision, Cost); 7188 } 7189 } 7190 7191 // Make sure that any load of address and any other address computation 7192 // remains scalar unless there is gather/scatter support. This avoids 7193 // inevitable extracts into address registers, and also has the benefit of 7194 // activating LSR more, since that pass can't optimize vectorized 7195 // addresses. 7196 if (TTI.prefersVectorizedAddressing()) 7197 return; 7198 7199 // Start with all scalar pointer uses. 7200 SmallPtrSet<Instruction *, 8> AddrDefs; 7201 for (BasicBlock *BB : TheLoop->blocks()) 7202 for (Instruction &I : *BB) { 7203 Instruction *PtrDef = 7204 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7205 if (PtrDef && TheLoop->contains(PtrDef) && 7206 getWideningDecision(&I, VF) != CM_GatherScatter) 7207 AddrDefs.insert(PtrDef); 7208 } 7209 7210 // Add all instructions used to generate the addresses. 7211 SmallVector<Instruction *, 4> Worklist; 7212 append_range(Worklist, AddrDefs); 7213 while (!Worklist.empty()) { 7214 Instruction *I = Worklist.pop_back_val(); 7215 for (auto &Op : I->operands()) 7216 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7217 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7218 AddrDefs.insert(InstOp).second) 7219 Worklist.push_back(InstOp); 7220 } 7221 7222 for (auto *I : AddrDefs) { 7223 if (isa<LoadInst>(I)) { 7224 // Setting the desired widening decision should ideally be handled in 7225 // by cost functions, but since this involves the task of finding out 7226 // if the loaded register is involved in an address computation, it is 7227 // instead changed here when we know this is the case. 7228 InstWidening Decision = getWideningDecision(I, VF); 7229 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7230 // Scalarize a widened load of address. 7231 setWideningDecision( 7232 I, VF, CM_Scalarize, 7233 (VF.getKnownMinValue() * 7234 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7235 else if (auto Group = getInterleavedAccessGroup(I)) { 7236 // Scalarize an interleave group of address loads. 7237 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7238 if (Instruction *Member = Group->getMember(I)) 7239 setWideningDecision( 7240 Member, VF, CM_Scalarize, 7241 (VF.getKnownMinValue() * 7242 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7243 } 7244 } 7245 } else 7246 // Make sure I gets scalarized and a cost estimate without 7247 // scalarization overhead. 7248 ForcedScalars[VF].insert(I); 7249 } 7250 } 7251 7252 InstructionCost 7253 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7254 Type *&VectorTy) { 7255 Type *RetTy = I->getType(); 7256 if (canTruncateToMinimalBitwidth(I, VF)) 7257 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7258 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7259 auto SE = PSE.getSE(); 7260 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7261 7262 // TODO: We need to estimate the cost of intrinsic calls. 7263 switch (I->getOpcode()) { 7264 case Instruction::GetElementPtr: 7265 // We mark this instruction as zero-cost because the cost of GEPs in 7266 // vectorized code depends on whether the corresponding memory instruction 7267 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7268 // instruction cost. 7269 return 0; 7270 case Instruction::Br: { 7271 // In cases of scalarized and predicated instructions, there will be VF 7272 // predicated blocks in the vectorized loop. Each branch around these 7273 // blocks requires also an extract of its vector compare i1 element. 7274 bool ScalarPredicatedBB = false; 7275 BranchInst *BI = cast<BranchInst>(I); 7276 if (VF.isVector() && BI->isConditional() && 7277 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7278 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7279 ScalarPredicatedBB = true; 7280 7281 if (ScalarPredicatedBB) { 7282 // Return cost for branches around scalarized and predicated blocks. 7283 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7284 auto *Vec_i1Ty = 7285 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7286 return (TTI.getScalarizationOverhead( 7287 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7288 false, true) + 7289 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7290 VF.getKnownMinValue())); 7291 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7292 // The back-edge branch will remain, as will all scalar branches. 7293 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7294 else 7295 // This branch will be eliminated by if-conversion. 7296 return 0; 7297 // Note: We currently assume zero cost for an unconditional branch inside 7298 // a predicated block since it will become a fall-through, although we 7299 // may decide in the future to call TTI for all branches. 7300 } 7301 case Instruction::PHI: { 7302 auto *Phi = cast<PHINode>(I); 7303 7304 // First-order recurrences are replaced by vector shuffles inside the loop. 7305 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7306 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7307 return TTI.getShuffleCost( 7308 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7309 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7310 7311 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7312 // converted into select instructions. We require N - 1 selects per phi 7313 // node, where N is the number of incoming values. 7314 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7315 return (Phi->getNumIncomingValues() - 1) * 7316 TTI.getCmpSelInstrCost( 7317 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7318 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7319 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7320 7321 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7322 } 7323 case Instruction::UDiv: 7324 case Instruction::SDiv: 7325 case Instruction::URem: 7326 case Instruction::SRem: 7327 // If we have a predicated instruction, it may not be executed for each 7328 // vector lane. Get the scalarization cost and scale this amount by the 7329 // probability of executing the predicated block. If the instruction is not 7330 // predicated, we fall through to the next case. 7331 if (VF.isVector() && isScalarWithPredication(I)) { 7332 InstructionCost Cost = 0; 7333 7334 // These instructions have a non-void type, so account for the phi nodes 7335 // that we will create. This cost is likely to be zero. The phi node 7336 // cost, if any, should be scaled by the block probability because it 7337 // models a copy at the end of each predicated block. 
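// For example (roughly): for a predicated udiv at VF = 4 we add up 4 phi
// copies, 4 scalar udivs and the insert/extract overhead, and the final
// division by getReciprocalPredBlockProb() below scales that sum by the
// assumed probability of the predicated block executing.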
7338 Cost += VF.getKnownMinValue() * 7339 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7340 7341 // The cost of the non-predicated instruction. 7342 Cost += VF.getKnownMinValue() * 7343 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7344 7345 // The cost of insertelement and extractelement instructions needed for 7346 // scalarization. 7347 Cost += getScalarizationOverhead(I, VF); 7348 7349 // Scale the cost by the probability of executing the predicated blocks. 7350 // This assumes the predicated block for each vector lane is equally 7351 // likely. 7352 return Cost / getReciprocalPredBlockProb(); 7353 } 7354 LLVM_FALLTHROUGH; 7355 case Instruction::Add: 7356 case Instruction::FAdd: 7357 case Instruction::Sub: 7358 case Instruction::FSub: 7359 case Instruction::Mul: 7360 case Instruction::FMul: 7361 case Instruction::FDiv: 7362 case Instruction::FRem: 7363 case Instruction::Shl: 7364 case Instruction::LShr: 7365 case Instruction::AShr: 7366 case Instruction::And: 7367 case Instruction::Or: 7368 case Instruction::Xor: { 7369 // Since we will replace the stride by 1 the multiplication should go away. 7370 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7371 return 0; 7372 7373 // Detect reduction patterns 7374 InstructionCost RedCost; 7375 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7376 .isValid()) 7377 return RedCost; 7378 7379 // Certain instructions can be cheaper to vectorize if they have a constant 7380 // second vector operand. One example of this are shifts on x86. 7381 Value *Op2 = I->getOperand(1); 7382 TargetTransformInfo::OperandValueProperties Op2VP; 7383 TargetTransformInfo::OperandValueKind Op2VK = 7384 TTI.getOperandInfo(Op2, Op2VP); 7385 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7386 Op2VK = TargetTransformInfo::OK_UniformValue; 7387 7388 SmallVector<const Value *, 4> Operands(I->operand_values()); 7389 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7390 return N * TTI.getArithmeticInstrCost( 7391 I->getOpcode(), VectorTy, CostKind, 7392 TargetTransformInfo::OK_AnyValue, 7393 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7394 } 7395 case Instruction::FNeg: { 7396 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7397 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7398 return N * TTI.getArithmeticInstrCost( 7399 I->getOpcode(), VectorTy, CostKind, 7400 TargetTransformInfo::OK_AnyValue, 7401 TargetTransformInfo::OK_AnyValue, 7402 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7403 I->getOperand(0), I); 7404 } 7405 case Instruction::Select: { 7406 SelectInst *SI = cast<SelectInst>(I); 7407 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7408 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7409 Type *CondTy = SI->getCondition()->getType(); 7410 if (!ScalarCond) 7411 CondTy = VectorType::get(CondTy, VF); 7412 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7413 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7414 } 7415 case Instruction::ICmp: 7416 case Instruction::FCmp: { 7417 Type *ValTy = I->getOperand(0)->getType(); 7418 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7419 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7420 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7421 VectorTy = ToVectorTy(ValTy, VF); 7422 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7423 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7424 } 7425 case Instruction::Store: 7426 case Instruction::Load: { 7427 ElementCount Width = VF; 7428 if (Width.isVector()) { 7429 InstWidening Decision = getWideningDecision(I, Width); 7430 assert(Decision != CM_Unknown && 7431 "CM decision should be taken at this point"); 7432 if (Decision == CM_Scalarize) 7433 Width = ElementCount::getFixed(1); 7434 } 7435 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7436 return getMemoryInstructionCost(I, VF); 7437 } 7438 case Instruction::ZExt: 7439 case Instruction::SExt: 7440 case Instruction::FPToUI: 7441 case Instruction::FPToSI: 7442 case Instruction::FPExt: 7443 case Instruction::PtrToInt: 7444 case Instruction::IntToPtr: 7445 case Instruction::SIToFP: 7446 case Instruction::UIToFP: 7447 case Instruction::Trunc: 7448 case Instruction::FPTrunc: 7449 case Instruction::BitCast: { 7450 // Computes the CastContextHint from a Load/Store instruction. 7451 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7452 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7453 "Expected a load or a store!"); 7454 7455 if (VF.isScalar() || !TheLoop->contains(I)) 7456 return TTI::CastContextHint::Normal; 7457 7458 switch (getWideningDecision(I, VF)) { 7459 case LoopVectorizationCostModel::CM_GatherScatter: 7460 return TTI::CastContextHint::GatherScatter; 7461 case LoopVectorizationCostModel::CM_Interleave: 7462 return TTI::CastContextHint::Interleave; 7463 case LoopVectorizationCostModel::CM_Scalarize: 7464 case LoopVectorizationCostModel::CM_Widen: 7465 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7466 : TTI::CastContextHint::Normal; 7467 case LoopVectorizationCostModel::CM_Widen_Reverse: 7468 return TTI::CastContextHint::Reversed; 7469 case LoopVectorizationCostModel::CM_Unknown: 7470 llvm_unreachable("Instr did not go through cost modelling?"); 7471 } 7472 7473 llvm_unreachable("Unhandled case!"); 7474 }; 7475 7476 unsigned Opcode = I->getOpcode(); 7477 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7478 // For Trunc, the context is the only user, which must be a StoreInst. 
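// For example (illustrative), given
//   %t = trunc i32 %v to i16
//   store i16 %t, i16* %p
// the store's widening decision (widened, reversed, gather/scatter, masked or
// scalarized) determines the CastContextHint used to cost the trunc.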
7479 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7480 if (I->hasOneUse()) 7481 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7482 CCH = ComputeCCH(Store); 7483 } 7484 // For Z/Sext, the context is the operand, which must be a LoadInst. 7485 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7486 Opcode == Instruction::FPExt) { 7487 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7488 CCH = ComputeCCH(Load); 7489 } 7490 7491 // We optimize the truncation of induction variables having constant 7492 // integer steps. The cost of these truncations is the same as the scalar 7493 // operation. 7494 if (isOptimizableIVTruncate(I, VF)) { 7495 auto *Trunc = cast<TruncInst>(I); 7496 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7497 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7498 } 7499 7500 // Detect reduction patterns 7501 InstructionCost RedCost; 7502 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7503 .isValid()) 7504 return RedCost; 7505 7506 Type *SrcScalarTy = I->getOperand(0)->getType(); 7507 Type *SrcVecTy = 7508 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7509 if (canTruncateToMinimalBitwidth(I, VF)) { 7510 // This cast is going to be shrunk. This may remove the cast or it might 7511 // turn it into slightly different cast. For example, if MinBW == 16, 7512 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7513 // 7514 // Calculate the modified src and dest types. 7515 Type *MinVecTy = VectorTy; 7516 if (Opcode == Instruction::Trunc) { 7517 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7518 VectorTy = 7519 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7520 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7521 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7522 VectorTy = 7523 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7524 } 7525 } 7526 7527 unsigned N; 7528 if (isScalarAfterVectorization(I, VF)) { 7529 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7530 N = VF.getKnownMinValue(); 7531 } else 7532 N = 1; 7533 return N * 7534 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7535 } 7536 case Instruction::Call: { 7537 bool NeedToScalarize; 7538 CallInst *CI = cast<CallInst>(I); 7539 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7540 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7541 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7542 return std::min(CallCost, IntrinsicCost); 7543 } 7544 return CallCost; 7545 } 7546 case Instruction::ExtractValue: 7547 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7548 default: 7549 // The cost of executing VF copies of the scalar instruction. This opcode 7550 // is unknown. Assume that it is the same as 'mul'. 7551 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7552 Instruction::Mul, VectorTy, CostKind) + 7553 getScalarizationOverhead(I, VF); 7554 } // end of switch. 
7555 } 7556 7557 char LoopVectorize::ID = 0; 7558 7559 static const char lv_name[] = "Loop Vectorization"; 7560 7561 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7562 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7563 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7564 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7565 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7566 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7567 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7568 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7569 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7570 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7571 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7572 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7573 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7574 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7575 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7576 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7577 7578 namespace llvm { 7579 7580 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7581 7582 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7583 bool VectorizeOnlyWhenForced) { 7584 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7585 } 7586 7587 } // end namespace llvm 7588 7589 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7590 // Check if the pointer operand of a load or store instruction is 7591 // consecutive. 7592 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7593 return Legal->isConsecutivePtr(Ptr); 7594 return false; 7595 } 7596 7597 void LoopVectorizationCostModel::collectValuesToIgnore() { 7598 // Ignore ephemeral values. 7599 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7600 7601 // Ignore type-promoting instructions we identified during reduction 7602 // detection. 7603 for (auto &Reduction : Legal->getReductionVars()) { 7604 RecurrenceDescriptor &RedDes = Reduction.second; 7605 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7606 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7607 } 7608 // Ignore type-casting instructions we identified during induction 7609 // detection. 7610 for (auto &Induction : Legal->getInductionVars()) { 7611 InductionDescriptor &IndDes = Induction.second; 7612 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7613 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7614 } 7615 } 7616 7617 void LoopVectorizationCostModel::collectInLoopReductions() { 7618 for (auto &Reduction : Legal->getReductionVars()) { 7619 PHINode *Phi = Reduction.first; 7620 RecurrenceDescriptor &RdxDesc = Reduction.second; 7621 7622 // We don't collect reductions that are type promoted (yet). 7623 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7624 continue; 7625 7626 // If the target would prefer this reduction to happen "in-loop", then we 7627 // want to record it as such. 7628 unsigned Opcode = RdxDesc.getOpcode(); 7629 if (!PreferInLoopReductions && 7630 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7631 TargetTransformInfo::ReductionFlags())) 7632 continue; 7633 7634 // Check that we can correctly put the reductions into the loop, by 7635 // finding the chain of operations that leads from the phi to the loop 7636 // exit value. 
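// For example, for a simple reduction loop
//   for (i = 0; i < n; i++) sum += a[i];
// the chain is just the single add between the reduction phi and the value
// fed back across the backedge; if no well-formed chain is found, the
// reduction is left as an out-of-loop reduction.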
7637 SmallVector<Instruction *, 4> ReductionOperations =
7638 RdxDesc.getReductionOpChain(Phi, TheLoop);
7639 bool InLoop = !ReductionOperations.empty();
7640 if (InLoop) {
7641 InLoopReductionChains[Phi] = ReductionOperations;
7642 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7643 Instruction *LastChain = Phi;
7644 for (auto *I : ReductionOperations) {
7645 InLoopReductionImmediateChains[I] = LastChain;
7646 LastChain = I;
7647 }
7648 }
7649 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7650 << " reduction for phi: " << *Phi << "\n");
7651 }
7652 }
7653
7654 // TODO: we could return a pair of values that specify the max VF and
7655 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7656 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7657 // doesn't have a cost model that can choose which plan to execute if
7658 // more than one is generated.
7659 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7660 LoopVectorizationCostModel &CM) {
7661 unsigned WidestType;
7662 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7663 return WidestVectorRegBits / WidestType;
7664 }
7665
7666 VectorizationFactor
7667 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7668 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7669 ElementCount VF = UserVF;
7670 // Outer loop handling: They may require CFG and instruction level
7671 // transformations before even evaluating whether vectorization is profitable.
7672 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7673 // the vectorization pipeline.
7674 if (!OrigLoop->isInnermost()) {
7675 // If the user doesn't provide a vectorization factor, determine a
7676 // reasonable one.
7677 if (UserVF.isZero()) {
7678 VF = ElementCount::getFixed(determineVPlanVF(
7679 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7680 .getFixedSize(),
7681 CM));
7682 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7683
7684 // Make sure we have a VF > 1 for stress testing.
7685 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7686 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7687 << "overriding computed VF.\n");
7688 VF = ElementCount::getFixed(4);
7689 }
7690 }
7691 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7692 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7693 "VF needs to be a power of two");
7694 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7695 << "VF " << VF << " to build VPlans.\n");
7696 buildVPlans(VF, VF);
7697
7698 // For VPlan build stress testing, we bail out after VPlan construction.
7699 if (VPlanBuildStressTest)
7700 return VectorizationFactor::Disabled();
7701
7702 return {VF, 0 /*Cost*/};
7703 }
7704
7705 LLVM_DEBUG(
7706 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7707 "VPlan-native path.\n");
7708 return VectorizationFactor::Disabled();
7709 }
7710
7711 Optional<VectorizationFactor>
7712 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7713 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7714 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7715 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
7716 return None;
7717
7718 // Invalidate interleave groups if all blocks of loop will be predicated.
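// For example (illustrative): when the tail is folded by masking on a target
// without masked interleaved load/store support, a group such as
// {a[2*i], a[2*i+1]} cannot be emitted as a single masked wide access, so its
// members are costed (and later widened) individually instead.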
7719 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7720 !useMaskedInterleavedAccesses(*TTI)) { 7721 LLVM_DEBUG( 7722 dbgs() 7723 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7724 "which requires masked-interleaved support.\n"); 7725 if (CM.InterleaveInfo.invalidateGroups()) 7726 // Invalidating interleave groups also requires invalidating all decisions 7727 // based on them, which includes widening decisions and uniform and scalar 7728 // values. 7729 CM.invalidateCostModelingDecisions(); 7730 } 7731 7732 ElementCount MaxVF = MaybeMaxVF.getValue(); 7733 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7734 7735 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7736 if (!UserVF.isZero() && 7737 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7738 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7739 // VFs here, this should be reverted to only use legal UserVFs once the 7740 // loop below supports scalable VFs. 7741 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7742 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7743 << " VF " << VF << ".\n"); 7744 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7745 "VF needs to be a power of two"); 7746 // Collect the instructions (and their associated costs) that will be more 7747 // profitable to scalarize. 7748 CM.selectUserVectorizationFactor(VF); 7749 CM.collectInLoopReductions(); 7750 buildVPlansWithVPRecipes(VF, VF); 7751 LLVM_DEBUG(printPlans(dbgs())); 7752 return {{VF, 0}}; 7753 } 7754 7755 assert(!MaxVF.isScalable() && 7756 "Scalable vectors not yet supported beyond this point"); 7757 7758 for (ElementCount VF = ElementCount::getFixed(1); 7759 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7760 // Collect Uniform and Scalar instructions after vectorization with VF. 7761 CM.collectUniformsAndScalars(VF); 7762 7763 // Collect the instructions (and their associated costs) that will be more 7764 // profitable to scalarize. 7765 if (VF.isVector()) 7766 CM.collectInstsToScalarize(VF); 7767 } 7768 7769 CM.collectInLoopReductions(); 7770 7771 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7772 LLVM_DEBUG(printPlans(dbgs())); 7773 if (MaxVF.isScalar()) 7774 return VectorizationFactor::Disabled(); 7775 7776 // Select the optimal vectorization factor. 7777 return CM.selectVectorizationFactor(MaxVF); 7778 } 7779 7780 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7781 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7782 << '\n'); 7783 BestVF = VF; 7784 BestUF = UF; 7785 7786 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7787 return !Plan->hasVF(VF); 7788 }); 7789 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7790 } 7791 7792 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7793 DominatorTree *DT) { 7794 // Perform the actual loop transformation. 7795 7796 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
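// (In rough terms, the skeleton created below consists of the minimum
// iteration count check, the runtime checks and an empty vector loop body;
// step 2 then fills that body in by executing the selected VPlan.)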
7797 assert(BestVF.hasValue() && "Vectorization Factor is missing");
7798 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7799
7800 VPTransformState State{
7801 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
7802 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7803 State.TripCount = ILV.getOrCreateTripCount(nullptr);
7804 State.CanonicalIV = ILV.Induction;
7805
7806 ILV.printDebugTracesAtStart();
7807
7808 //===------------------------------------------------===//
7809 //
7810 // Notice: any optimization or new instruction that goes
7811 // into the code below should also be implemented in
7812 // the cost model.
7813 //
7814 //===------------------------------------------------===//
7815
7816 // 2. Copy and widen instructions from the old loop into the new loop.
7817 VPlans.front()->execute(&State);
7818
7819 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7820 // predication, updating analyses.
7821 ILV.fixVectorizedLoop(State);
7822
7823 ILV.printDebugTracesAtEnd();
7824 }
7825
7826 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7827 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7828 for (const auto &Plan : VPlans)
7829 if (PrintVPlansInDotFormat)
7830 Plan->printDOT(O);
7831 else
7832 Plan->print(O);
7833 }
7834 #endif
7835
7836 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7837 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7838
7839 // We create new control flow for the vectorized loop, so the original exit
7840 // conditions will be dead after vectorization if they are only used by the
7841 // terminator.
7842 SmallVector<BasicBlock*> ExitingBlocks;
7843 OrigLoop->getExitingBlocks(ExitingBlocks);
7844 for (auto *BB : ExitingBlocks) {
7845 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7846 if (!Cmp || !Cmp->hasOneUse())
7847 continue;
7848
7849 // TODO: we should introduce a getUniqueExitingBlocks on Loop
7850 if (!DeadInstructions.insert(Cmp).second)
7851 continue;
7852
7853 // An operand of the icmp is often a dead trunc, used by IndUpdate.
7854 // TODO: can recurse through operands in general
7855 for (Value *Op : Cmp->operands()) {
7856 if (isa<TruncInst>(Op) && Op->hasOneUse())
7857 DeadInstructions.insert(cast<Instruction>(Op));
7858 }
7859 }
7860
7861 // We create new "steps" for induction variable updates to which the original
7862 // induction variables map. An original update instruction will be dead if
7863 // all its users except the induction variable are dead.
7864 auto *Latch = OrigLoop->getLoopLatch();
7865 for (auto &Induction : Legal->getInductionVars()) {
7866 PHINode *Ind = Induction.first;
7867 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7868
7869 // If the tail is to be folded by masking, the primary induction variable,
7870 // if it exists, isn't dead: it will be used for masking. Don't kill it.
7871 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7872 continue;
7873
7874 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7875 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7876 }))
7877 DeadInstructions.insert(IndUpdate);
7878
7879 // We also record as "Dead" the type-casting instructions we had identified
7880 // during induction analysis.
We don't need any handling for them in the 7881 // vectorized loop because we have proven that, under a proper runtime 7882 // test guarding the vectorized loop, the value of the phi, and the casted 7883 // value of the phi, are the same. The last instruction in this casting chain 7884 // will get its scalar/vector/widened def from the scalar/vector/widened def 7885 // of the respective phi node. Any other casts in the induction def-use chain 7886 // have no other uses outside the phi update chain, and will be ignored. 7887 InductionDescriptor &IndDes = Induction.second; 7888 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7889 DeadInstructions.insert(Casts.begin(), Casts.end()); 7890 } 7891 } 7892 7893 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7894 7895 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7896 7897 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7898 Instruction::BinaryOps BinOp) { 7899 // When unrolling and the VF is 1, we only need to add a simple scalar. 7900 Type *Ty = Val->getType(); 7901 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7902 7903 if (Ty->isFloatingPointTy()) { 7904 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7905 7906 // Floating-point operations inherit FMF via the builder's flags. 7907 Value *MulOp = Builder.CreateFMul(C, Step); 7908 return Builder.CreateBinOp(BinOp, Val, MulOp); 7909 } 7910 Constant *C = ConstantInt::get(Ty, StartIdx); 7911 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7912 } 7913 7914 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7915 SmallVector<Metadata *, 4> MDs; 7916 // Reserve first location for self reference to the LoopID metadata node. 7917 MDs.push_back(nullptr); 7918 bool IsUnrollMetadata = false; 7919 MDNode *LoopID = L->getLoopID(); 7920 if (LoopID) { 7921 // First find existing loop unrolling disable metadata. 7922 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7923 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7924 if (MD) { 7925 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7926 IsUnrollMetadata = 7927 S && S->getString().startswith("llvm.loop.unroll.disable"); 7928 } 7929 MDs.push_back(LoopID->getOperand(i)); 7930 } 7931 } 7932 7933 if (!IsUnrollMetadata) { 7934 // Add runtime unroll disable metadata. 7935 LLVMContext &Context = L->getHeader()->getContext(); 7936 SmallVector<Metadata *, 1> DisableOperands; 7937 DisableOperands.push_back( 7938 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7939 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7940 MDs.push_back(DisableNode); 7941 MDNode *NewLoopID = MDNode::get(Context, MDs); 7942 // Set operand 0 to refer to the loop id itself. 7943 NewLoopID->replaceOperandWith(0, NewLoopID); 7944 L->setLoopID(NewLoopID); 7945 } 7946 } 7947 7948 //===--------------------------------------------------------------------===// 7949 // EpilogueVectorizerMainLoop 7950 //===--------------------------------------------------------------------===// 7951 7952 /// This function is partially responsible for generating the control flow 7953 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
7954 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7955 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7956 Loop *Lp = createVectorLoopSkeleton(""); 7957 7958 // Generate the code to check the minimum iteration count of the vector 7959 // epilogue (see below). 7960 EPI.EpilogueIterationCountCheck = 7961 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7962 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7963 7964 // Generate the code to check any assumptions that we've made for SCEV 7965 // expressions. 7966 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 7967 7968 // Generate the code that checks at runtime if arrays overlap. We put the 7969 // checks into a separate block to make the more common case of few elements 7970 // faster. 7971 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7972 7973 // Generate the iteration count check for the main loop, *after* the check 7974 // for the epilogue loop, so that the path-length is shorter for the case 7975 // that goes directly through the vector epilogue. The longer-path length for 7976 // the main loop is compensated for, by the gain from vectorizing the larger 7977 // trip count. Note: the branch will get updated later on when we vectorize 7978 // the epilogue. 7979 EPI.MainLoopIterationCountCheck = 7980 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7981 7982 // Generate the induction variable. 7983 OldInduction = Legal->getPrimaryInduction(); 7984 Type *IdxTy = Legal->getWidestInductionType(); 7985 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7986 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7987 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7988 EPI.VectorTripCount = CountRoundDown; 7989 Induction = 7990 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7991 getDebugLocFromInstOrOperands(OldInduction)); 7992 7993 // Skip induction resume value creation here because they will be created in 7994 // the second pass. If we created them here, they wouldn't be used anyway, 7995 // because the vplan in the second pass still contains the inductions from the 7996 // original loop. 7997 7998 return completeLoopSkeleton(Lp, OrigLoopID); 7999 } 8000 8001 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8002 LLVM_DEBUG({ 8003 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8004 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8005 << ", Main Loop UF:" << EPI.MainLoopUF 8006 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8007 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8008 }); 8009 } 8010 8011 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8012 DEBUG_WITH_TYPE(VerboseDebug, { 8013 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8014 }); 8015 } 8016 8017 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8018 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8019 assert(L && "Expected valid Loop."); 8020 assert(Bypass && "Expected valid bypass basic block."); 8021 unsigned VFactor = 8022 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8023 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8024 Value *Count = getOrCreateTripCount(L); 8025 // Reuse existing vector loop preheader for TC checks. 8026 // Note that new preheader block is generated for vector loop. 
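// For example (illustrative): with VF = 8 and UF = 2 the comparison emitted
// below is roughly 'trip-count u< 16' (u<= when a scalar epilogue is
// required), and control branches to Bypass when it holds.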
8027 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8028 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8029 8030 // Generate code to check if the loop's trip count is less than VF * UF of the 8031 // main vector loop. 8032 auto P = 8033 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8034 8035 Value *CheckMinIters = Builder.CreateICmp( 8036 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8037 "min.iters.check"); 8038 8039 if (!ForEpilogue) 8040 TCCheckBlock->setName("vector.main.loop.iter.check"); 8041 8042 // Create new preheader for vector loop. 8043 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8044 DT, LI, nullptr, "vector.ph"); 8045 8046 if (ForEpilogue) { 8047 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8048 DT->getNode(Bypass)->getIDom()) && 8049 "TC check is expected to dominate Bypass"); 8050 8051 // Update dominator for Bypass & LoopExit. 8052 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8053 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8054 8055 LoopBypassBlocks.push_back(TCCheckBlock); 8056 8057 // Save the trip count so we don't have to regenerate it in the 8058 // vec.epilog.iter.check. This is safe to do because the trip count 8059 // generated here dominates the vector epilog iter check. 8060 EPI.TripCount = Count; 8061 } 8062 8063 ReplaceInstWithInst( 8064 TCCheckBlock->getTerminator(), 8065 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8066 8067 return TCCheckBlock; 8068 } 8069 8070 //===--------------------------------------------------------------------===// 8071 // EpilogueVectorizerEpilogueLoop 8072 //===--------------------------------------------------------------------===// 8073 8074 /// This function is partially responsible for generating the control flow 8075 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8076 BasicBlock * 8077 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8078 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8079 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8080 8081 // Now, compare the remaining count and if there aren't enough iterations to 8082 // execute the vectorized epilogue skip to the scalar part. 8083 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8084 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8085 LoopVectorPreHeader = 8086 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8087 LI, nullptr, "vec.epilog.ph"); 8088 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8089 VecEpilogueIterationCountCheck); 8090 8091 // Adjust the control flow taking the state info from the main loop 8092 // vectorization into account. 
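// (Roughly: the checks created in the first pass are rewired so that failing
// the main loop's iteration count check can still enter the vector epilogue,
// while failing the epilogue's checks falls through to the scalar preheader.)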
8093 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8094 "expected this to be saved from the previous pass."); 8095 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8096 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8097 8098 DT->changeImmediateDominator(LoopVectorPreHeader, 8099 EPI.MainLoopIterationCountCheck); 8100 8101 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8102 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8103 8104 if (EPI.SCEVSafetyCheck) 8105 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8106 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8107 if (EPI.MemSafetyCheck) 8108 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8109 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8110 8111 DT->changeImmediateDominator( 8112 VecEpilogueIterationCountCheck, 8113 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8114 8115 DT->changeImmediateDominator(LoopScalarPreHeader, 8116 EPI.EpilogueIterationCountCheck); 8117 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8118 8119 // Keep track of bypass blocks, as they feed start values to the induction 8120 // phis in the scalar loop preheader. 8121 if (EPI.SCEVSafetyCheck) 8122 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8123 if (EPI.MemSafetyCheck) 8124 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8125 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8126 8127 // Generate a resume induction for the vector epilogue and put it in the 8128 // vector epilogue preheader 8129 Type *IdxTy = Legal->getWidestInductionType(); 8130 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8131 LoopVectorPreHeader->getFirstNonPHI()); 8132 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8133 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8134 EPI.MainLoopIterationCountCheck); 8135 8136 // Generate the induction variable. 8137 OldInduction = Legal->getPrimaryInduction(); 8138 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8139 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8140 Value *StartIdx = EPResumeVal; 8141 Induction = 8142 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8143 getDebugLocFromInstOrOperands(OldInduction)); 8144 8145 // Generate induction resume values. These variables save the new starting 8146 // indexes for the scalar loop. They are used to test if there are any tail 8147 // iterations left once the vector loop has completed. 8148 // Note that when the vectorized epilogue is skipped due to iteration count 8149 // check, then the resume value for the induction variable comes from 8150 // the trip count of the main vector loop, hence passing the AdditionalBypass 8151 // argument. 
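// For example (illustrative): if the original loop runs 100 iterations and
// the main vector loop covered the first 96, but the epilogue is skipped
// because too few iterations remain, the scalar loop's inductions must resume
// at 96 rather than 0; the AdditionalBypass entry provides that start value.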
8152 createInductionResumeValues(Lp, CountRoundDown, 8153 {VecEpilogueIterationCountCheck, 8154 EPI.VectorTripCount} /* AdditionalBypass */); 8155 8156 AddRuntimeUnrollDisableMetaData(Lp); 8157 return completeLoopSkeleton(Lp, OrigLoopID); 8158 } 8159 8160 BasicBlock * 8161 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8162 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8163 8164 assert(EPI.TripCount && 8165 "Expected trip count to have been safed in the first pass."); 8166 assert( 8167 (!isa<Instruction>(EPI.TripCount) || 8168 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8169 "saved trip count does not dominate insertion point."); 8170 Value *TC = EPI.TripCount; 8171 IRBuilder<> Builder(Insert->getTerminator()); 8172 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8173 8174 // Generate code to check if the loop's trip count is less than VF * UF of the 8175 // vector epilogue loop. 8176 auto P = 8177 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8178 8179 Value *CheckMinIters = Builder.CreateICmp( 8180 P, Count, 8181 ConstantInt::get(Count->getType(), 8182 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8183 "min.epilog.iters.check"); 8184 8185 ReplaceInstWithInst( 8186 Insert->getTerminator(), 8187 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8188 8189 LoopBypassBlocks.push_back(Insert); 8190 return Insert; 8191 } 8192 8193 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8194 LLVM_DEBUG({ 8195 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8196 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8197 << ", Main Loop UF:" << EPI.MainLoopUF 8198 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8199 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8200 }); 8201 } 8202 8203 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8204 DEBUG_WITH_TYPE(VerboseDebug, { 8205 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8206 }); 8207 } 8208 8209 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8210 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8211 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8212 bool PredicateAtRangeStart = Predicate(Range.Start); 8213 8214 for (ElementCount TmpVF = Range.Start * 2; 8215 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8216 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8217 Range.End = TmpVF; 8218 break; 8219 } 8220 8221 return PredicateAtRangeStart; 8222 } 8223 8224 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8225 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8226 /// of VF's starting at a given VF and extending it as much as possible. Each 8227 /// vectorization decision can potentially shorten this sub-range during 8228 /// buildVPlan(). 8229 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8230 ElementCount MaxVF) { 8231 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8232 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8233 VFRange SubRange = {VF, MaxVFPlusOne}; 8234 VPlans.push_back(buildVPlan(SubRange)); 8235 VF = SubRange.End; 8236 } 8237 } 8238 8239 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8240 VPlanPtr &Plan) { 8241 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8242 8243 // Look for cached value. 
8244 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8245 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8246 if (ECEntryIt != EdgeMaskCache.end()) 8247 return ECEntryIt->second; 8248 8249 VPValue *SrcMask = createBlockInMask(Src, Plan); 8250 8251 // The terminator has to be a branch inst! 8252 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8253 assert(BI && "Unexpected terminator found"); 8254 8255 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8256 return EdgeMaskCache[Edge] = SrcMask; 8257 8258 // If source is an exiting block, we know the exit edge is dynamically dead 8259 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8260 // adding uses of an otherwise potentially dead instruction. 8261 if (OrigLoop->isLoopExiting(Src)) 8262 return EdgeMaskCache[Edge] = SrcMask; 8263 8264 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8265 assert(EdgeMask && "No Edge Mask found for condition"); 8266 8267 if (BI->getSuccessor(0) != Dst) 8268 EdgeMask = Builder.createNot(EdgeMask); 8269 8270 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8271 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8272 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8273 // The select version does not introduce new UB if SrcMask is false and 8274 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8275 VPValue *False = Plan->getOrAddVPValue( 8276 ConstantInt::getFalse(BI->getCondition()->getType())); 8277 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8278 } 8279 8280 return EdgeMaskCache[Edge] = EdgeMask; 8281 } 8282 8283 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8284 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8285 8286 // Look for cached value. 8287 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8288 if (BCEntryIt != BlockMaskCache.end()) 8289 return BCEntryIt->second; 8290 8291 // All-one mask is modelled as no-mask following the convention for masked 8292 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8293 VPValue *BlockMask = nullptr; 8294 8295 if (OrigLoop->getHeader() == BB) { 8296 if (!CM.blockNeedsPredication(BB)) 8297 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8298 8299 // Create the block in mask as the first non-phi instruction in the block. 8300 VPBuilder::InsertPointGuard Guard(Builder); 8301 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8302 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8303 8304 // Introduce the early-exit compare IV <= BTC to form header block mask. 8305 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8306 // Start by constructing the desired canonical IV. 
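    // For example (illustrative, VF = 4, trip count 10, so BTC = 9): the
    // header mask for the lanes covering iterations {8, 9, 10, 11} is
    //   icmp ule <4 x i64> <i64 8, i64 9, i64 10, i64 11>,
    //                      <i64 9, i64 9, i64 9, i64 9>
    // i.e. <i1 true, i1 true, i1 false, i1 false>, masking off the two lanes
    // that would run past the trip count.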
8307 VPValue *IV = nullptr; 8308 if (Legal->getPrimaryInduction()) 8309 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8310 else { 8311 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8312 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8313 IV = IVRecipe->getVPValue(); 8314 } 8315 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8316 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8317 8318 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8319 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8320 // as a second argument, we only pass the IV here and extract the 8321 // tripcount from the transform state where codegen of the VP instructions 8322 // happen. 8323 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8324 } else { 8325 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8326 } 8327 return BlockMaskCache[BB] = BlockMask; 8328 } 8329 8330 // This is the block mask. We OR all incoming edges. 8331 for (auto *Predecessor : predecessors(BB)) { 8332 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8333 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8334 return BlockMaskCache[BB] = EdgeMask; 8335 8336 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8337 BlockMask = EdgeMask; 8338 continue; 8339 } 8340 8341 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8342 } 8343 8344 return BlockMaskCache[BB] = BlockMask; 8345 } 8346 8347 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8348 VPlanPtr &Plan) { 8349 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8350 "Must be called with either a load or store"); 8351 8352 auto willWiden = [&](ElementCount VF) -> bool { 8353 if (VF.isScalar()) 8354 return false; 8355 LoopVectorizationCostModel::InstWidening Decision = 8356 CM.getWideningDecision(I, VF); 8357 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8358 "CM decision should be taken at this point."); 8359 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8360 return true; 8361 if (CM.isScalarAfterVectorization(I, VF) || 8362 CM.isProfitableToScalarize(I, VF)) 8363 return false; 8364 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8365 }; 8366 8367 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8368 return nullptr; 8369 8370 VPValue *Mask = nullptr; 8371 if (Legal->isMaskRequired(I)) 8372 Mask = createBlockInMask(I->getParent(), Plan); 8373 8374 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8375 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8376 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8377 8378 StoreInst *Store = cast<StoreInst>(I); 8379 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8380 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8381 } 8382 8383 VPWidenIntOrFpInductionRecipe * 8384 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8385 // Check if this is an integer or fp induction. If so, build the recipe that 8386 // produces its scalar and vector values. 
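  // For instance (illustrative), given
  //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %iv.next = add nuw nsw i64 %iv, 1
  // the recipe widens %iv into the per-part vector sequence <0, 1, 2, ...>
  // and also provides the scalar steps needed by users that stay scalar.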
8387 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8388 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8389 II.getKind() == InductionDescriptor::IK_FpInduction) { 8390 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8391 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8392 return new VPWidenIntOrFpInductionRecipe( 8393 Phi, Start, Casts.empty() ? nullptr : Casts.front()); 8394 } 8395 8396 return nullptr; 8397 } 8398 8399 VPWidenIntOrFpInductionRecipe * 8400 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8401 VPlan &Plan) const { 8402 // Optimize the special case where the source is a constant integer 8403 // induction variable. Notice that we can only optimize the 'trunc' case 8404 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8405 // (c) other casts depend on pointer size. 8406 8407 // Determine whether \p K is a truncation based on an induction variable that 8408 // can be optimized. 8409 auto isOptimizableIVTruncate = 8410 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8411 return [=](ElementCount VF) -> bool { 8412 return CM.isOptimizableIVTruncate(K, VF); 8413 }; 8414 }; 8415 8416 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8417 isOptimizableIVTruncate(I), Range)) { 8418 8419 InductionDescriptor II = 8420 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8421 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8422 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8423 Start, nullptr, I); 8424 } 8425 return nullptr; 8426 } 8427 8428 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8429 // If all incoming values are equal, the incoming VPValue can be used directly 8430 // instead of creating a new VPBlendRecipe. 8431 Value *FirstIncoming = Phi->getIncomingValue(0); 8432 if (all_of(Phi->incoming_values(), [FirstIncoming](const Value *Inc) { 8433 return FirstIncoming == Inc; 8434 })) { 8435 return Plan->getOrAddVPValue(Phi->getIncomingValue(0)); 8436 } 8437 8438 // We know that all PHIs in non-header blocks are converted into selects, so 8439 // we don't have to worry about the insertion order and we can just use the 8440 // builder. At this point we generate the predication tree. There may be 8441 // duplications since this is a simple recursive scan, but future 8442 // optimizations will clean it up. 
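  // Sketch of the result (illustrative): for
  //   %r = phi i32 [ %a, %then.bb ], [ %b, %else.bb ]
  // the blend recipe records each incoming value followed by its edge mask,
  // and is later lowered to a select chain such as
  //   %r = select <VF x i1> %mask.else, <VF x i32> %b.vec, <VF x i32> %a.vec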
8443 SmallVector<VPValue *, 2> Operands; 8444 unsigned NumIncoming = Phi->getNumIncomingValues(); 8445 8446 for (unsigned In = 0; In < NumIncoming; In++) { 8447 VPValue *EdgeMask = 8448 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8449 assert((EdgeMask || NumIncoming == 1) && 8450 "Multiple predecessors with one having a full mask"); 8451 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8452 if (EdgeMask) 8453 Operands.push_back(EdgeMask); 8454 } 8455 return toVPRecipeResult(new VPBlendRecipe(Phi, Operands)); 8456 } 8457 8458 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8459 VPlan &Plan) const { 8460 8461 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8462 [this, CI](ElementCount VF) { 8463 return CM.isScalarWithPredication(CI, VF); 8464 }, 8465 Range); 8466 8467 if (IsPredicated) 8468 return nullptr; 8469 8470 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8471 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8472 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8473 ID == Intrinsic::pseudoprobe || 8474 ID == Intrinsic::experimental_noalias_scope_decl)) 8475 return nullptr; 8476 8477 auto willWiden = [&](ElementCount VF) -> bool { 8478 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8479 // The following case may be scalarized depending on the VF. 8480 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8481 // version of the instruction. 8482 // Is it beneficial to perform intrinsic call compared to lib call? 8483 bool NeedToScalarize = false; 8484 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8485 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8486 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8487 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8488 "Cannot have invalid costs while widening"); 8489 return UseVectorIntrinsic || !NeedToScalarize; 8490 }; 8491 8492 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8493 return nullptr; 8494 8495 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8496 } 8497 8498 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8499 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8500 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8501 // Instruction should be widened, unless it is scalar after vectorization, 8502 // scalarization is profitable or it is predicated. 
8503 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8504 return CM.isScalarAfterVectorization(I, VF) || 8505 CM.isProfitableToScalarize(I, VF) || 8506 CM.isScalarWithPredication(I, VF); 8507 }; 8508 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8509 Range); 8510 } 8511 8512 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8513 auto IsVectorizableOpcode = [](unsigned Opcode) { 8514 switch (Opcode) { 8515 case Instruction::Add: 8516 case Instruction::And: 8517 case Instruction::AShr: 8518 case Instruction::BitCast: 8519 case Instruction::FAdd: 8520 case Instruction::FCmp: 8521 case Instruction::FDiv: 8522 case Instruction::FMul: 8523 case Instruction::FNeg: 8524 case Instruction::FPExt: 8525 case Instruction::FPToSI: 8526 case Instruction::FPToUI: 8527 case Instruction::FPTrunc: 8528 case Instruction::FRem: 8529 case Instruction::FSub: 8530 case Instruction::ICmp: 8531 case Instruction::IntToPtr: 8532 case Instruction::LShr: 8533 case Instruction::Mul: 8534 case Instruction::Or: 8535 case Instruction::PtrToInt: 8536 case Instruction::SDiv: 8537 case Instruction::Select: 8538 case Instruction::SExt: 8539 case Instruction::Shl: 8540 case Instruction::SIToFP: 8541 case Instruction::SRem: 8542 case Instruction::Sub: 8543 case Instruction::Trunc: 8544 case Instruction::UDiv: 8545 case Instruction::UIToFP: 8546 case Instruction::URem: 8547 case Instruction::Xor: 8548 case Instruction::ZExt: 8549 return true; 8550 } 8551 return false; 8552 }; 8553 8554 if (!IsVectorizableOpcode(I->getOpcode())) 8555 return nullptr; 8556 8557 // Success: widen this instruction. 8558 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8559 } 8560 8561 VPBasicBlock *VPRecipeBuilder::handleReplication( 8562 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8563 VPlanPtr &Plan) { 8564 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8565 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8566 Range); 8567 8568 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8569 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8570 Range); 8571 8572 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8573 IsUniform, IsPredicated); 8574 setRecipe(I, Recipe); 8575 Plan->addVPValue(I, Recipe); 8576 8577 // Find if I uses a predicated instruction. If so, it will use its scalar 8578 // value. Avoid hoisting the insert-element which packs the scalar value into 8579 // a vector value, as that happens iff all users use the vector value. 8580 for (VPValue *Op : Recipe->operands()) { 8581 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8582 if (!PredR) 8583 continue; 8584 auto *RepR = 8585 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8586 assert(RepR->isPredicated() && 8587 "expected Replicate recipe to be predicated"); 8588 RepR->setAlsoPack(false); 8589 } 8590 8591 // Finalize the recipe for Instr, first if it is not predicated. 8592 if (!IsPredicated) { 8593 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8594 VPBB->appendRecipe(Recipe); 8595 return VPBB; 8596 } 8597 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8598 assert(VPBB->getSuccessors().empty() && 8599 "VPBB has successors when handling predicated replication."); 8600 // Record predicated instructions for above packing optimizations. 
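  // The region created below has a triangular shape (sketch, block names
  // follow the "pred.<opcode>.*" scheme used by createReplicateRegion):
  //
  //   pred.<opcode>.entry            (BranchOnMask on the block-in mask)
  //        |         \
  //        |    pred.<opcode>.if     (the replicated, predicated instruction)
  //        |         /
  //   pred.<opcode>.continue         (phi merging the predicated result)
  //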
8601 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8602 VPBlockUtils::insertBlockAfter(Region, VPBB); 8603 auto *RegSucc = new VPBasicBlock(); 8604 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8605 return RegSucc; 8606 } 8607 8608 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8609 VPRecipeBase *PredRecipe, 8610 VPlanPtr &Plan) { 8611 // Instructions marked for predication are replicated and placed under an 8612 // if-then construct to prevent side-effects. 8613 8614 // Generate recipes to compute the block mask for this region. 8615 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8616 8617 // Build the triangular if-then region. 8618 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8619 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8620 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8621 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8622 auto *PHIRecipe = Instr->getType()->isVoidTy() 8623 ? nullptr 8624 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8625 if (PHIRecipe) { 8626 Plan->removeVPValueFor(Instr); 8627 Plan->addVPValue(Instr, PHIRecipe); 8628 } 8629 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8630 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8631 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8632 8633 // Note: first set Entry as region entry and then connect successors starting 8634 // from it in order, to propagate the "parent" of each VPBasicBlock. 8635 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8636 VPBlockUtils::connectBlocks(Pred, Exit); 8637 8638 return Region; 8639 } 8640 8641 VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8642 VFRange &Range, 8643 VPlanPtr &Plan) { 8644 // First, check for specific widening recipes that deal with calls, memory 8645 // operations, inductions and Phi nodes. 
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan));

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
      return toVPRecipeResult(Recipe);

    if (Legal->isReductionVariable(Phi)) {
      RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
      VPValue *StartV =
          Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
      return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
    }

    return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
  }

  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                    cast<TruncInst>(Instr), Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, Plan->mapToVPValues(SI->operands()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, *Plan));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially
  // dead in the vectorized loop. We don't need to vectorize these
  // instructions. For example, original induction update instructions can
  // become dead because we separately emit induction "steps" when generating
  // code for the new loop. Similarly, we create a new latch condition when
  // setting up the structure of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and
  // add placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      if (auto RecipeOrValue =
              RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
        // If Instr can be simplified to an existing VPValue, use it.
        if (RecipeOrValue.is<VPValue *>()) {
          Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
          continue;
        }
        // Otherwise, add the new recipe.
        VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB =
          RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original basic
  // blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    // If the target is in a replication region, make sure to move Sink to the
    // block after it, not into the replication region itself.
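    // Sketch (illustrative): if Target sits inside a replicate region such as
    //   pred.store.entry -> pred.store.if -> pred.store.continue
    // Sink is moved to the start of the region's single successor block
    // rather than into the region, so the region stays a single-entry,
    // single-exit triangle.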
8861 if (auto *Region = 8862 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8863 if (Region->isReplicator()) { 8864 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8865 VPBasicBlock *NextBlock = 8866 cast<VPBasicBlock>(Region->getSuccessors().front()); 8867 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8868 continue; 8869 } 8870 } 8871 Sink->moveAfter(Target); 8872 } 8873 8874 // Interleave memory: for each Interleave Group we marked earlier as relevant 8875 // for this VPlan, replace the Recipes widening its memory instructions with a 8876 // single VPInterleaveRecipe at its insertion point. 8877 for (auto IG : InterleaveGroups) { 8878 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8879 RecipeBuilder.getRecipe(IG->getInsertPos())); 8880 SmallVector<VPValue *, 4> StoredValues; 8881 for (unsigned i = 0; i < IG->getFactor(); ++i) 8882 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8883 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8884 8885 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8886 Recipe->getMask()); 8887 VPIG->insertBefore(Recipe); 8888 unsigned J = 0; 8889 for (unsigned i = 0; i < IG->getFactor(); ++i) 8890 if (Instruction *Member = IG->getMember(i)) { 8891 if (!Member->getType()->isVoidTy()) { 8892 VPValue *OriginalV = Plan->getVPValue(Member); 8893 Plan->removeVPValueFor(Member); 8894 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8895 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8896 J++; 8897 } 8898 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8899 } 8900 } 8901 8902 // Adjust the recipes for any inloop reductions. 8903 if (Range.Start.isVector()) 8904 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8905 8906 // Finally, if tail is folded by masking, introduce selects between the phi 8907 // and the live-out instruction of each reduction, at the end of the latch. 8908 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8909 Builder.setInsertPoint(VPBB); 8910 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8911 for (auto &Reduction : Legal->getReductionVars()) { 8912 if (CM.isInLoopReduction(Reduction.first)) 8913 continue; 8914 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8915 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8916 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8917 } 8918 } 8919 8920 std::string PlanName; 8921 raw_string_ostream RSO(PlanName); 8922 ElementCount VF = Range.Start; 8923 Plan->addVF(VF); 8924 RSO << "Initial VPlan for VF={" << VF; 8925 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8926 Plan->addVF(VF); 8927 RSO << "," << VF; 8928 } 8929 RSO << "},UF>=1"; 8930 RSO.flush(); 8931 Plan->setName(PlanName); 8932 8933 return Plan; 8934 } 8935 8936 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8937 // Outer loop handling: They may require CFG and instruction level 8938 // transformations before even evaluating whether vectorization is profitable. 8939 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8940 // the vectorization pipeline. 
8941 assert(!OrigLoop->isInnermost()); 8942 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8943 8944 // Create new empty VPlan 8945 auto Plan = std::make_unique<VPlan>(); 8946 8947 // Build hierarchical CFG 8948 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8949 HCFGBuilder.buildHierarchicalCFG(); 8950 8951 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 8952 VF *= 2) 8953 Plan->addVF(VF); 8954 8955 if (EnableVPlanPredication) { 8956 VPlanPredicator VPP(*Plan); 8957 VPP.predicate(); 8958 8959 // Avoid running transformation to recipes until masked code generation in 8960 // VPlan-native path is in place. 8961 return Plan; 8962 } 8963 8964 SmallPtrSet<Instruction *, 1> DeadInstructions; 8965 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 8966 Legal->getInductionVars(), 8967 DeadInstructions, *PSE.getSE()); 8968 return Plan; 8969 } 8970 8971 // Adjust the recipes for any inloop reductions. The chain of instructions 8972 // leading from the loop exit instr to the phi need to be converted to 8973 // reductions, with one operand being vector and the other being the scalar 8974 // reduction chain. 8975 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 8976 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 8977 for (auto &Reduction : CM.getInLoopReductionChains()) { 8978 PHINode *Phi = Reduction.first; 8979 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8980 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8981 8982 // ReductionOperations are orders top-down from the phi's use to the 8983 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 8984 // which of the two operands will remain scalar and which will be reduced. 8985 // For minmax the chain will be the select instructions. 8986 Instruction *Chain = Phi; 8987 for (Instruction *R : ReductionOperations) { 8988 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 8989 RecurKind Kind = RdxDesc.getRecurrenceKind(); 8990 8991 VPValue *ChainOp = Plan->getVPValue(Chain); 8992 unsigned FirstOpId; 8993 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8994 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 8995 "Expected to replace a VPWidenSelectSC"); 8996 FirstOpId = 1; 8997 } else { 8998 assert(isa<VPWidenRecipe>(WidenRecipe) && 8999 "Expected to replace a VPWidenSC"); 9000 FirstOpId = 0; 9001 } 9002 unsigned VecOpId = 9003 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9004 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9005 9006 auto *CondOp = CM.foldTailByMasking() 9007 ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9008 : nullptr; 9009 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9010 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9011 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9012 Plan->removeVPValueFor(R); 9013 Plan->addVPValue(R, RedRecipe); 9014 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9015 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9016 WidenRecipe->eraseFromParent(); 9017 9018 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9019 VPRecipeBase *CompareRecipe = 9020 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9021 assert(isa<VPWidenRecipe>(CompareRecipe) && 9022 "Expected to replace a VPWidenSC"); 9023 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9024 "Expected no remaining users"); 9025 CompareRecipe->eraseFromParent(); 9026 } 9027 Chain = R; 9028 } 9029 } 9030 } 9031 9032 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9033 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9034 VPSlotTracker &SlotTracker) const { 9035 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9036 IG->getInsertPos()->printAsOperand(O, false); 9037 O << ", "; 9038 getAddr()->printAsOperand(O, SlotTracker); 9039 VPValue *Mask = getMask(); 9040 if (Mask) { 9041 O << ", "; 9042 Mask->printAsOperand(O, SlotTracker); 9043 } 9044 for (unsigned i = 0; i < IG->getFactor(); ++i) 9045 if (Instruction *I = IG->getMember(i)) 9046 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9047 } 9048 #endif 9049 9050 void VPWidenCallRecipe::execute(VPTransformState &State) { 9051 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9052 *this, State); 9053 } 9054 9055 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9056 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9057 this, *this, InvariantCond, State); 9058 } 9059 9060 void VPWidenRecipe::execute(VPTransformState &State) { 9061 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9062 } 9063 9064 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9065 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9066 *this, State.UF, State.VF, IsPtrLoopInvariant, 9067 IsIndexLoopInvariant, State); 9068 } 9069 9070 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9071 assert(!State.Instance && "Int or FP induction being replicated."); 9072 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9073 getTruncInst(), getVPValue(0), 9074 getCastValue(), State); 9075 } 9076 9077 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9078 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9079 getStartValue(), this, State); 9080 } 9081 9082 void VPBlendRecipe::execute(VPTransformState &State) { 9083 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9084 // We know that all PHIs in non-header blocks are converted into 9085 // selects, so we don't have to worry about the insertion order and we 9086 // can just use the builder. 9087 // At this point we generate the predication tree. There may be 9088 // duplications since this is a simple recursive scan, but future 9089 // optimizations will clean it up. 
9090 9091 unsigned NumIncoming = getNumIncomingValues(); 9092 9093 // Generate a sequence of selects of the form: 9094 // SELECT(Mask3, In3, 9095 // SELECT(Mask2, In2, 9096 // SELECT(Mask1, In1, 9097 // In0))) 9098 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9099 // are essentially undef are taken from In0. 9100 InnerLoopVectorizer::VectorParts Entry(State.UF); 9101 for (unsigned In = 0; In < NumIncoming; ++In) { 9102 for (unsigned Part = 0; Part < State.UF; ++Part) { 9103 // We might have single edge PHIs (blocks) - use an identity 9104 // 'select' for the first PHI operand. 9105 Value *In0 = State.get(getIncomingValue(In), Part); 9106 if (In == 0) 9107 Entry[Part] = In0; // Initialize with the first incoming value. 9108 else { 9109 // Select between the current value and the previous incoming edge 9110 // based on the incoming mask. 9111 Value *Cond = State.get(getMask(In), Part); 9112 Entry[Part] = 9113 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9114 } 9115 } 9116 } 9117 for (unsigned Part = 0; Part < State.UF; ++Part) 9118 State.set(this, Entry[Part], Part); 9119 } 9120 9121 void VPInterleaveRecipe::execute(VPTransformState &State) { 9122 assert(!State.Instance && "Interleave group being replicated."); 9123 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9124 getStoredValues(), getMask()); 9125 } 9126 9127 void VPReductionRecipe::execute(VPTransformState &State) { 9128 assert(!State.Instance && "Reduction being replicated."); 9129 for (unsigned Part = 0; Part < State.UF; ++Part) { 9130 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9131 Value *NewVecOp = State.get(getVecOp(), Part); 9132 if (VPValue *Cond = getCondOp()) { 9133 Value *NewCond = State.get(Cond, Part); 9134 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9135 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9136 Kind, VecTy->getElementType()); 9137 Constant *IdenVec = 9138 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9139 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9140 NewVecOp = Select; 9141 } 9142 Value *NewRed = 9143 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9144 Value *PrevInChain = State.get(getChainOp(), Part); 9145 Value *NextInChain; 9146 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9147 NextInChain = 9148 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9149 NewRed, PrevInChain); 9150 } else { 9151 NextInChain = State.Builder.CreateBinOp( 9152 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9153 PrevInChain); 9154 } 9155 State.set(this, NextInChain, Part); 9156 } 9157 } 9158 9159 void VPReplicateRecipe::execute(VPTransformState &State) { 9160 if (State.Instance) { // Generate a single instance. 9161 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9162 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9163 *State.Instance, IsPredicated, State); 9164 // Insert scalar instance packing it into a vector. 9165 if (AlsoPack && State.VF.isVector()) { 9166 // If we're constructing lane 0, initialize to start from poison. 
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
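  // Illustrative shapes of the two phis (hypothetical value and block names):
  //   ; vector users only: merge the unmodified vector with the one that had
  //   ; the scalar lane inserted on the predicated path
  //   %vphi = phi <4 x i32> [ %vec, %pred.entry ], [ %vec.ins, %pred.if ]
  //   ; scalar value needed
  //   %sphi = phi i32 [ poison, %pred.entry ], [ %s, %pred.if ]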
9233 unsigned Part = State.Instance->Part; 9234 if (State.hasVectorValue(getOperand(0), Part)) { 9235 Value *VectorValue = State.get(getOperand(0), Part); 9236 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9237 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9238 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9239 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9240 if (State.hasVectorValue(this, Part)) 9241 State.reset(this, VPhi, Part); 9242 else 9243 State.set(this, VPhi, Part); 9244 // NOTE: Currently we need to update the value of the operand, so the next 9245 // predicated iteration inserts its generated value in the correct vector. 9246 State.reset(getOperand(0), VPhi, Part); 9247 } else { 9248 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9249 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9250 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9251 PredicatingBB); 9252 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9253 if (State.hasScalarValue(this, *State.Instance)) 9254 State.reset(this, Phi, *State.Instance); 9255 else 9256 State.set(this, Phi, *State.Instance); 9257 // NOTE: Currently we need to update the value of the operand, so the next 9258 // predicated iteration inserts its generated value in the correct vector. 9259 State.reset(getOperand(0), Phi, *State.Instance); 9260 } 9261 } 9262 9263 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9264 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9265 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9266 StoredValue ? nullptr : getVPValue(), 9267 getAddr(), StoredValue, getMask()); 9268 } 9269 9270 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9271 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9272 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9273 // for predication. 9274 static ScalarEpilogueLowering getScalarEpilogueLowering( 9275 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9276 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9277 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9278 LoopVectorizationLegality &LVL) { 9279 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9280 // don't look at hints or options, and don't request a scalar epilogue. 9281 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9282 // LoopAccessInfo (due to code dependency and not being able to reliably get 9283 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9284 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9285 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9286 // back to the old way and vectorize with versioning when forced. See D81345.) 
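  // For example (illustrative): a function carrying the optsize attribute is
  // handled by 1) and gets CM_ScalarEpilogueNotAllowedOptSize, while an
  // explicit PreferPredicateTy::PredicateElseScalarEpilogue setting is handled
  // by 2) and gets CM_ScalarEpilogueNotNeededUsePredicate.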
9287 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9288 PGSOQueryType::IRPass) && 9289 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9290 return CM_ScalarEpilogueNotAllowedOptSize; 9291 9292 // 2) If set, obey the directives 9293 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9294 switch (PreferPredicateOverEpilogue) { 9295 case PreferPredicateTy::ScalarEpilogue: 9296 return CM_ScalarEpilogueAllowed; 9297 case PreferPredicateTy::PredicateElseScalarEpilogue: 9298 return CM_ScalarEpilogueNotNeededUsePredicate; 9299 case PreferPredicateTy::PredicateOrDontVectorize: 9300 return CM_ScalarEpilogueNotAllowedUsePredicate; 9301 }; 9302 } 9303 9304 // 3) If set, obey the hints 9305 switch (Hints.getPredicate()) { 9306 case LoopVectorizeHints::FK_Enabled: 9307 return CM_ScalarEpilogueNotNeededUsePredicate; 9308 case LoopVectorizeHints::FK_Disabled: 9309 return CM_ScalarEpilogueAllowed; 9310 }; 9311 9312 // 4) if the TTI hook indicates this is profitable, request predication. 9313 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9314 LVL.getLAI())) 9315 return CM_ScalarEpilogueNotNeededUsePredicate; 9316 9317 return CM_ScalarEpilogueAllowed; 9318 } 9319 9320 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9321 // If Values have been set for this Def return the one relevant for \p Part. 9322 if (hasVectorValue(Def, Part)) 9323 return Data.PerPartOutput[Def][Part]; 9324 9325 if (!hasScalarValue(Def, {Part, 0})) { 9326 Value *IRV = Def->getLiveInIRValue(); 9327 Value *B = ILV->getBroadcastInstrs(IRV); 9328 set(Def, B, Part); 9329 return B; 9330 } 9331 9332 Value *ScalarValue = get(Def, {Part, 0}); 9333 // If we aren't vectorizing, we can just copy the scalar map values over 9334 // to the vector map. 9335 if (VF.isScalar()) { 9336 set(Def, ScalarValue, Part); 9337 return ScalarValue; 9338 } 9339 9340 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9341 bool IsUniform = RepR && RepR->isUniform(); 9342 9343 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9344 // Check if there is a scalar value for the selected lane. 9345 if (!hasScalarValue(Def, {Part, LastLane})) { 9346 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9347 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9348 "unexpected recipe found to be invariant"); 9349 IsUniform = true; 9350 LastLane = 0; 9351 } 9352 9353 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9354 9355 // Set the insert point after the last scalarized instruction. This 9356 // ensures the insertelement sequence will directly follow the scalar 9357 // definitions. 9358 auto OldIP = Builder.saveIP(); 9359 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9360 Builder.SetInsertPoint(&*NewIP); 9361 9362 // However, if we are vectorizing, we need to construct the vector values. 9363 // If the value is known to be uniform after vectorization, we can just 9364 // broadcast the scalar value corresponding to lane zero for each unroll 9365 // iteration. Otherwise, we construct the vector values using 9366 // insertelement instructions. Since the resulting vectors are stored in 9367 // State, we will only generate the insertelements once. 9368 Value *VectorValue = nullptr; 9369 if (IsUniform) { 9370 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9371 set(Def, VectorValue, Part); 9372 } else { 9373 // Initialize packing with insertelements to start from undef. 
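    // Illustrative IR for the packing below (assuming VF = 4 and i32):
    //   %v.0 = insertelement <4 x i32> poison, i32 %s.0, i32 0
    //   %v.1 = insertelement <4 x i32> %v.0, i32 %s.1, i32 1
    //   %v.2 = insertelement <4 x i32> %v.1, i32 %s.2, i32 2
    //   %v.3 = insertelement <4 x i32> %v.2, i32 %s.3, i32 3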
9374 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9375 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9376 set(Def, Undef, Part); 9377 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9378 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9379 VectorValue = get(Def, Part); 9380 } 9381 Builder.restoreIP(OldIP); 9382 return VectorValue; 9383 } 9384 9385 // Process the loop in the VPlan-native vectorization path. This path builds 9386 // VPlan upfront in the vectorization pipeline, which allows to apply 9387 // VPlan-to-VPlan transformations from the very beginning without modifying the 9388 // input LLVM IR. 9389 static bool processLoopInVPlanNativePath( 9390 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9391 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9392 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9393 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9394 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9395 9396 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9397 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9398 return false; 9399 } 9400 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9401 Function *F = L->getHeader()->getParent(); 9402 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9403 9404 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9405 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9406 9407 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9408 &Hints, IAI); 9409 // Use the planner for outer loop vectorization. 9410 // TODO: CM is not used at this point inside the planner. Turn CM into an 9411 // optional argument if we don't need it in the future. 9412 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9413 9414 // Get user vectorization factor. 9415 ElementCount UserVF = Hints.getWidth(); 9416 9417 // Plan how to best vectorize, return the best VF and its cost. 9418 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9419 9420 // If we are stress testing VPlan builds, do not attempt to generate vector 9421 // code. Masked vector code generation support will follow soon. 9422 // Also, do not attempt to vectorize if no vector code will be produced. 9423 if (VPlanBuildStressTest || EnableVPlanPredication || 9424 VectorizationFactor::Disabled() == VF) 9425 return false; 9426 9427 LVP.setBestPlan(VF.Width, 1); 9428 9429 { 9430 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9431 F->getParent()->getDataLayout()); 9432 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9433 &CM, BFI, PSI, Checks); 9434 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9435 << L->getHeader()->getParent()->getName() << "\"\n"); 9436 LVP.executePlan(LB, DT); 9437 } 9438 9439 // Mark the loop as already vectorized to avoid vectorizing again. 9440 Hints.setAlreadyVectorized(); 9441 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9442 return true; 9443 } 9444 9445 // Emit a remark if there are stores to floats that required a floating point 9446 // extension. If the vectorized loop was generated with floating point there 9447 // will be a performance penalty from the conversion overhead and the change in 9448 // the vector width. 
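// An illustrative input pattern that triggers the remark (names made up):
//   %ext = fpext half %h to float
//   store float %ext, float* %dst
// Here <VF x half> and <VF x float> vectors of different bit widths end up in
// the same vectorized loop, and the fpext adds conversion overhead.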
9449 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9450 SmallVector<Instruction *, 4> Worklist; 9451 for (BasicBlock *BB : L->getBlocks()) { 9452 for (Instruction &Inst : *BB) { 9453 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9454 if (S->getValueOperand()->getType()->isFloatTy()) 9455 Worklist.push_back(S); 9456 } 9457 } 9458 } 9459 9460 // Traverse the floating point stores upwards searching, for floating point 9461 // conversions. 9462 SmallPtrSet<const Instruction *, 4> Visited; 9463 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9464 while (!Worklist.empty()) { 9465 auto *I = Worklist.pop_back_val(); 9466 if (!L->contains(I)) 9467 continue; 9468 if (!Visited.insert(I).second) 9469 continue; 9470 9471 // Emit a remark if the floating point store required a floating 9472 // point conversion. 9473 // TODO: More work could be done to identify the root cause such as a 9474 // constant or a function return type and point the user to it. 9475 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9476 ORE->emit([&]() { 9477 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9478 I->getDebugLoc(), L->getHeader()) 9479 << "floating point conversion changes vector width. " 9480 << "Mixed floating point precision requires an up/down " 9481 << "cast that will negatively impact performance."; 9482 }); 9483 9484 for (Use &Op : I->operands()) 9485 if (auto *OpI = dyn_cast<Instruction>(Op)) 9486 Worklist.push_back(OpI); 9487 } 9488 } 9489 9490 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9491 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9492 !EnableLoopInterleaving), 9493 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9494 !EnableLoopVectorization) {} 9495 9496 bool LoopVectorizePass::processLoop(Loop *L) { 9497 assert((EnableVPlanNativePath || L->isInnermost()) && 9498 "VPlan-native path is not enabled. Only process inner loops."); 9499 9500 #ifndef NDEBUG 9501 const std::string DebugLocStr = getDebugLocString(L); 9502 #endif /* NDEBUG */ 9503 9504 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9505 << L->getHeader()->getParent()->getName() << "\" from " 9506 << DebugLocStr << "\n"); 9507 9508 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9509 9510 LLVM_DEBUG( 9511 dbgs() << "LV: Loop hints:" 9512 << " force=" 9513 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9514 ? "disabled" 9515 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9516 ? "enabled" 9517 : "?")) 9518 << " width=" << Hints.getWidth() 9519 << " unroll=" << Hints.getInterleave() << "\n"); 9520 9521 // Function containing loop 9522 Function *F = L->getHeader()->getParent(); 9523 9524 // Looking at the diagnostic output is the only way to determine if a loop 9525 // was vectorized (other than looking at the IR or machine code), so it 9526 // is important to generate an optimization remark for each loop. Most of 9527 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9528 // generated as OptimizationRemark and OptimizationRemarkMissed are 9529 // less verbose reporting vectorized loops and unvectorized loops that may 9530 // benefit from vectorization, respectively. 9531 9532 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9533 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9534 return false; 9535 } 9536 9537 PredicatedScalarEvolution PSE(*SE, *L); 9538 9539 // Check if it is legal to vectorize the loop. 
9540 LoopVectorizationRequirements Requirements(*ORE); 9541 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9542 &Requirements, &Hints, DB, AC, BFI, PSI); 9543 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9544 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9545 Hints.emitRemarkWithHints(); 9546 return false; 9547 } 9548 9549 // Check the function attributes and profiles to find out if this function 9550 // should be optimized for size. 9551 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9552 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9553 9554 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9555 // here. They may require CFG and instruction level transformations before 9556 // even evaluating whether vectorization is profitable. Since we cannot modify 9557 // the incoming IR, we need to build VPlan upfront in the vectorization 9558 // pipeline. 9559 if (!L->isInnermost()) 9560 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9561 ORE, BFI, PSI, Hints); 9562 9563 assert(L->isInnermost() && "Inner loop expected."); 9564 9565 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9566 // count by optimizing for size, to minimize overheads. 9567 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9568 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9569 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9570 << "This loop is worth vectorizing only if no scalar " 9571 << "iteration overheads are incurred."); 9572 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9573 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9574 else { 9575 LLVM_DEBUG(dbgs() << "\n"); 9576 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9577 } 9578 } 9579 9580 // Check the function attributes to see if implicit floats are allowed. 9581 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9582 // an integer loop and the vector instructions selected are purely integer 9583 // vector instructions? 9584 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9585 reportVectorizationFailure( 9586 "Can't vectorize when the NoImplicitFloat attribute is used", 9587 "loop not vectorized due to NoImplicitFloat attribute", 9588 "NoImplicitFloat", ORE, L); 9589 Hints.emitRemarkWithHints(); 9590 return false; 9591 } 9592 9593 // Check if the target supports potentially unsafe FP vectorization. 9594 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9595 // for the target we're vectorizing for, to make sure none of the 9596 // additional fp-math flags can help. 

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!Requirements.canVectorizeFPMath(Hints)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }
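
  // Roughly speaking (illustrative numbers): with VF.Width == 4 and IC == 2,
  // each vector instruction in the generated body operates on 4 lanes and the
  // body is interleaved (unrolled) twice, so one iteration of the new loop
  // covers 8 iterations of the original scalar loop.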

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not to
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);
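
    // For example (illustrative): for a loop writing a[i] and reading b[i]
    // through possibly-aliasing pointers, the checks created above guard the
    // vector body with a test roughly of the form
    //   if (a + n > b && b + n > a) goto scalar.loop;
    // so the vector code may assume the accesses do not overlap. Checks that
    // end up unused (e.g. because vectorization is abandoned) are cleaned up
    // again when Checks goes out of scope.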

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *profitable* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }
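
  // Illustrative (assumed) pattern that the mixed-precision check above looks
  // for: a float store whose value was computed at double precision, e.g.
  //   float a[N]; double d;
  //   a[i] = a[i] * d;
  // which lowers to an fpext of a[i] to double, a double fmul, and an fptrunc
  // back to float. The fpext in that chain is what triggers the
  // VectorMixedPrecision remark, since widening to double halves the number
  // of elements that fit in a vector register.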

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}
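
// Illustrative (assumed) reminder of what the LCSSA form established above
// looks like: a value defined inside the loop and used after it is routed
// through a single-operand PHI in the exit block, e.g.
//   exit:
//     %sum.lcssa = phi i32 [ %sum.next, %loop ]
// so the vectorizer only has to patch live-outs at the exit block rather than
// chase uses throughout the function.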

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
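
// Usage sketch (assumed typical workflow, not prescribed by this file): the
// new-pass-manager entry point above can be exercised in isolation with, e.g.,
//   opt -passes=loop-vectorize -pass-remarks=loop-vectorize -S in.ll
// which runs only this pass and prints the "vectorized loop (...)" remarks
// emitted by processLoop.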