//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
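//
// As an illustrative sketch (not part of the original documentation), a loop
// such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten, for a vectorization factor (VF) of 4, into
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)       // one 'wide' SIMD iteration per step
//     a[i..i+3] = b[i..i+3] + c[i..i+3];
//   for (; i < n; ++i)               // scalar epilogue for the remainder
//     a[i] = b[i] + c[i];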
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
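
// As an illustrative example (not taken from the original sources): with a
// vectorization factor of 4 and a trip count of 10, tail folding executes
// ceil(10 / 4) = 3 predicated vector iterations, with the lane mask
// (induction value < 10) disabling the last two lanes of the final iteration,
// instead of 2 full vector iterations followed by a 2-iteration scalar
// epilogue loop.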

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPValue *StartV, VPValue *Def,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};
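
// Illustrative example (not taken from the original sources): with a main loop
// VF of 16 and an epilogue VF of 8, the generated skeleton runs the VF=16
// vector loop for as many full 16-iteration chunks as possible, then the VF=8
// vector loop on what remains, and finally the scalar loop for any leftover
// iterations, with the runtime SCEV and memory safety checks generated only
// once and shared by both vector loops.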

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm
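
// Illustrative example (not from the original sources): for a scalable VF of
// <vscale x 4> and Step == 2, createStepForVF() above materializes the runtime
// value "vscale * 8" (via IRBuilder::CreateVScale), whereas for a fixed VF of
// 4 it folds to the constant 8. getRuntimeVF() is essentially the Step == 1
// case with an explicit result type.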

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
1346 CM_Interleave,
1347 CM_GatherScatter,
1348 CM_Scalarize
1349 };
1350
1351 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1352 /// instruction \p I and vector width \p VF.
1353 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1354 InstructionCost Cost) {
1355 assert(VF.isVector() && "Expected VF >=2");
1356 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1357 }
1358
1359 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1360 /// interleaving group \p Grp and vector width \p VF.
1361 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1362 ElementCount VF, InstWidening W,
1363 InstructionCost Cost) {
1364 assert(VF.isVector() && "Expected VF >=2");
1365 /// Broadcast this decision to all instructions inside the group.
1366 /// But the cost will be assigned to one instruction only.
1367 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1368 if (auto *I = Grp->getMember(i)) {
1369 if (Grp->getInsertPos() == I)
1370 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1371 else
1372 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1373 }
1374 }
1375 }
1376
1377 /// Return the cost model decision for the given instruction \p I and vector
1378 /// width \p VF. Return CM_Unknown if this instruction did not pass
1379 /// through the cost modeling.
1380 InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1381 assert(VF.isVector() && "Expected VF to be a vector VF");
1382 // Cost model is not run in the VPlan-native path - return conservative
1383 // result until this changes.
1384 if (EnableVPlanNativePath)
1385 return CM_GatherScatter;
1386
1387 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1388 auto Itr = WideningDecisions.find(InstOnVF);
1389 if (Itr == WideningDecisions.end())
1390 return CM_Unknown;
1391 return Itr->second.first;
1392 }
1393
1394 /// Return the vectorization cost for the given instruction \p I and vector
1395 /// width \p VF.
1396 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1397 assert(VF.isVector() && "Expected VF >=2");
1398 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1399 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1400 "The cost is not calculated");
1401 return WideningDecisions[InstOnVF].second;
1402 }
1403
1404 /// Return True if instruction \p I is an optimizable truncate whose operand
1405 /// is an induction variable. Such a truncate will be removed by adding a new
1406 /// induction variable with the destination type.
1407 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1408 // If the instruction is not a truncate, return false.
1409 auto *Trunc = dyn_cast<TruncInst>(I);
1410 if (!Trunc)
1411 return false;
1412
1413 // Get the source and destination types of the truncate.
1414 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1415 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1416
1417 // If the truncate is free for the given types, return false. Replacing a
1418 // free truncate with an induction variable would add an induction variable
1419 // update instruction to each iteration of the loop. We exclude from this
1420 // check the primary induction variable since it will need an update
1421 // instruction regardless.
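    //
    // Illustrative sketch (not from the original source): the typical case
    // handled here is a 64-bit induction whose value is only used truncated,
    // e.g.
    //   %iv    = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
    //   %iv.32 = trunc i64 %iv to i32
    // where the truncate can be replaced by a new i32 induction variable.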
1422 Value *Op = Trunc->getOperand(0);
1423 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1424 return false;
1425
1426 // If the truncated value is not an induction variable, return false.
1427 return Legal->isInductionPhi(Op);
1428 }
1429
1430 /// Collects the instructions to scalarize for each predicated instruction in
1431 /// the loop.
1432 void collectInstsToScalarize(ElementCount VF);
1433
1434 /// Collect Uniform and Scalar values for the given \p VF.
1435 /// The sets depend on CM decision for Load/Store instructions
1436 /// that may be vectorized as interleave, gather-scatter or scalarized.
1437 void collectUniformsAndScalars(ElementCount VF) {
1438 // Do the analysis once.
1439 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1440 return;
1441 setCostBasedWideningDecision(VF);
1442 collectLoopUniforms(VF);
1443 collectLoopScalars(VF);
1444 }
1445
1446 /// Returns true if the target machine supports masked store operation
1447 /// for the given \p DataType and kind of access to \p Ptr.
1448 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1449 return Legal->isConsecutivePtr(Ptr) &&
1450 TTI.isLegalMaskedStore(DataType, Alignment);
1451 }
1452
1453 /// Returns true if the target machine supports masked load operation
1454 /// for the given \p DataType and kind of access to \p Ptr.
1455 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1456 return Legal->isConsecutivePtr(Ptr) &&
1457 TTI.isLegalMaskedLoad(DataType, Alignment);
1458 }
1459
1460 /// Returns true if the target machine supports masked scatter operation
1461 /// for the given \p DataType.
1462 bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1463 return TTI.isLegalMaskedScatter(DataType, Alignment);
1464 }
1465
1466 /// Returns true if the target machine supports masked gather operation
1467 /// for the given \p DataType.
1468 bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1469 return TTI.isLegalMaskedGather(DataType, Alignment);
1470 }
1471
1472 /// Returns true if the target machine can represent \p V as a masked gather
1473 /// or scatter operation.
1474 bool isLegalGatherOrScatter(Value *V) {
1475 bool LI = isa<LoadInst>(V);
1476 bool SI = isa<StoreInst>(V);
1477 if (!LI && !SI)
1478 return false;
1479 auto *Ty = getMemInstValueType(V);
1480 Align Align = getLoadStoreAlignment(V);
1481 return (LI && isLegalMaskedGather(Ty, Align)) ||
1482 (SI && isLegalMaskedScatter(Ty, Align));
1483 }
1484
1485 /// Returns true if the target machine supports all of the reduction
1486 /// variables found for the given VF.
1487 bool canVectorizeReductions(ElementCount VF) {
1488 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1489 RecurrenceDescriptor RdxDesc = Reduction.second;
1490 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1491 }));
1492 }
1493
1494 /// Returns true if \p I is an instruction that will be scalarized with
1495 /// predication. Such instructions include conditional stores and
1496 /// instructions that may divide by zero.
1497 /// If a non-zero VF has been calculated, we check if I will be scalarized
1498 /// with predication for that VF.
1499 bool isScalarWithPredication(Instruction *I,
1500 ElementCount VF = ElementCount::getFixed(1));
1501
1502 // Returns true if \p I is an instruction that will be predicated either
1503 // through scalar predication or masked load/store or masked gather/scatter.
1504 // Superset of instructions that return true for isScalarWithPredication.
1505 bool isPredicatedInst(Instruction *I) { 1506 if (!blockNeedsPredication(I->getParent())) 1507 return false; 1508 // Loads and stores that need some form of masked operation are predicated 1509 // instructions. 1510 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1511 return Legal->isMaskRequired(I); 1512 return isScalarWithPredication(I); 1513 } 1514 1515 /// Returns true if \p I is a memory instruction with consecutive memory 1516 /// access that can be widened. 1517 bool 1518 memoryInstructionCanBeWidened(Instruction *I, 1519 ElementCount VF = ElementCount::getFixed(1)); 1520 1521 /// Returns true if \p I is a memory instruction in an interleaved-group 1522 /// of memory accesses that can be vectorized with wide vector loads/stores 1523 /// and shuffles. 1524 bool 1525 interleavedAccessCanBeWidened(Instruction *I, 1526 ElementCount VF = ElementCount::getFixed(1)); 1527 1528 /// Check if \p Instr belongs to any interleaved access group. 1529 bool isAccessInterleaved(Instruction *Instr) { 1530 return InterleaveInfo.isInterleaved(Instr); 1531 } 1532 1533 /// Get the interleaved access group that \p Instr belongs to. 1534 const InterleaveGroup<Instruction> * 1535 getInterleavedAccessGroup(Instruction *Instr) { 1536 return InterleaveInfo.getInterleaveGroup(Instr); 1537 } 1538 1539 /// Returns true if we're required to use a scalar epilogue for at least 1540 /// the final iteration of the original loop. 1541 bool requiresScalarEpilogue() const { 1542 if (!isScalarEpilogueAllowed()) 1543 return false; 1544 // If we might exit from anywhere but the latch, must run the exiting 1545 // iteration in scalar form. 1546 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1547 return true; 1548 return InterleaveInfo.requiresScalarEpilogue(); 1549 } 1550 1551 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1552 /// loop hint annotation. 1553 bool isScalarEpilogueAllowed() const { 1554 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1555 } 1556 1557 /// Returns true if all loop blocks should be masked to fold tail loop. 1558 bool foldTailByMasking() const { return FoldTailByMasking; } 1559 1560 bool blockNeedsPredication(BasicBlock *BB) { 1561 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1562 } 1563 1564 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1565 /// nodes to the chain of instructions representing the reductions. Uses a 1566 /// MapVector to ensure deterministic iteration order. 1567 using ReductionChainMap = 1568 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1569 1570 /// Return the chain of instructions representing an inloop reduction. 1571 const ReductionChainMap &getInLoopReductionChains() const { 1572 return InLoopReductionChains; 1573 } 1574 1575 /// Returns true if the Phi is part of an inloop reduction. 1576 bool isInLoopReduction(PHINode *Phi) const { 1577 return InLoopReductionChains.count(Phi); 1578 } 1579 1580 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1581 /// with factor VF. Return the cost of the instruction, including 1582 /// scalarization overhead if it's needed. 1583 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1584 1585 /// Estimate cost of a call instruction CI if it were vectorized with factor 1586 /// VF. Return the cost of the instruction, including scalarization overhead 1587 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1588 /// scalarized - 1589 /// i.e. 
either vector version isn't available, or is too expensive. 1590 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1591 bool &NeedToScalarize); 1592 1593 /// Invalidates decisions already taken by the cost model. 1594 void invalidateCostModelingDecisions() { 1595 WideningDecisions.clear(); 1596 Uniforms.clear(); 1597 Scalars.clear(); 1598 } 1599 1600 private: 1601 unsigned NumPredStores = 0; 1602 1603 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1604 /// than zero. One is returned if vectorization should best be avoided due 1605 /// to cost. 1606 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1607 ElementCount UserVF); 1608 1609 /// The vectorization cost is a combination of the cost itself and a boolean 1610 /// indicating whether any of the contributing operations will actually 1611 /// operate on 1612 /// vector values after type legalization in the backend. If this latter value 1613 /// is 1614 /// false, then all operations will be scalarized (i.e. no vectorization has 1615 /// actually taken place). 1616 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1617 1618 /// Returns the expected execution cost. The unit of the cost does 1619 /// not matter because we use the 'cost' units to compare different 1620 /// vector widths. The cost that is returned is *not* normalized by 1621 /// the factor width. 1622 VectorizationCostTy expectedCost(ElementCount VF); 1623 1624 /// Returns the execution time cost of an instruction for a given vector 1625 /// width. Vector width of one means scalar. 1626 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1627 1628 /// The cost-computation logic from getInstructionCost which provides 1629 /// the vector type as an output parameter. 1630 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1631 Type *&VectorTy); 1632 1633 /// Return the cost of instructions in an inloop reduction pattern, if I is 1634 /// part of that pattern. 1635 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, 1636 Type *VectorTy, 1637 TTI::TargetCostKind CostKind); 1638 1639 /// Calculate vectorization cost of memory instruction \p I. 1640 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1641 1642 /// The cost computation for scalarized memory instruction. 1643 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1644 1645 /// The cost computation for interleaving group of memory instructions. 1646 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1647 1648 /// The cost computation for Gather/Scatter instruction. 1649 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1650 1651 /// The cost computation for widening instruction \p I with consecutive 1652 /// memory access. 1653 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1654 1655 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1656 /// Load: scalar load + broadcast. 1657 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1658 /// element) 1659 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1660 1661 /// Estimate the overhead of scalarizing an instruction. This is a 1662 /// convenience wrapper for the type-based getScalarizationOverhead API. 
1663 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
1664
1665 /// Returns whether the instruction is a load or store and will be emitted
1666 /// as a vector operation.
1667 bool isConsecutiveLoadOrStore(Instruction *I);
1668
1669 /// Returns true if an artificially high cost for emulated masked memrefs
1670 /// should be used.
1671 bool useEmulatedMaskMemRefHack(Instruction *I);
1672
1673 /// Map of scalar integer values to the smallest bitwidth they can be legally
1674 /// represented as. The vector equivalents of these values should be truncated
1675 /// to this type.
1676 MapVector<Instruction *, uint64_t> MinBWs;
1677
1678 /// A type representing the costs for instructions if they were to be
1679 /// scalarized rather than vectorized. The entries are Instruction-Cost
1680 /// pairs.
1681 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1682
1683 /// A set containing all BasicBlocks that are known to be present after
1684 /// vectorization as predicated blocks.
1685 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1686
1687 /// Records whether it is allowed to have the original scalar loop execute at
1688 /// least once. This may be needed as a fallback loop in case runtime
1689 /// aliasing/dependence checks fail, or to handle the tail/remainder
1690 /// iterations when the trip count is unknown or doesn't divide by the VF,
1691 /// or as a peel-loop to handle gaps in interleave-groups.
1692 /// Under optsize and when the trip count is very small we don't allow any
1693 /// iterations to execute in the scalar loop.
1694 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1695
1696 /// All blocks of the loop are to be masked to fold the tail of scalar
1697 /// iterations.
1697 bool FoldTailByMasking = false;
1698
1699 /// A map holding scalar costs for different vectorization factors. The
1700 /// presence of a cost for an instruction in the mapping indicates that the
1701 /// instruction will be scalarized when vectorizing with the associated
1702 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1703 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1704
1705 /// Holds the instructions known to be uniform after vectorization.
1706 /// The data is collected per VF.
1707 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1708
1709 /// Holds the instructions known to be scalar after vectorization.
1710 /// The data is collected per VF.
1711 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1712
1713 /// Holds the instructions (address computations) that are forced to be
1714 /// scalarized.
1715 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1716
1717 /// PHINodes of the reductions that should be expanded in-loop along with
1718 /// their associated chains of reduction operations, in program order from top
1719 /// (PHI) to bottom.
1720 ReductionChainMap InLoopReductionChains;
1721
1722 /// A map of inloop reduction operations and their immediate chain operand.
1723 /// FIXME: This can be removed once reductions can be costed correctly in
1724 /// vplan. This was added to allow quick lookup to the inloop operations,
1725 /// without having to loop through InLoopReductionChains.
1726 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1727
1728 /// Returns the expected difference in cost from scalarizing the expression
1729 /// feeding a predicated instruction \p PredInst.
The instructions to 1730 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1731 /// non-negative return value implies the expression will be scalarized. 1732 /// Currently, only single-use chains are considered for scalarization. 1733 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1734 ElementCount VF); 1735 1736 /// Collect the instructions that are uniform after vectorization. An 1737 /// instruction is uniform if we represent it with a single scalar value in 1738 /// the vectorized loop corresponding to each vector iteration. Examples of 1739 /// uniform instructions include pointer operands of consecutive or 1740 /// interleaved memory accesses. Note that although uniformity implies an 1741 /// instruction will be scalar, the reverse is not true. In general, a 1742 /// scalarized instruction will be represented by VF scalar values in the 1743 /// vectorized loop, each corresponding to an iteration of the original 1744 /// scalar loop. 1745 void collectLoopUniforms(ElementCount VF); 1746 1747 /// Collect the instructions that are scalar after vectorization. An 1748 /// instruction is scalar if it is known to be uniform or will be scalarized 1749 /// during vectorization. Non-uniform scalarized instructions will be 1750 /// represented by VF values in the vectorized loop, each corresponding to an 1751 /// iteration of the original scalar loop. 1752 void collectLoopScalars(ElementCount VF); 1753 1754 /// Keeps cost model vectorization decision and cost for instructions. 1755 /// Right now it is used for memory instructions only. 1756 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1757 std::pair<InstWidening, InstructionCost>>; 1758 1759 DecisionList WideningDecisions; 1760 1761 /// Returns true if \p V is expected to be vectorized and it needs to be 1762 /// extracted. 1763 bool needsExtract(Value *V, ElementCount VF) const { 1764 Instruction *I = dyn_cast<Instruction>(V); 1765 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1766 TheLoop->isLoopInvariant(I)) 1767 return false; 1768 1769 // Assume we can vectorize V (and hence we need extraction) if the 1770 // scalars are not computed yet. This can happen, because it is called 1771 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1772 // the scalars are collected. That should be a safe assumption in most 1773 // cases, because we check if the operands have vectorizable types 1774 // beforehand in LoopVectorizationLegality. 1775 return Scalars.find(VF) == Scalars.end() || 1776 !isScalarAfterVectorization(I, VF); 1777 }; 1778 1779 /// Returns a range containing only operands needing to be extracted. 1780 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1781 ElementCount VF) { 1782 return SmallVector<Value *, 4>(make_filter_range( 1783 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1784 } 1785 1786 /// Determines if we have the infrastructure to vectorize loop \p L and its 1787 /// epilogue, assuming the main loop is vectorized by \p VF. 1788 bool isCandidateForEpilogueVectorization(const Loop &L, 1789 const ElementCount VF) const; 1790 1791 /// Returns true if epilogue vectorization is considered profitable, and 1792 /// false otherwise. 1793 /// \p VF is the vectorization factor chosen for the original loop. 1794 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1795 1796 public: 1797 /// The loop that we evaluate. 
1798 Loop *TheLoop; 1799 1800 /// Predicated scalar evolution analysis. 1801 PredicatedScalarEvolution &PSE; 1802 1803 /// Loop Info analysis. 1804 LoopInfo *LI; 1805 1806 /// Vectorization legality. 1807 LoopVectorizationLegality *Legal; 1808 1809 /// Vector target information. 1810 const TargetTransformInfo &TTI; 1811 1812 /// Target Library Info. 1813 const TargetLibraryInfo *TLI; 1814 1815 /// Demanded bits analysis. 1816 DemandedBits *DB; 1817 1818 /// Assumption cache. 1819 AssumptionCache *AC; 1820 1821 /// Interface to emit optimization remarks. 1822 OptimizationRemarkEmitter *ORE; 1823 1824 const Function *TheFunction; 1825 1826 /// Loop Vectorize Hint. 1827 const LoopVectorizeHints *Hints; 1828 1829 /// The interleave access information contains groups of interleaved accesses 1830 /// with the same stride and close to each other. 1831 InterleavedAccessInfo &InterleaveInfo; 1832 1833 /// Values to ignore in the cost model. 1834 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1835 1836 /// Values to ignore in the cost model when VF > 1. 1837 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1838 1839 /// Profitable vector factors. 1840 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1841 }; 1842 } // end namespace llvm 1843 1844 /// Helper struct to manage generating runtime checks for vectorization. 1845 /// 1846 /// The runtime checks are created up-front in temporary blocks to allow better 1847 /// estimating the cost and un-linked from the existing IR. After deciding to 1848 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1849 /// temporary blocks are completely removed. 1850 class GeneratedRTChecks { 1851 /// Basic block which contains the generated SCEV checks, if any. 1852 BasicBlock *SCEVCheckBlock = nullptr; 1853 1854 /// The value representing the result of the generated SCEV checks. If it is 1855 /// nullptr, either no SCEV checks have been generated or they have been used. 1856 Value *SCEVCheckCond = nullptr; 1857 1858 /// Basic block which contains the generated memory runtime checks, if any. 1859 BasicBlock *MemCheckBlock = nullptr; 1860 1861 /// The value representing the result of the generated memory runtime checks. 1862 /// If it is nullptr, either no memory runtime checks have been generated or 1863 /// they have been used. 1864 Instruction *MemRuntimeCheckCond = nullptr; 1865 1866 DominatorTree *DT; 1867 LoopInfo *LI; 1868 1869 SCEVExpander SCEVExp; 1870 SCEVExpander MemCheckExp; 1871 1872 public: 1873 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1874 const DataLayout &DL) 1875 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1876 MemCheckExp(SE, DL, "scev.check") {} 1877 1878 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1879 /// accurately estimate the cost of the runtime checks. The blocks are 1880 /// un-linked from the IR and is added back during vector code generation. If 1881 /// there is no vector code generation, the check blocks are removed 1882 /// completely. 1883 void Create(Loop *L, const LoopAccessInfo &LAI, 1884 const SCEVUnionPredicate &UnionPred) { 1885 1886 BasicBlock *LoopHeader = L->getHeader(); 1887 BasicBlock *Preheader = L->getLoopPreheader(); 1888 1889 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1890 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1891 // may be used by SCEVExpander. The blocks will be un-linked from their 1892 // predecessors and removed from LI & DT at the end of the function. 
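    //
    // Illustrative sketch (not from the original source) of the temporary CFG
    // created below when both SCEV and memory checks are needed; the blocks
    // are detached again before this function returns:
    //   preheader -> vector.scevcheck -> vector.memcheck -> loop header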
1893 if (!UnionPred.isAlwaysTrue()) { 1894 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1895 nullptr, "vector.scevcheck"); 1896 1897 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1898 &UnionPred, SCEVCheckBlock->getTerminator()); 1899 } 1900 1901 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1902 if (RtPtrChecking.Need) { 1903 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1904 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1905 "vector.memcheck"); 1906 1907 std::tie(std::ignore, MemRuntimeCheckCond) = 1908 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1909 RtPtrChecking.getChecks(), MemCheckExp); 1910 assert(MemRuntimeCheckCond && 1911 "no RT checks generated although RtPtrChecking " 1912 "claimed checks are required"); 1913 } 1914 1915 if (!MemCheckBlock && !SCEVCheckBlock) 1916 return; 1917 1918 // Unhook the temporary block with the checks, update various places 1919 // accordingly. 1920 if (SCEVCheckBlock) 1921 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1922 if (MemCheckBlock) 1923 MemCheckBlock->replaceAllUsesWith(Preheader); 1924 1925 if (SCEVCheckBlock) { 1926 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1927 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1928 Preheader->getTerminator()->eraseFromParent(); 1929 } 1930 if (MemCheckBlock) { 1931 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1932 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1933 Preheader->getTerminator()->eraseFromParent(); 1934 } 1935 1936 DT->changeImmediateDominator(LoopHeader, Preheader); 1937 if (MemCheckBlock) { 1938 DT->eraseNode(MemCheckBlock); 1939 LI->removeBlock(MemCheckBlock); 1940 } 1941 if (SCEVCheckBlock) { 1942 DT->eraseNode(SCEVCheckBlock); 1943 LI->removeBlock(SCEVCheckBlock); 1944 } 1945 } 1946 1947 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1948 /// unused. 1949 ~GeneratedRTChecks() { 1950 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1951 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1952 if (!SCEVCheckCond) 1953 SCEVCleaner.markResultUsed(); 1954 1955 if (!MemRuntimeCheckCond) 1956 MemCheckCleaner.markResultUsed(); 1957 1958 if (MemRuntimeCheckCond) { 1959 auto &SE = *MemCheckExp.getSE(); 1960 // Memory runtime check generation creates compares that use expanded 1961 // values. Remove them before running the SCEVExpanderCleaners. 1962 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1963 if (MemCheckExp.isInsertedInstruction(&I)) 1964 continue; 1965 SE.forgetValue(&I); 1966 SE.eraseValueFromMap(&I); 1967 I.eraseFromParent(); 1968 } 1969 } 1970 MemCheckCleaner.cleanup(); 1971 SCEVCleaner.cleanup(); 1972 1973 if (SCEVCheckCond) 1974 SCEVCheckBlock->eraseFromParent(); 1975 if (MemRuntimeCheckCond) 1976 MemCheckBlock->eraseFromParent(); 1977 } 1978 1979 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1980 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1981 /// depending on the generated condition. 
1982 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 1983 BasicBlock *LoopVectorPreHeader, 1984 BasicBlock *LoopExitBlock) { 1985 if (!SCEVCheckCond) 1986 return nullptr; 1987 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 1988 if (C->isZero()) 1989 return nullptr; 1990 1991 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 1992 1993 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 1994 // Create new preheader for vector loop. 1995 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 1996 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 1997 1998 SCEVCheckBlock->getTerminator()->eraseFromParent(); 1999 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2000 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2001 SCEVCheckBlock); 2002 2003 DT->addNewBlock(SCEVCheckBlock, Pred); 2004 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2005 2006 ReplaceInstWithInst( 2007 SCEVCheckBlock->getTerminator(), 2008 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2009 // Mark the check as used, to prevent it from being removed during cleanup. 2010 SCEVCheckCond = nullptr; 2011 return SCEVCheckBlock; 2012 } 2013 2014 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2015 /// the branches to branch to the vector preheader or \p Bypass, depending on 2016 /// the generated condition. 2017 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2018 BasicBlock *LoopVectorPreHeader) { 2019 // Check if we generated code that checks in runtime if arrays overlap. 2020 if (!MemRuntimeCheckCond) 2021 return nullptr; 2022 2023 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2024 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2025 MemCheckBlock); 2026 2027 DT->addNewBlock(MemCheckBlock, Pred); 2028 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2029 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2030 2031 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2032 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2033 2034 ReplaceInstWithInst( 2035 MemCheckBlock->getTerminator(), 2036 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2037 MemCheckBlock->getTerminator()->setDebugLoc( 2038 Pred->getTerminator()->getDebugLoc()); 2039 2040 // Mark the check as used, to prevent it from being removed during cleanup. 2041 MemRuntimeCheckCond = nullptr; 2042 return MemCheckBlock; 2043 } 2044 }; 2045 2046 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2047 // vectorization. The loop needs to be annotated with #pragma omp simd 2048 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2049 // vector length information is not provided, vectorization is not considered 2050 // explicit. Interleave hints are not allowed either. These limitations will be 2051 // relaxed in the future. 2052 // Please, note that we are currently forced to abuse the pragma 'clang 2053 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2054 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2055 // provides *explicit vectorization hints* (LV can bypass legal checks and 2056 // assume that vectorization is legal). However, both hints are implemented 2057 // using the same metadata (llvm.loop.vectorize, processed by 2058 // LoopVectorizeHints). This will be fixed in the future when the native IR 2059 // representation for pragma 'omp simd' is introduced. 
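// As an illustration (not part of the original comment), a candidate outer
// loop for this path looks like:
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)    // annotated outer loop
//     for (int j = 0; j < M; ++j)  // inner loop
//       A[i][j] += B[i][j];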
2060 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2061 OptimizationRemarkEmitter *ORE) { 2062 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2063 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2064 2065 // Only outer loops with an explicit vectorization hint are supported. 2066 // Unannotated outer loops are ignored. 2067 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2068 return false; 2069 2070 Function *Fn = OuterLp->getHeader()->getParent(); 2071 if (!Hints.allowVectorization(Fn, OuterLp, 2072 true /*VectorizeOnlyWhenForced*/)) { 2073 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2074 return false; 2075 } 2076 2077 if (Hints.getInterleave() > 1) { 2078 // TODO: Interleave support is future work. 2079 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2080 "outer loops.\n"); 2081 Hints.emitRemarkWithHints(); 2082 return false; 2083 } 2084 2085 return true; 2086 } 2087 2088 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2089 OptimizationRemarkEmitter *ORE, 2090 SmallVectorImpl<Loop *> &V) { 2091 // Collect inner loops and outer loops without irreducible control flow. For 2092 // now, only collect outer loops that have explicit vectorization hints. If we 2093 // are stress testing the VPlan H-CFG construction, we collect the outermost 2094 // loop of every loop nest. 2095 if (L.isInnermost() || VPlanBuildStressTest || 2096 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2097 LoopBlocksRPO RPOT(&L); 2098 RPOT.perform(LI); 2099 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2100 V.push_back(&L); 2101 // TODO: Collect inner loops inside marked outer loops in case 2102 // vectorization fails for the outer loop. Do not invoke 2103 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2104 // already known to be reducible. We can use an inherited attribute for 2105 // that. 2106 return; 2107 } 2108 } 2109 for (Loop *InnerL : L) 2110 collectSupportedLoops(*InnerL, LI, ORE, V); 2111 } 2112 2113 namespace { 2114 2115 /// The LoopVectorize Pass. 2116 struct LoopVectorize : public FunctionPass { 2117 /// Pass identification, replacement for typeid 2118 static char ID; 2119 2120 LoopVectorizePass Impl; 2121 2122 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2123 bool VectorizeOnlyWhenForced = false) 2124 : FunctionPass(ID), 2125 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2126 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2127 } 2128 2129 bool runOnFunction(Function &F) override { 2130 if (skipFunction(F)) 2131 return false; 2132 2133 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2134 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2135 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2136 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2137 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2138 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2139 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2140 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2141 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2142 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2143 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2144 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2145 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2146 2147 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2148 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2149 2150 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2151 GetLAA, *ORE, PSI).MadeAnyChange; 2152 } 2153 2154 void getAnalysisUsage(AnalysisUsage &AU) const override { 2155 AU.addRequired<AssumptionCacheTracker>(); 2156 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2157 AU.addRequired<DominatorTreeWrapperPass>(); 2158 AU.addRequired<LoopInfoWrapperPass>(); 2159 AU.addRequired<ScalarEvolutionWrapperPass>(); 2160 AU.addRequired<TargetTransformInfoWrapperPass>(); 2161 AU.addRequired<AAResultsWrapperPass>(); 2162 AU.addRequired<LoopAccessLegacyAnalysis>(); 2163 AU.addRequired<DemandedBitsWrapperPass>(); 2164 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2165 AU.addRequired<InjectTLIMappingsLegacy>(); 2166 2167 // We currently do not preserve loopinfo/dominator analyses with outer loop 2168 // vectorization. Until this is addressed, mark these analyses as preserved 2169 // only for non-VPlan-native path. 2170 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2171 if (!EnableVPlanNativePath) { 2172 AU.addPreserved<LoopInfoWrapperPass>(); 2173 AU.addPreserved<DominatorTreeWrapperPass>(); 2174 } 2175 2176 AU.addPreserved<BasicAAWrapperPass>(); 2177 AU.addPreserved<GlobalsAAWrapperPass>(); 2178 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2179 } 2180 }; 2181 2182 } // end anonymous namespace 2183 2184 //===----------------------------------------------------------------------===// 2185 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2186 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2187 //===----------------------------------------------------------------------===// 2188 2189 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2190 // We need to place the broadcast of invariant variables outside the loop, 2191 // but only if it's proven safe to do so. Else, broadcast will be inside 2192 // vector loop body. 2193 Instruction *Instr = dyn_cast<Instruction>(V); 2194 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2195 (!Instr || 2196 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2197 // Place the code for broadcasting invariant variables in the new preheader. 2198 IRBuilder<>::InsertPointGuard Guard(Builder); 2199 if (SafeToHoist) 2200 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2201 2202 // Broadcast the scalar into all locations in the vector. 
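  // Illustrative sketch (assumed VF and element type, not from the original
  // source): for VF = 4 the splat created below is the usual insertelement +
  // shufflevector pattern, e.g.
  //   %b.ins     = insertelement <4 x i32> poison, i32 %v, i32 0
  //   %broadcast = shufflevector <4 x i32> %b.ins, <4 x i32> poison,
  //                              <4 x i32> zeroinitializer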
2203 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2204 2205 return Shuf; 2206 } 2207 2208 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2209 const InductionDescriptor &II, Value *Step, Value *Start, 2210 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2211 VPTransformState &State) { 2212 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2213 "Expected either an induction phi-node or a truncate of it!"); 2214 2215 // Construct the initial value of the vector IV in the vector loop preheader 2216 auto CurrIP = Builder.saveIP(); 2217 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2218 if (isa<TruncInst>(EntryVal)) { 2219 assert(Start->getType()->isIntegerTy() && 2220 "Truncation requires an integer type"); 2221 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2222 Step = Builder.CreateTrunc(Step, TruncType); 2223 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2224 } 2225 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2226 Value *SteppedStart = 2227 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2228 2229 // We create vector phi nodes for both integer and floating-point induction 2230 // variables. Here, we determine the kind of arithmetic we will perform. 2231 Instruction::BinaryOps AddOp; 2232 Instruction::BinaryOps MulOp; 2233 if (Step->getType()->isIntegerTy()) { 2234 AddOp = Instruction::Add; 2235 MulOp = Instruction::Mul; 2236 } else { 2237 AddOp = II.getInductionOpcode(); 2238 MulOp = Instruction::FMul; 2239 } 2240 2241 // Multiply the vectorization factor by the step using integer or 2242 // floating-point arithmetic as appropriate. 2243 Value *ConstVF = 2244 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2245 Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF); 2246 2247 // Create a vector splat to use in the induction update. 2248 // 2249 // FIXME: If the step is non-constant, we create the vector splat with 2250 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2251 // handle a constant vector splat. 2252 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2253 Value *SplatVF = isa<Constant>(Mul) 2254 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2255 : Builder.CreateVectorSplat(VF, Mul); 2256 Builder.restoreIP(CurrIP); 2257 2258 // We may need to add the step a number of times, depending on the unroll 2259 // factor. The last of those goes into the PHI. 2260 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2261 &*LoopVectorBody->getFirstInsertionPt()); 2262 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2263 Instruction *LastInduction = VecInd; 2264 for (unsigned Part = 0; Part < UF; ++Part) { 2265 State.set(Def, LastInduction, Part); 2266 2267 if (isa<TruncInst>(EntryVal)) 2268 addMetadata(LastInduction, EntryVal); 2269 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2270 State, Part); 2271 2272 LastInduction = cast<Instruction>( 2273 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2274 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2275 } 2276 2277 // Move the last step to the end of the latch block. This ensures consistent 2278 // placement of all induction updates. 
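  //
  // Illustrative sketch (unit step, VF = 4, UF = 1; constants and types are
  // placeholders, not from the original source) of the pattern built above:
  //   vector.body:
  //     %vec.ind      = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ],
  //                                   [ %vec.ind.next, %vector.body ]
  //     ...
  //     %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>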
2279 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2280 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2281 auto *ICmp = cast<Instruction>(Br->getCondition()); 2282 LastInduction->moveBefore(ICmp); 2283 LastInduction->setName("vec.ind.next"); 2284 2285 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2286 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2287 } 2288 2289 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2290 return Cost->isScalarAfterVectorization(I, VF) || 2291 Cost->isProfitableToScalarize(I, VF); 2292 } 2293 2294 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2295 if (shouldScalarizeInstruction(IV)) 2296 return true; 2297 auto isScalarInst = [&](User *U) -> bool { 2298 auto *I = cast<Instruction>(U); 2299 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2300 }; 2301 return llvm::any_of(IV->users(), isScalarInst); 2302 } 2303 2304 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2305 const InductionDescriptor &ID, const Instruction *EntryVal, 2306 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2307 unsigned Part, unsigned Lane) { 2308 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2309 "Expected either an induction phi-node or a truncate of it!"); 2310 2311 // This induction variable is not the phi from the original loop but the 2312 // newly-created IV based on the proof that casted Phi is equal to the 2313 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2314 // re-uses the same InductionDescriptor that original IV uses but we don't 2315 // have to do any recording in this case - that is done when original IV is 2316 // processed. 2317 if (isa<TruncInst>(EntryVal)) 2318 return; 2319 2320 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2321 if (Casts.empty()) 2322 return; 2323 // Only the first Cast instruction in the Casts vector is of interest. 2324 // The rest of the Casts (if exist) have no uses outside the 2325 // induction update chain itself. 2326 if (Lane < UINT_MAX) 2327 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2328 else 2329 State.set(CastDef, VectorLoopVal, Part); 2330 } 2331 2332 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2333 TruncInst *Trunc, VPValue *Def, 2334 VPValue *CastDef, 2335 VPTransformState &State) { 2336 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2337 "Primary induction variable must have an integer type"); 2338 2339 auto II = Legal->getInductionVars().find(IV); 2340 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2341 2342 auto ID = II->second; 2343 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2344 2345 // The value from the original loop to which we are mapping the new induction 2346 // variable. 2347 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2348 2349 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2350 2351 // Generate code for the induction step. 
Note that induction steps are 2352 // required to be loop-invariant 2353 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2354 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2355 "Induction step should be loop invariant"); 2356 if (PSE.getSE()->isSCEVable(IV->getType())) { 2357 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2358 return Exp.expandCodeFor(Step, Step->getType(), 2359 LoopVectorPreHeader->getTerminator()); 2360 } 2361 return cast<SCEVUnknown>(Step)->getValue(); 2362 }; 2363 2364 // The scalar value to broadcast. This is derived from the canonical 2365 // induction variable. If a truncation type is given, truncate the canonical 2366 // induction variable and step. Otherwise, derive these values from the 2367 // induction descriptor. 2368 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2369 Value *ScalarIV = Induction; 2370 if (IV != OldInduction) { 2371 ScalarIV = IV->getType()->isIntegerTy() 2372 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2373 : Builder.CreateCast(Instruction::SIToFP, Induction, 2374 IV->getType()); 2375 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2376 ScalarIV->setName("offset.idx"); 2377 } 2378 if (Trunc) { 2379 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2380 assert(Step->getType()->isIntegerTy() && 2381 "Truncation requires an integer step"); 2382 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2383 Step = Builder.CreateTrunc(Step, TruncType); 2384 } 2385 return ScalarIV; 2386 }; 2387 2388 // Create the vector values from the scalar IV, in the absence of creating a 2389 // vector IV. 2390 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2391 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2392 for (unsigned Part = 0; Part < UF; ++Part) { 2393 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2394 Value *EntryPart = 2395 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2396 ID.getInductionOpcode()); 2397 State.set(Def, EntryPart, Part); 2398 if (Trunc) 2399 addMetadata(EntryPart, Trunc); 2400 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2401 State, Part); 2402 } 2403 }; 2404 2405 // Fast-math-flags propagate from the original induction instruction. 2406 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2407 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2408 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2409 2410 // Now do the actual transformations, and start with creating the step value. 2411 Value *Step = CreateStepValue(ID.getStep()); 2412 if (VF.isZero() || VF.isScalar()) { 2413 Value *ScalarIV = CreateScalarIV(Step); 2414 CreateSplatIV(ScalarIV, Step); 2415 return; 2416 } 2417 2418 // Determine if we want a scalar version of the induction variable. This is 2419 // true if the induction variable itself is not widened, or if it has at 2420 // least one user in the loop that is not widened. 2421 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2422 if (!NeedsScalarIV) { 2423 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2424 State); 2425 return; 2426 } 2427 2428 // Try to create a new independent vector induction variable. If we can't 2429 // create the phi node, we will splat the scalar induction variable in each 2430 // loop iteration. 
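  //
  // Illustrative sketch (not from the original source): for VF = 4 and a unit
  // step, the alternatives below roughly produce either
  //   %vec.ind = phi <4 x i64> ...              ; a vector IV (see above)
  // or a scalar IV plus per-lane steps from buildScalarSteps, e.g.
  //   %iv.lane0 = add i64 %scalar.iv, 0
  //   %iv.lane1 = add i64 %scalar.iv, 1
  //   ...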
2431 if (!shouldScalarizeInstruction(EntryVal)) { 2432 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2433 State); 2434 Value *ScalarIV = CreateScalarIV(Step); 2435 // Create scalar steps that can be used by instructions we will later 2436 // scalarize. Note that the addition of the scalar steps will not increase 2437 // the number of instructions in the loop in the common case prior to 2438 // InstCombine. We will be trading one vector extract for each scalar step. 2439 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2440 return; 2441 } 2442 2443 // All IV users are scalar instructions, so only emit a scalar IV, not a 2444 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2445 // predicate used by the masked loads/stores. 2446 Value *ScalarIV = CreateScalarIV(Step); 2447 if (!Cost->isScalarEpilogueAllowed()) 2448 CreateSplatIV(ScalarIV, Step); 2449 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2450 } 2451 2452 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2453 Instruction::BinaryOps BinOp) { 2454 // Create and check the types. 2455 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2456 int VLen = ValVTy->getNumElements(); 2457 2458 Type *STy = Val->getType()->getScalarType(); 2459 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2460 "Induction Step must be an integer or FP"); 2461 assert(Step->getType() == STy && "Step has wrong type"); 2462 2463 SmallVector<Constant *, 8> Indices; 2464 2465 if (STy->isIntegerTy()) { 2466 // Create a vector of consecutive numbers from zero to VF. 2467 for (int i = 0; i < VLen; ++i) 2468 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2469 2470 // Add the consecutive indices to the vector value. 2471 Constant *Cv = ConstantVector::get(Indices); 2472 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2473 Step = Builder.CreateVectorSplat(VLen, Step); 2474 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2475 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2476 // which can be found from the original scalar operations. 2477 Step = Builder.CreateMul(Cv, Step); 2478 return Builder.CreateAdd(Val, Step, "induction"); 2479 } 2480 2481 // Floating point induction. 2482 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2483 "Binary Opcode should be specified for FP induction"); 2484 // Create a vector of consecutive numbers from zero to VF. 2485 for (int i = 0; i < VLen; ++i) 2486 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2487 2488 // Add the consecutive indices to the vector value. 2489 // Floating-point operations inherit FMF via the builder's flags. 2490 Constant *Cv = ConstantVector::get(Indices); 2491 Step = Builder.CreateVectorSplat(VLen, Step); 2492 Value *MulOp = Builder.CreateFMul(Cv, Step); 2493 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2494 } 2495 2496 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2497 Instruction *EntryVal, 2498 const InductionDescriptor &ID, 2499 VPValue *Def, VPValue *CastDef, 2500 VPTransformState &State) { 2501 // We shouldn't have to build scalar steps if we aren't vectorizing. 2502 assert(VF.isVector() && "VF should be greater than one"); 2503 // Get the value type and ensure it and the step have the same integer type. 
2504 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2505 assert(ScalarIVTy == Step->getType() && 2506 "Val and Step should have the same type"); 2507 2508 // We build scalar steps for both integer and floating-point induction 2509 // variables. Here, we determine the kind of arithmetic we will perform. 2510 Instruction::BinaryOps AddOp; 2511 Instruction::BinaryOps MulOp; 2512 if (ScalarIVTy->isIntegerTy()) { 2513 AddOp = Instruction::Add; 2514 MulOp = Instruction::Mul; 2515 } else { 2516 AddOp = ID.getInductionOpcode(); 2517 MulOp = Instruction::FMul; 2518 } 2519 2520 // Determine the number of scalars we need to generate for each unroll 2521 // iteration. If EntryVal is uniform, we only need to generate the first 2522 // lane. Otherwise, we generate all VF values. 2523 unsigned Lanes = 2524 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2525 ? 1 2526 : VF.getKnownMinValue(); 2527 assert((!VF.isScalable() || Lanes == 1) && 2528 "Should never scalarize a scalable vector"); 2529 // Compute the scalar steps and save the results in State. 2530 for (unsigned Part = 0; Part < UF; ++Part) { 2531 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2532 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2533 ScalarIVTy->getScalarSizeInBits()); 2534 Value *StartIdx = 2535 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2536 if (ScalarIVTy->isFloatingPointTy()) 2537 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2538 StartIdx = Builder.CreateBinOp( 2539 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2540 // The step returned by `createStepForVF` is a runtime-evaluated value 2541 // when VF is scalable. Otherwise, it should be folded into a Constant. 2542 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2543 "Expected StartIdx to be folded to a constant when VF is not " 2544 "scalable"); 2545 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2546 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2547 State.set(Def, Add, VPIteration(Part, Lane)); 2548 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2549 Part, Lane); 2550 } 2551 } 2552 } 2553 2554 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2555 const VPIteration &Instance, 2556 VPTransformState &State) { 2557 Value *ScalarInst = State.get(Def, Instance); 2558 Value *VectorValue = State.get(Def, Instance.Part); 2559 VectorValue = Builder.CreateInsertElement( 2560 VectorValue, ScalarInst, 2561 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2562 State.set(Def, VectorValue, Instance.Part); 2563 } 2564 2565 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2566 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2567 return Builder.CreateVectorReverse(Vec, "reverse"); 2568 } 2569 2570 // Return whether we allow using masked interleave-groups (for dealing with 2571 // strided loads/stores that reside in predicated blocks, or for dealing 2572 // with gaps). 2573 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2574 // If an override option has been passed in for interleaved accesses, use it. 2575 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2576 return EnableMaskedInterleavedMemAccesses; 2577 2578 return TTI.enableMaskedInterleavedAccessVectorization(); 2579 } 2580 2581 // Try to vectorize the interleave group that \p Instr belongs to. 2582 // 2583 // E.g. 
Translate following interleaved load group (factor = 3): 2584 // for (i = 0; i < N; i+=3) { 2585 // R = Pic[i]; // Member of index 0 2586 // G = Pic[i+1]; // Member of index 1 2587 // B = Pic[i+2]; // Member of index 2 2588 // ... // do something to R, G, B 2589 // } 2590 // To: 2591 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2592 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2593 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2594 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2595 // 2596 // Or translate following interleaved store group (factor = 3): 2597 // for (i = 0; i < N; i+=3) { 2598 // ... do something to R, G, B 2599 // Pic[i] = R; // Member of index 0 2600 // Pic[i+1] = G; // Member of index 1 2601 // Pic[i+2] = B; // Member of index 2 2602 // } 2603 // To: 2604 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2605 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2606 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2607 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2608 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2609 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2610 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2611 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2612 VPValue *BlockInMask) { 2613 Instruction *Instr = Group->getInsertPos(); 2614 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2615 2616 // Prepare for the vector type of the interleaved load/store. 2617 Type *ScalarTy = getMemInstValueType(Instr); 2618 unsigned InterleaveFactor = Group->getFactor(); 2619 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2620 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2621 2622 // Prepare for the new pointers. 2623 SmallVector<Value *, 2> AddrParts; 2624 unsigned Index = Group->getIndex(Instr); 2625 2626 // TODO: extend the masked interleaved-group support to reversed access. 2627 assert((!BlockInMask || !Group->isReverse()) && 2628 "Reversed masked interleave-group not supported."); 2629 2630 // If the group is reverse, adjust the index to refer to the last vector lane 2631 // instead of the first. We adjust the index from the first vector lane, 2632 // rather than directly getting the pointer for lane VF - 1, because the 2633 // pointer operand of the interleaved access is supposed to be uniform. For 2634 // uniform instructions, we're only required to generate a value for the 2635 // first vector lane in each unroll iteration. 2636 assert(!VF.isScalable() && 2637 "scalable vector reverse operation is not implemented"); 2638 if (Group->isReverse()) 2639 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2640 2641 for (unsigned Part = 0; Part < UF; Part++) { 2642 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2643 setDebugLocFromInst(Builder, AddrPart); 2644 2645 // Notice current instruction could be any index. Need to adjust the address 2646 // to the member of index 0. 2647 // 2648 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2649 // b = A[i]; // Member of index 0 2650 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2651 // 2652 // E.g. A[i+1] = a; // Member of index 1 2653 // A[i] = b; // Member of index 0 2654 // A[i+2] = c; // Member of index 2 (Current instruction) 2655 // Current pointer is pointed to A[i+2], adjust it to A[i]. 
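    //
    // Illustrative sketch (assumed element type, not from the original
    // source): for the A[i+2] store case above, Index = 2 and the adjustment
    // below emits roughly
    //   %adj.ptr = getelementptr i32, i32* %addr.part, i32 -2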
2656 2657 bool InBounds = false; 2658 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2659 InBounds = gep->isInBounds(); 2660 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2661 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2662 2663 // Cast to the vector pointer type. 2664 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2665 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2666 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2667 } 2668 2669 setDebugLocFromInst(Builder, Instr); 2670 Value *PoisonVec = PoisonValue::get(VecTy); 2671 2672 Value *MaskForGaps = nullptr; 2673 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2674 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2675 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2676 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2677 } 2678 2679 // Vectorize the interleaved load group. 2680 if (isa<LoadInst>(Instr)) { 2681 // For each unroll part, create a wide load for the group. 2682 SmallVector<Value *, 2> NewLoads; 2683 for (unsigned Part = 0; Part < UF; Part++) { 2684 Instruction *NewLoad; 2685 if (BlockInMask || MaskForGaps) { 2686 assert(useMaskedInterleavedAccesses(*TTI) && 2687 "masked interleaved groups are not allowed."); 2688 Value *GroupMask = MaskForGaps; 2689 if (BlockInMask) { 2690 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2691 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2692 Value *ShuffledMask = Builder.CreateShuffleVector( 2693 BlockInMaskPart, 2694 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2695 "interleaved.mask"); 2696 GroupMask = MaskForGaps 2697 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2698 MaskForGaps) 2699 : ShuffledMask; 2700 } 2701 NewLoad = 2702 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2703 GroupMask, PoisonVec, "wide.masked.vec"); 2704 } 2705 else 2706 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2707 Group->getAlign(), "wide.vec"); 2708 Group->addMetadata(NewLoad); 2709 NewLoads.push_back(NewLoad); 2710 } 2711 2712 // For each member in the group, shuffle out the appropriate data from the 2713 // wide loads. 2714 unsigned J = 0; 2715 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2716 Instruction *Member = Group->getMember(I); 2717 2718 // Skip the gaps in the group. 2719 if (!Member) 2720 continue; 2721 2722 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2723 auto StrideMask = 2724 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2725 for (unsigned Part = 0; Part < UF; Part++) { 2726 Value *StridedVec = Builder.CreateShuffleVector( 2727 NewLoads[Part], StrideMask, "strided.vec"); 2728 2729 // If this member has different type, cast the result type. 2730 if (Member->getType() != ScalarTy) { 2731 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2732 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2733 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2734 } 2735 2736 if (Group->isReverse()) 2737 StridedVec = reverseVector(StridedVec); 2738 2739 State.set(VPDefs[J], StridedVec, Part); 2740 } 2741 ++J; 2742 } 2743 return; 2744 } 2745 2746 // The sub vector type for current instruction. 
2747 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2748 auto *SubVT = VectorType::get(ScalarTy, VF); 2749 2750 // Vectorize the interleaved store group. 2751 for (unsigned Part = 0; Part < UF; Part++) { 2752 // Collect the stored vector from each member. 2753 SmallVector<Value *, 4> StoredVecs; 2754 for (unsigned i = 0; i < InterleaveFactor; i++) { 2755 // Interleaved store group doesn't allow a gap, so each index has a member 2756 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2757 2758 Value *StoredVec = State.get(StoredValues[i], Part); 2759 2760 if (Group->isReverse()) 2761 StoredVec = reverseVector(StoredVec); 2762 2763 // If this member has different type, cast it to a unified type. 2764 2765 if (StoredVec->getType() != SubVT) 2766 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2767 2768 StoredVecs.push_back(StoredVec); 2769 } 2770 2771 // Concatenate all vectors into a wide vector. 2772 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2773 2774 // Interleave the elements in the wide vector. 2775 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2776 Value *IVec = Builder.CreateShuffleVector( 2777 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2778 "interleaved.vec"); 2779 2780 Instruction *NewStoreInstr; 2781 if (BlockInMask) { 2782 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2783 Value *ShuffledMask = Builder.CreateShuffleVector( 2784 BlockInMaskPart, 2785 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2786 "interleaved.mask"); 2787 NewStoreInstr = Builder.CreateMaskedStore( 2788 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2789 } 2790 else 2791 NewStoreInstr = 2792 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2793 2794 Group->addMetadata(NewStoreInstr); 2795 } 2796 } 2797 2798 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2799 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2800 VPValue *StoredValue, VPValue *BlockInMask) { 2801 // Attempt to issue a wide load. 2802 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2803 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2804 2805 assert((LI || SI) && "Invalid Load/Store instruction"); 2806 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2807 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2808 2809 LoopVectorizationCostModel::InstWidening Decision = 2810 Cost->getWideningDecision(Instr, VF); 2811 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2812 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2813 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2814 "CM decision is not to widen the memory instruction"); 2815 2816 Type *ScalarDataTy = getMemInstValueType(Instr); 2817 2818 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2819 const Align Alignment = getLoadStoreAlignment(Instr); 2820 2821 // Determine if the pointer operand of the access is either consecutive or 2822 // reverse consecutive. 2823 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2824 bool ConsecutiveStride = 2825 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2826 bool CreateGatherScatter = 2827 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2828 2829 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2830 // gather/scatter. Otherwise Decision should have been to Scalarize. 
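// As a rough illustration of the three widening decisions handled here
// (shorthand IR, hypothetical names, assuming VF = 4):
//   CM_Widen          -> %wide.load = load <4 x i32>, <4 x i32>* %vec.ptr
//   CM_Widen_Reverse  -> the same wide access addressed from the last lane,
//                        followed by a lane-reversing shuffle
//   CM_GatherScatter  -> a call to llvm.masked.gather / llvm.masked.scatter
//                        taking a vector of pointers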
2831 assert((ConsecutiveStride || CreateGatherScatter) && 2832 "The instruction should be scalarized"); 2833 (void)ConsecutiveStride; 2834 2835 VectorParts BlockInMaskParts(UF); 2836 bool isMaskRequired = BlockInMask; 2837 if (isMaskRequired) 2838 for (unsigned Part = 0; Part < UF; ++Part) 2839 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2840 2841 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2842 // Calculate the pointer for the specific unroll-part. 2843 GetElementPtrInst *PartPtr = nullptr; 2844 2845 bool InBounds = false; 2846 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2847 InBounds = gep->isInBounds(); 2848 if (Reverse) { 2849 // If the address is consecutive but reversed, then the 2850 // wide store needs to start at the last vector element. 2851 // RunTimeVF = VScale * VF.getKnownMinValue() 2852 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2853 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2854 // NumElt = -Part * RunTimeVF 2855 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2856 // LastLane = 1 - RunTimeVF 2857 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2858 PartPtr = 2859 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2860 PartPtr->setIsInBounds(InBounds); 2861 PartPtr = cast<GetElementPtrInst>( 2862 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2863 PartPtr->setIsInBounds(InBounds); 2864 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2865 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2866 } else { 2867 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2868 PartPtr = cast<GetElementPtrInst>( 2869 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2870 PartPtr->setIsInBounds(InBounds); 2871 } 2872 2873 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2874 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2875 }; 2876 2877 // Handle Stores: 2878 if (SI) { 2879 setDebugLocFromInst(Builder, SI); 2880 2881 for (unsigned Part = 0; Part < UF; ++Part) { 2882 Instruction *NewSI = nullptr; 2883 Value *StoredVal = State.get(StoredValue, Part); 2884 if (CreateGatherScatter) { 2885 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2886 Value *VectorGep = State.get(Addr, Part); 2887 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2888 MaskPart); 2889 } else { 2890 if (Reverse) { 2891 // If we store to reverse consecutive memory locations, then we need 2892 // to reverse the order of elements in the stored value. 2893 StoredVal = reverseVector(StoredVal); 2894 // We don't want to update the value in the map as it might be used in 2895 // another expression. So don't call resetVectorValue(StoredVal). 2896 } 2897 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2898 if (isMaskRequired) 2899 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2900 BlockInMaskParts[Part]); 2901 else 2902 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2903 } 2904 addMetadata(NewSI, SI); 2905 } 2906 return; 2907 } 2908 2909 // Handle loads. 2910 assert(LI && "Must have a load instruction"); 2911 setDebugLocFromInst(Builder, LI); 2912 for (unsigned Part = 0; Part < UF; ++Part) { 2913 Value *NewLI; 2914 if (CreateGatherScatter) { 2915 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2916 Value *VectorGep = State.get(Addr, Part); 2917 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2918 nullptr, "wide.masked.gather"); 2919 addMetadata(NewLI, LI); 2920 } else { 2921 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2922 if (isMaskRequired) 2923 NewLI = Builder.CreateMaskedLoad( 2924 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2925 "wide.masked.load"); 2926 else 2927 NewLI = 2928 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2929 2930 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2931 addMetadata(NewLI, LI); 2932 if (Reverse) 2933 NewLI = reverseVector(NewLI); 2934 } 2935 2936 State.set(Def, NewLI, Part); 2937 } 2938 } 2939 2940 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 2941 VPUser &User, 2942 const VPIteration &Instance, 2943 bool IfPredicateInstr, 2944 VPTransformState &State) { 2945 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2946 2947 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2948 // the first lane and part. 2949 if (isa<NoAliasScopeDeclInst>(Instr)) 2950 if (!Instance.isFirstIteration()) 2951 return; 2952 2953 setDebugLocFromInst(Builder, Instr); 2954 2955 // Does this instruction return a value ? 2956 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2957 2958 Instruction *Cloned = Instr->clone(); 2959 if (!IsVoidRetTy) 2960 Cloned->setName(Instr->getName() + ".cloned"); 2961 2962 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2963 Builder.GetInsertPoint()); 2964 // Replace the operands of the cloned instructions with their scalar 2965 // equivalents in the new loop. 2966 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2967 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2968 auto InputInstance = Instance; 2969 if (!Operand || !OrigLoop->contains(Operand) || 2970 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2971 InputInstance.Lane = VPLane::getFirstLane(); 2972 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2973 Cloned->setOperand(op, NewOp); 2974 } 2975 addNewMetadata(Cloned, Instr); 2976 2977 // Place the cloned scalar in the new loop. 2978 Builder.Insert(Cloned); 2979 2980 State.set(Def, Cloned, Instance); 2981 2982 // If we just cloned a new assumption, add it the assumption cache. 2983 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2984 if (II->getIntrinsicID() == Intrinsic::assume) 2985 AC->registerAssumption(II); 2986 2987 // End if-block. 2988 if (IfPredicateInstr) 2989 PredicatedInstructions.push_back(Cloned); 2990 } 2991 2992 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2993 Value *End, Value *Step, 2994 Instruction *DL) { 2995 BasicBlock *Header = L->getHeader(); 2996 BasicBlock *Latch = L->getLoopLatch(); 2997 // As we're just creating this loop, it's possible no latch exists 2998 // yet. If so, use the header as this will be a single block loop. 2999 if (!Latch) 3000 Latch = Header; 3001 3002 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3003 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3004 setDebugLocFromInst(Builder, OldInst); 3005 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3006 3007 Builder.SetInsertPoint(Latch->getTerminator()); 3008 setDebugLocFromInst(Builder, OldInst); 3009 3010 // Create i+1 and fill the PHINode. 
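// In shorthand IR the induction built here looks roughly like this
// (illustrative only; value and block names are hypothetical):
//   %index      = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
//   %index.next = add i64 %index, %step
//   %cmp        = icmp eq i64 %index.next, %end
//   br i1 %cmp, label %exit, label %header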
3011 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3012 Induction->addIncoming(Start, L->getLoopPreheader()); 3013 Induction->addIncoming(Next, Latch); 3014 // Create the compare. 3015 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3016 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3017 3018 // Now we have two terminators. Remove the old one from the block. 3019 Latch->getTerminator()->eraseFromParent(); 3020 3021 return Induction; 3022 } 3023 3024 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3025 if (TripCount) 3026 return TripCount; 3027 3028 assert(L && "Create Trip Count for null loop."); 3029 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3030 // Find the loop boundaries. 3031 ScalarEvolution *SE = PSE.getSE(); 3032 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3033 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3034 "Invalid loop count"); 3035 3036 Type *IdxTy = Legal->getWidestInductionType(); 3037 assert(IdxTy && "No type for induction"); 3038 3039 // The exit count might have the type of i64 while the phi is i32. This can 3040 // happen if we have an induction variable that is sign extended before the 3041 // compare. The only way that we get a backedge taken count is that the 3042 // induction variable was signed and as such will not overflow. In such a case 3043 // truncation is legal. 3044 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3045 IdxTy->getPrimitiveSizeInBits()) 3046 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3047 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3048 3049 // Get the total trip count from the count by adding 1. 3050 const SCEV *ExitCount = SE->getAddExpr( 3051 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3052 3053 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3054 3055 // Expand the trip count and place the new instructions in the preheader. 3056 // Notice that the pre-header does not change, only the loop body. 3057 SCEVExpander Exp(*SE, DL, "induction"); 3058 3059 // Count holds the overall loop count (N). 3060 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3061 L->getLoopPreheader()->getTerminator()); 3062 3063 if (TripCount->getType()->isPointerTy()) 3064 TripCount = 3065 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3066 L->getLoopPreheader()->getTerminator()); 3067 3068 return TripCount; 3069 } 3070 3071 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3072 if (VectorTripCount) 3073 return VectorTripCount; 3074 3075 Value *TC = getOrCreateTripCount(L); 3076 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3077 3078 Type *Ty = TC->getType(); 3079 // This is where we can make the step a runtime constant. 3080 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3081 3082 // If the tail is to be folded by masking, round the number of iterations N 3083 // up to a multiple of Step instead of rounding down. This is done by first 3084 // adding Step-1 and then rounding down. Note that it's ok if this addition 3085 // overflows: the vector induction variable will eventually wrap to zero given 3086 // that it starts at zero and its Step is a power of two; the loop will then 3087 // exit, with the last early-exit vector comparison also producing all-true. 
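// A worked example (illustrative, not from the source): with a trip count
// N = 10 and Step = VF * UF = 4, folding the tail rounds N up to
// 10 + (4 - 1) = 13; the remainder computation below then yields
// 13 - (13 % 4) = 12, i.e. three masked vector iterations that cover all
// ten scalar iterations.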
3088 if (Cost->foldTailByMasking()) { 3089 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3090 "VF*UF must be a power of 2 when folding tail by masking"); 3091 assert(!VF.isScalable() && 3092 "Tail folding not yet supported for scalable vectors"); 3093 TC = Builder.CreateAdd( 3094 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3095 } 3096 3097 // Now we need to generate the expression for the part of the loop that the 3098 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3099 // iterations are not required for correctness, or N - Step, otherwise. Step 3100 // is equal to the vectorization factor (number of SIMD elements) times the 3101 // unroll factor (number of SIMD instructions). 3102 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3103 3104 // There are two cases where we need to ensure (at least) the last iteration 3105 // runs in the scalar remainder loop. Thus, if the step evenly divides 3106 // the trip count, we set the remainder to be equal to the step. If the step 3107 // does not evenly divide the trip count, no adjustment is necessary since 3108 // there will already be scalar iterations. Note that the minimum iterations 3109 // check ensures that N >= Step. The cases are: 3110 // 1) If there is a non-reversed interleaved group that may speculatively 3111 // access memory out-of-bounds. 3112 // 2) If any instruction may follow a conditionally taken exit. That is, if 3113 // the loop contains multiple exiting blocks, or a single exiting block 3114 // which is not the latch. 3115 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3116 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3117 R = Builder.CreateSelect(IsZero, Step, R); 3118 } 3119 3120 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3121 3122 return VectorTripCount; 3123 } 3124 3125 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3126 const DataLayout &DL) { 3127 // Verify that V is a vector type with same number of elements as DstVTy. 3128 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3129 unsigned VF = DstFVTy->getNumElements(); 3130 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3131 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3132 Type *SrcElemTy = SrcVecTy->getElementType(); 3133 Type *DstElemTy = DstFVTy->getElementType(); 3134 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3135 "Vector elements must have same size"); 3136 3137 // Do a direct cast if element types are castable. 3138 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3139 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3140 } 3141 // V cannot be directly casted to desired vector type. 3142 // May happen when V is a floating point vector but DstVTy is a vector of 3143 // pointers or vice-versa. Handle this using a two-step bitcast using an 3144 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
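// For example (an illustrative sketch, assuming a 64-bit target where both
// element types are 64 bits wide): casting <2 x double> to <2 x i8*> is
// emitted as <2 x double> --bitcast--> <2 x i64> --inttoptr--> <2 x i8*>.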
3145 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3146 "Only one type should be a pointer type"); 3147 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3148 "Only one type should be a floating point type"); 3149 Type *IntTy = 3150 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3151 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3152 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3153 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3154 } 3155 3156 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3157 BasicBlock *Bypass) { 3158 Value *Count = getOrCreateTripCount(L); 3159 // Reuse existing vector loop preheader for TC checks. 3160 // Note that new preheader block is generated for vector loop. 3161 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3162 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3163 3164 // Generate code to check if the loop's trip count is less than VF * UF, or 3165 // equal to it in case a scalar epilogue is required; this implies that the 3166 // vector trip count is zero. This check also covers the case where adding one 3167 // to the backedge-taken count overflowed leading to an incorrect trip count 3168 // of zero. In this case we will also jump to the scalar loop. 3169 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3170 : ICmpInst::ICMP_ULT; 3171 3172 // If tail is to be folded, vector loop takes care of all iterations. 3173 Value *CheckMinIters = Builder.getFalse(); 3174 if (!Cost->foldTailByMasking()) { 3175 Value *Step = 3176 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3177 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3178 } 3179 // Create new preheader for vector loop. 3180 LoopVectorPreHeader = 3181 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3182 "vector.ph"); 3183 3184 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3185 DT->getNode(Bypass)->getIDom()) && 3186 "TC check is expected to dominate Bypass"); 3187 3188 // Update dominator for Bypass & LoopExit. 3189 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3190 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3191 3192 ReplaceInstWithInst( 3193 TCCheckBlock->getTerminator(), 3194 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3195 LoopBypassBlocks.push_back(TCCheckBlock); 3196 } 3197 3198 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3199 3200 BasicBlock *const SCEVCheckBlock = 3201 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3202 if (!SCEVCheckBlock) 3203 return nullptr; 3204 3205 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3206 (OptForSizeBasedOnProfile && 3207 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3208 "Cannot SCEV check stride or overflow when optimizing for size"); 3209 3210 3211 // Update dominator only if this is first RT check. 3212 if (LoopBypassBlocks.empty()) { 3213 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3214 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3215 } 3216 3217 LoopBypassBlocks.push_back(SCEVCheckBlock); 3218 AddedSafetyChecks = true; 3219 return SCEVCheckBlock; 3220 } 3221 3222 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3223 BasicBlock *Bypass) { 3224 // VPlan-native path does not do any analysis for runtime checks currently. 
3225 if (EnableVPlanNativePath) 3226 return nullptr; 3227 3228 BasicBlock *const MemCheckBlock = 3229 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3230 3231 // Check if we generated code that checks in runtime if arrays overlap. We put 3232 // the checks into a separate block to make the more common case of few 3233 // elements faster. 3234 if (!MemCheckBlock) 3235 return nullptr; 3236 3237 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3238 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3239 "Cannot emit memory checks when optimizing for size, unless forced " 3240 "to vectorize."); 3241 ORE->emit([&]() { 3242 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3243 L->getStartLoc(), L->getHeader()) 3244 << "Code-size may be reduced by not forcing " 3245 "vectorization, or by source-code modifications " 3246 "eliminating the need for runtime checks " 3247 "(e.g., adding 'restrict')."; 3248 }); 3249 } 3250 3251 LoopBypassBlocks.push_back(MemCheckBlock); 3252 3253 AddedSafetyChecks = true; 3254 3255 // We currently don't use LoopVersioning for the actual loop cloning but we 3256 // still use it to add the noalias metadata. 3257 LVer = std::make_unique<LoopVersioning>( 3258 *Legal->getLAI(), 3259 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3260 DT, PSE.getSE()); 3261 LVer->prepareNoAliasMetadata(); 3262 return MemCheckBlock; 3263 } 3264 3265 Value *InnerLoopVectorizer::emitTransformedIndex( 3266 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3267 const InductionDescriptor &ID) const { 3268 3269 SCEVExpander Exp(*SE, DL, "induction"); 3270 auto Step = ID.getStep(); 3271 auto StartValue = ID.getStartValue(); 3272 assert(Index->getType() == Step->getType() && 3273 "Index type does not match StepValue type"); 3274 3275 // Note: the IR at this point is broken. We cannot use SE to create any new 3276 // SCEV and then expand it, hoping that SCEV's simplification will give us 3277 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3278 // lead to various SCEV crashes. So all we can do is to use builder and rely 3279 // on InstCombine for future simplifications. Here we handle some trivial 3280 // cases only. 3281 auto CreateAdd = [&B](Value *X, Value *Y) { 3282 assert(X->getType() == Y->getType() && "Types don't match!"); 3283 if (auto *CX = dyn_cast<ConstantInt>(X)) 3284 if (CX->isZero()) 3285 return Y; 3286 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3287 if (CY->isZero()) 3288 return X; 3289 return B.CreateAdd(X, Y); 3290 }; 3291 3292 auto CreateMul = [&B](Value *X, Value *Y) { 3293 assert(X->getType() == Y->getType() && "Types don't match!"); 3294 if (auto *CX = dyn_cast<ConstantInt>(X)) 3295 if (CX->isOne()) 3296 return Y; 3297 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3298 if (CY->isOne()) 3299 return X; 3300 return B.CreateMul(X, Y); 3301 }; 3302 3303 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3304 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3305 // the DomTree is not kept up-to-date for additional blocks generated in the 3306 // vector loop. By using the header as insertion point, we guarantee that the 3307 // expanded instructions dominate all their uses. 
3308 auto GetInsertPoint = [this, &B]() { 3309 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3310 if (InsertBB != LoopVectorBody && 3311 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3312 return LoopVectorBody->getTerminator(); 3313 return &*B.GetInsertPoint(); 3314 }; 3315 3316 switch (ID.getKind()) { 3317 case InductionDescriptor::IK_IntInduction: { 3318 assert(Index->getType() == StartValue->getType() && 3319 "Index type does not match StartValue type"); 3320 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3321 return B.CreateSub(StartValue, Index); 3322 auto *Offset = CreateMul( 3323 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3324 return CreateAdd(StartValue, Offset); 3325 } 3326 case InductionDescriptor::IK_PtrInduction: { 3327 assert(isa<SCEVConstant>(Step) && 3328 "Expected constant step for pointer induction"); 3329 return B.CreateGEP( 3330 StartValue->getType()->getPointerElementType(), StartValue, 3331 CreateMul(Index, 3332 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3333 } 3334 case InductionDescriptor::IK_FpInduction: { 3335 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3336 auto InductionBinOp = ID.getInductionBinOp(); 3337 assert(InductionBinOp && 3338 (InductionBinOp->getOpcode() == Instruction::FAdd || 3339 InductionBinOp->getOpcode() == Instruction::FSub) && 3340 "Original bin op should be defined for FP induction"); 3341 3342 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3343 Value *MulExp = B.CreateFMul(StepValue, Index); 3344 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3345 "induction"); 3346 } 3347 case InductionDescriptor::IK_NoInduction: 3348 return nullptr; 3349 } 3350 llvm_unreachable("invalid enum"); 3351 } 3352 3353 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3354 LoopScalarBody = OrigLoop->getHeader(); 3355 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3356 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3357 assert(LoopExitBlock && "Must have an exit block"); 3358 assert(LoopVectorPreHeader && "Invalid loop structure"); 3359 3360 LoopMiddleBlock = 3361 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3362 LI, nullptr, Twine(Prefix) + "middle.block"); 3363 LoopScalarPreHeader = 3364 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3365 nullptr, Twine(Prefix) + "scalar.ph"); 3366 3367 // Set up branch from middle block to the exit and scalar preheader blocks. 3368 // completeLoopSkeleton will update the condition to use an iteration check, 3369 // if required to decide whether to execute the remainder. 3370 BranchInst *BrInst = 3371 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3372 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3373 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3374 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3375 3376 // We intentionally don't let SplitBlock to update LoopInfo since 3377 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3378 // LoopVectorBody is explicitly added to the correct place few lines later. 3379 LoopVectorBody = 3380 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3381 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3382 3383 // Update dominator for loop exit. 
3384 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3385 3386 // Create and register the new vector loop. 3387 Loop *Lp = LI->AllocateLoop(); 3388 Loop *ParentLoop = OrigLoop->getParentLoop(); 3389 3390 // Insert the new loop into the loop nest and register the new basic blocks 3391 // before calling any utilities such as SCEV that require valid LoopInfo. 3392 if (ParentLoop) { 3393 ParentLoop->addChildLoop(Lp); 3394 } else { 3395 LI->addTopLevelLoop(Lp); 3396 } 3397 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3398 return Lp; 3399 } 3400 3401 void InnerLoopVectorizer::createInductionResumeValues( 3402 Loop *L, Value *VectorTripCount, 3403 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3404 assert(VectorTripCount && L && "Expected valid arguments"); 3405 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3406 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3407 "Inconsistent information about additional bypass."); 3408 // We are going to resume the execution of the scalar loop. 3409 // Go over all of the induction variables that we found and fix the 3410 // PHIs that are left in the scalar version of the loop. 3411 // The starting values of PHI nodes depend on the counter of the last 3412 // iteration in the vectorized loop. 3413 // If we come from a bypass edge then we need to start from the original 3414 // start value. 3415 for (auto &InductionEntry : Legal->getInductionVars()) { 3416 PHINode *OrigPhi = InductionEntry.first; 3417 InductionDescriptor II = InductionEntry.second; 3418 3419 // Create phi nodes to merge from the backedge-taken check block. 3420 PHINode *BCResumeVal = 3421 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3422 LoopScalarPreHeader->getTerminator()); 3423 // Copy original phi DL over to the new one. 3424 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3425 Value *&EndValue = IVEndValues[OrigPhi]; 3426 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3427 if (OrigPhi == OldInduction) { 3428 // We know what the end value is. 3429 EndValue = VectorTripCount; 3430 } else { 3431 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3432 3433 // Fast-math-flags propagate from the original induction instruction. 3434 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3435 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3436 3437 Type *StepType = II.getStep()->getType(); 3438 Instruction::CastOps CastOp = 3439 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3440 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3441 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3442 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3443 EndValue->setName("ind.end"); 3444 3445 // Compute the end value for the additional bypass (if applicable). 3446 if (AdditionalBypass.first) { 3447 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3448 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3449 StepType, true); 3450 CRD = 3451 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3452 EndValueFromAdditionalBypass = 3453 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3454 EndValueFromAdditionalBypass->setName("ind.end"); 3455 } 3456 } 3457 // The new PHI merges the original incoming value, in case of a bypass, 3458 // or the value at the end of the vectorized loop. 
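// In shorthand IR the resume phi looks roughly like this (illustrative;
// value and block names are hypothetical):
//   %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
//                            [ %start,   %min.iters.check ], ...
// with one incoming value per loop-bypass block added in the code below.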
3459 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3460 3461 // Fix the scalar body counter (PHI node). 3462 // The old induction's phi node in the scalar body needs the truncated 3463 // value. 3464 for (BasicBlock *BB : LoopBypassBlocks) 3465 BCResumeVal->addIncoming(II.getStartValue(), BB); 3466 3467 if (AdditionalBypass.first) 3468 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3469 EndValueFromAdditionalBypass); 3470 3471 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3472 } 3473 } 3474 3475 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3476 MDNode *OrigLoopID) { 3477 assert(L && "Expected valid loop."); 3478 3479 // The trip counts should be cached by now. 3480 Value *Count = getOrCreateTripCount(L); 3481 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3482 3483 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3484 3485 // Add a check in the middle block to see if we have completed 3486 // all of the iterations in the first vector loop. 3487 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3488 // If tail is to be folded, we know we don't need to run the remainder. 3489 if (!Cost->foldTailByMasking()) { 3490 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3491 Count, VectorTripCount, "cmp.n", 3492 LoopMiddleBlock->getTerminator()); 3493 3494 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3495 // of the corresponding compare because they may have ended up with 3496 // different line numbers and we want to avoid awkward line stepping while 3497 // debugging. Eg. if the compare has got a line number inside the loop. 3498 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3499 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3500 } 3501 3502 // Get ready to start creating new instructions into the vectorized body. 3503 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3504 "Inconsistent vector loop preheader"); 3505 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3506 3507 Optional<MDNode *> VectorizedLoopID = 3508 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3509 LLVMLoopVectorizeFollowupVectorized}); 3510 if (VectorizedLoopID.hasValue()) { 3511 L->setLoopID(VectorizedLoopID.getValue()); 3512 3513 // Do not setAlreadyVectorized if loop attributes have been defined 3514 // explicitly. 3515 return LoopVectorPreHeader; 3516 } 3517 3518 // Keep all loop hints from the original loop on the vector loop (we'll 3519 // replace the vectorizer-specific hints below). 3520 if (MDNode *LID = OrigLoop->getLoopID()) 3521 L->setLoopID(LID); 3522 3523 LoopVectorizeHints Hints(L, true, *ORE); 3524 Hints.setAlreadyVectorized(); 3525 3526 #ifdef EXPENSIVE_CHECKS 3527 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3528 LI->verify(*DT); 3529 #endif 3530 3531 return LoopVectorPreHeader; 3532 } 3533 3534 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3535 /* 3536 In this function we generate a new loop. The new loop will contain 3537 the vectorized instructions while the old loop will continue to run the 3538 scalar remainder. 3539 3540 [ ] <-- loop iteration number check. 3541 / | 3542 / v 3543 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3544 | / | 3545 | / v 3546 || [ ] <-- vector pre header. 3547 |/ | 3548 | v 3549 | [ ] \ 3550 | [ ]_| <-- vector loop. 3551 | | 3552 | v 3553 | -[ ] <--- middle-block. 
3554 | / | 3555 | / v 3556 -|- >[ ] <--- new preheader. 3557 | | 3558 | v 3559 | [ ] \ 3560 | [ ]_| <-- old scalar loop to handle remainder. 3561 \ | 3562 \ v 3563 >[ ] <-- exit block. 3564 ... 3565 */ 3566 3567 // Get the metadata of the original loop before it gets modified. 3568 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3569 3570 // Create an empty vector loop, and prepare basic blocks for the runtime 3571 // checks. 3572 Loop *Lp = createVectorLoopSkeleton(""); 3573 3574 // Now, compare the new count to zero. If it is zero skip the vector loop and 3575 // jump to the scalar loop. This check also covers the case where the 3576 // backedge-taken count is uint##_max: adding one to it will overflow leading 3577 // to an incorrect trip count of zero. In this (rare) case we will also jump 3578 // to the scalar loop. 3579 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3580 3581 // Generate the code to check any assumptions that we've made for SCEV 3582 // expressions. 3583 emitSCEVChecks(Lp, LoopScalarPreHeader); 3584 3585 // Generate the code that checks in runtime if arrays overlap. We put the 3586 // checks into a separate block to make the more common case of few elements 3587 // faster. 3588 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3589 3590 // Some loops have a single integer induction variable, while other loops 3591 // don't. One example is c++ iterators that often have multiple pointer 3592 // induction variables. In the code below we also support a case where we 3593 // don't have a single induction variable. 3594 // 3595 // We try to obtain an induction variable from the original loop as hard 3596 // as possible. However if we don't find one that: 3597 // - is an integer 3598 // - counts from zero, stepping by one 3599 // - is the size of the widest induction variable type 3600 // then we create a new one. 3601 OldInduction = Legal->getPrimaryInduction(); 3602 Type *IdxTy = Legal->getWidestInductionType(); 3603 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3604 // The loop step is equal to the vectorization factor (num of SIMD elements) 3605 // times the unroll factor (num of SIMD instructions). 3606 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3607 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3608 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3609 Induction = 3610 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3611 getDebugLocFromInstOrOperands(OldInduction)); 3612 3613 // Emit phis for the new starting index of the scalar loop. 3614 createInductionResumeValues(Lp, CountRoundDown); 3615 3616 return completeLoopSkeleton(Lp, OrigLoopID); 3617 } 3618 3619 // Fix up external users of the induction variable. At this point, we are 3620 // in LCSSA form, with all external PHIs that use the IV having one input value, 3621 // coming from the remainder loop. We need those PHIs to also have a correct 3622 // value for the IV when arriving directly from the middle block. 3623 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3624 const InductionDescriptor &II, 3625 Value *CountRoundDown, Value *EndValue, 3626 BasicBlock *MiddleBlock) { 3627 // There are two kinds of external IV usages - those that use the value 3628 // computed in the last iteration (the PHI) and those that use the penultimate 3629 // value (the value that feeds into the phi from the loop latch). 3630 // We allow both, but they, obviously, have different values. 
3631 3632 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3633 3634 DenseMap<Value *, Value *> MissingVals; 3635 3636 // An external user of the last iteration's value should see the value that 3637 // the remainder loop uses to initialize its own IV. 3638 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3639 for (User *U : PostInc->users()) { 3640 Instruction *UI = cast<Instruction>(U); 3641 if (!OrigLoop->contains(UI)) { 3642 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3643 MissingVals[UI] = EndValue; 3644 } 3645 } 3646 3647 // An external user of the penultimate value need to see EndValue - Step. 3648 // The simplest way to get this is to recompute it from the constituent SCEVs, 3649 // that is Start + (Step * (CRD - 1)). 3650 for (User *U : OrigPhi->users()) { 3651 auto *UI = cast<Instruction>(U); 3652 if (!OrigLoop->contains(UI)) { 3653 const DataLayout &DL = 3654 OrigLoop->getHeader()->getModule()->getDataLayout(); 3655 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3656 3657 IRBuilder<> B(MiddleBlock->getTerminator()); 3658 3659 // Fast-math-flags propagate from the original induction instruction. 3660 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3661 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3662 3663 Value *CountMinusOne = B.CreateSub( 3664 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3665 Value *CMO = 3666 !II.getStep()->getType()->isIntegerTy() 3667 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3668 II.getStep()->getType()) 3669 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3670 CMO->setName("cast.cmo"); 3671 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3672 Escape->setName("ind.escape"); 3673 MissingVals[UI] = Escape; 3674 } 3675 } 3676 3677 for (auto &I : MissingVals) { 3678 PHINode *PHI = cast<PHINode>(I.first); 3679 // One corner case we have to handle is two IVs "chasing" each-other, 3680 // that is %IV2 = phi [...], [ %IV1, %latch ] 3681 // In this case, if IV1 has an external use, we need to avoid adding both 3682 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3683 // don't already have an incoming value for the middle block. 3684 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3685 PHI->addIncoming(I.second, MiddleBlock); 3686 } 3687 } 3688 3689 namespace { 3690 3691 struct CSEDenseMapInfo { 3692 static bool canHandle(const Instruction *I) { 3693 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3694 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3695 } 3696 3697 static inline Instruction *getEmptyKey() { 3698 return DenseMapInfo<Instruction *>::getEmptyKey(); 3699 } 3700 3701 static inline Instruction *getTombstoneKey() { 3702 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3703 } 3704 3705 static unsigned getHashValue(const Instruction *I) { 3706 assert(canHandle(I) && "Unknown instruction!"); 3707 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3708 I->value_op_end())); 3709 } 3710 3711 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3712 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3713 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3714 return LHS == RHS; 3715 return LHS->isIdenticalTo(RHS); 3716 } 3717 }; 3718 3719 } // end anonymous namespace 3720 3721 ///Perform cse of induction variable instructions. 
3722 static void cse(BasicBlock *BB) { 3723 // Perform simple cse. 3724 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3725 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3726 Instruction *In = &*I++; 3727 3728 if (!CSEDenseMapInfo::canHandle(In)) 3729 continue; 3730 3731 // Check if we can replace this instruction with any of the 3732 // visited instructions. 3733 if (Instruction *V = CSEMap.lookup(In)) { 3734 In->replaceAllUsesWith(V); 3735 In->eraseFromParent(); 3736 continue; 3737 } 3738 3739 CSEMap[In] = In; 3740 } 3741 } 3742 3743 InstructionCost 3744 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3745 bool &NeedToScalarize) { 3746 Function *F = CI->getCalledFunction(); 3747 Type *ScalarRetTy = CI->getType(); 3748 SmallVector<Type *, 4> Tys, ScalarTys; 3749 for (auto &ArgOp : CI->arg_operands()) 3750 ScalarTys.push_back(ArgOp->getType()); 3751 3752 // Estimate cost of scalarized vector call. The source operands are assumed 3753 // to be vectors, so we need to extract individual elements from there, 3754 // execute VF scalar calls, and then gather the result into the vector return 3755 // value. 3756 InstructionCost ScalarCallCost = 3757 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3758 if (VF.isScalar()) 3759 return ScalarCallCost; 3760 3761 // Compute corresponding vector type for return value and arguments. 3762 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3763 for (Type *ScalarTy : ScalarTys) 3764 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3765 3766 // Compute costs of unpacking argument values for the scalar calls and 3767 // packing the return values to a vector. 3768 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3769 3770 InstructionCost Cost = 3771 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3772 3773 // If we can't emit a vector call for this function, then the currently found 3774 // cost is the cost we need to return. 3775 NeedToScalarize = true; 3776 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3777 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3778 3779 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3780 return Cost; 3781 3782 // If the corresponding vector cost is cheaper, return its cost. 
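// A worked example with hypothetical costs (illustrative only): if the
// scalar call costs 10, VF is 4 and the scalarization overhead is 6, the
// scalarized estimate is 10 * 4 + 6 = 46; if a vector variant exists and
// costs 20 (< 46), NeedToScalarize is cleared and 20 is returned instead.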
3783 InstructionCost VectorCallCost = 3784 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3785 if (VectorCallCost < Cost) { 3786 NeedToScalarize = false; 3787 Cost = VectorCallCost; 3788 } 3789 return Cost; 3790 } 3791 3792 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3793 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3794 return Elt; 3795 return VectorType::get(Elt, VF); 3796 } 3797 3798 InstructionCost 3799 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3800 ElementCount VF) { 3801 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3802 assert(ID && "Expected intrinsic call!"); 3803 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3804 FastMathFlags FMF; 3805 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3806 FMF = FPMO->getFastMathFlags(); 3807 3808 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3809 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3810 SmallVector<Type *> ParamTys; 3811 std::transform(FTy->param_begin(), FTy->param_end(), 3812 std::back_inserter(ParamTys), 3813 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3814 3815 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3816 dyn_cast<IntrinsicInst>(CI)); 3817 return TTI.getIntrinsicInstrCost(CostAttrs, 3818 TargetTransformInfo::TCK_RecipThroughput); 3819 } 3820 3821 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3822 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3823 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3824 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3825 } 3826 3827 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3828 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3829 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3830 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3831 } 3832 3833 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3834 // For every instruction `I` in MinBWs, truncate the operands, create a 3835 // truncated version of `I` and reextend its result. InstCombine runs 3836 // later and will remove any ext/trunc pairs. 3837 SmallPtrSet<Value *, 4> Erased; 3838 for (const auto &KV : Cost->getMinimalBitwidths()) { 3839 // If the value wasn't vectorized, we must maintain the original scalar 3840 // type. The absence of the value from State indicates that it 3841 // wasn't vectorized. 3842 VPValue *Def = State.Plan->getVPValue(KV.first); 3843 if (!State.hasAnyVectorValue(Def)) 3844 continue; 3845 for (unsigned Part = 0; Part < UF; ++Part) { 3846 Value *I = State.get(Def, Part); 3847 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3848 continue; 3849 Type *OriginalTy = I->getType(); 3850 Type *ScalarTruncatedTy = 3851 IntegerType::get(OriginalTy->getContext(), KV.second); 3852 auto *TruncatedTy = FixedVectorType::get( 3853 ScalarTruncatedTy, 3854 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3855 if (TruncatedTy == OriginalTy) 3856 continue; 3857 3858 IRBuilder<> B(cast<Instruction>(I)); 3859 auto ShrinkOperand = [&](Value *V) -> Value * { 3860 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3861 if (ZI->getSrcTy() == TruncatedTy) 3862 return ZI->getOperand(0); 3863 return B.CreateZExtOrTrunc(V, TruncatedTy); 3864 }; 3865 3866 // The actual instruction modification depends on the instruction type, 3867 // unfortunately. 
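// For example (an illustrative sketch, assuming a minimal bitwidth of 8 and
// VF = 4): an add over <4 x i32> whose result is known to need only 8 bits
// becomes
//   %a8  = trunc <4 x i32> %a to <4 x i8>
//   %b8  = trunc <4 x i32> %b to <4 x i8>
//   %s8  = add <4 x i8> %a8, %b8
//   %res = zext <4 x i8> %s8 to <4 x i32>
// with InstCombine expected to remove redundant ext/trunc pairs later.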
3868 Value *NewI = nullptr; 3869 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3870 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3871 ShrinkOperand(BO->getOperand(1))); 3872 3873 // Any wrapping introduced by shrinking this operation shouldn't be 3874 // considered undefined behavior. So, we can't unconditionally copy 3875 // arithmetic wrapping flags to NewI. 3876 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3877 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3878 NewI = 3879 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3880 ShrinkOperand(CI->getOperand(1))); 3881 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3882 NewI = B.CreateSelect(SI->getCondition(), 3883 ShrinkOperand(SI->getTrueValue()), 3884 ShrinkOperand(SI->getFalseValue())); 3885 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3886 switch (CI->getOpcode()) { 3887 default: 3888 llvm_unreachable("Unhandled cast!"); 3889 case Instruction::Trunc: 3890 NewI = ShrinkOperand(CI->getOperand(0)); 3891 break; 3892 case Instruction::SExt: 3893 NewI = B.CreateSExtOrTrunc( 3894 CI->getOperand(0), 3895 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3896 break; 3897 case Instruction::ZExt: 3898 NewI = B.CreateZExtOrTrunc( 3899 CI->getOperand(0), 3900 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3901 break; 3902 } 3903 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3904 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3905 ->getNumElements(); 3906 auto *O0 = B.CreateZExtOrTrunc( 3907 SI->getOperand(0), 3908 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3909 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3910 ->getNumElements(); 3911 auto *O1 = B.CreateZExtOrTrunc( 3912 SI->getOperand(1), 3913 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3914 3915 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3916 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3917 // Don't do anything with the operands, just extend the result. 3918 continue; 3919 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3920 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3921 ->getNumElements(); 3922 auto *O0 = B.CreateZExtOrTrunc( 3923 IE->getOperand(0), 3924 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3925 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3926 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3927 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3928 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3929 ->getNumElements(); 3930 auto *O0 = B.CreateZExtOrTrunc( 3931 EE->getOperand(0), 3932 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3933 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3934 } else { 3935 // If we don't know what to do, be conservative and don't do anything. 3936 continue; 3937 } 3938 3939 // Lastly, extend the result. 3940 NewI->takeName(cast<Instruction>(I)); 3941 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3942 I->replaceAllUsesWith(Res); 3943 cast<Instruction>(I)->eraseFromParent(); 3944 Erased.insert(I); 3945 State.reset(Def, Res, Part); 3946 } 3947 } 3948 3949 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3950 for (const auto &KV : Cost->getMinimalBitwidths()) { 3951 // If the value wasn't vectorized, we must maintain the original scalar 3952 // type. The absence of the value from State indicates that it 3953 // wasn't vectorized. 
3954 VPValue *Def = State.Plan->getVPValue(KV.first); 3955 if (!State.hasAnyVectorValue(Def)) 3956 continue; 3957 for (unsigned Part = 0; Part < UF; ++Part) { 3958 Value *I = State.get(Def, Part); 3959 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3960 if (Inst && Inst->use_empty()) { 3961 Value *NewI = Inst->getOperand(0); 3962 Inst->eraseFromParent(); 3963 State.reset(Def, NewI, Part); 3964 } 3965 } 3966 } 3967 } 3968 3969 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3970 // Insert truncates and extends for any truncated instructions as hints to 3971 // InstCombine. 3972 if (VF.isVector()) 3973 truncateToMinimalBitwidths(State); 3974 3975 // Fix widened non-induction PHIs by setting up the PHI operands. 3976 if (OrigPHIsToFix.size()) { 3977 assert(EnableVPlanNativePath && 3978 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3979 fixNonInductionPHIs(State); 3980 } 3981 3982 // At this point every instruction in the original loop is widened to a 3983 // vector form. Now we need to fix the recurrences in the loop. These PHI 3984 // nodes are currently empty because we did not want to introduce cycles. 3985 // This is the second stage of vectorizing recurrences. 3986 fixCrossIterationPHIs(State); 3987 3988 // Forget the original basic block. 3989 PSE.getSE()->forgetLoop(OrigLoop); 3990 3991 // Fix-up external users of the induction variables. 3992 for (auto &Entry : Legal->getInductionVars()) 3993 fixupIVUsers(Entry.first, Entry.second, 3994 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3995 IVEndValues[Entry.first], LoopMiddleBlock); 3996 3997 fixLCSSAPHIs(State); 3998 for (Instruction *PI : PredicatedInstructions) 3999 sinkScalarOperands(&*PI); 4000 4001 // Remove redundant induction instructions. 4002 cse(LoopVectorBody); 4003 4004 // Set/update profile weights for the vector and remainder loops as original 4005 // loop iterations are now distributed among them. Note that original loop 4006 // represented by LoopScalarBody becomes remainder loop after vectorization. 4007 // 4008 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 4009 // end up getting slightly roughened result but that should be OK since 4010 // profile is not inherently precise anyway. Note also possible bypass of 4011 // vector code caused by legality checks is ignored, assigning all the weight 4012 // to the vector loop, optimistically. 4013 // 4014 // For scalable vectorization we can't know at compile time how many iterations 4015 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4016 // vscale of '1'. 4017 setProfileInfoAfterUnrolling( 4018 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4019 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4020 } 4021 4022 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4023 // In order to support recurrences we need to be able to vectorize Phi nodes. 4024 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4025 // stage #2: We now need to fix the recurrences by adding incoming edges to 4026 // the currently empty PHI nodes. At this point every instruction in the 4027 // original loop is widened to a vector form so we can use them to construct 4028 // the incoming edges. 4029 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 4030 // Handle first-order recurrences and reductions that need to be fixed. 
4031 if (Legal->isFirstOrderRecurrence(&Phi)) 4032 fixFirstOrderRecurrence(&Phi, State); 4033 else if (Legal->isReductionVariable(&Phi)) 4034 fixReduction(&Phi, State); 4035 } 4036 } 4037 4038 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4039 VPTransformState &State) { 4040 // This is the second phase of vectorizing first-order recurrences. An 4041 // overview of the transformation is described below. Suppose we have the 4042 // following loop. 4043 // 4044 // for (int i = 0; i < n; ++i) 4045 // b[i] = a[i] - a[i - 1]; 4046 // 4047 // There is a first-order recurrence on "a". For this loop, the shorthand 4048 // scalar IR looks like: 4049 // 4050 // scalar.ph: 4051 // s_init = a[-1] 4052 // br scalar.body 4053 // 4054 // scalar.body: 4055 // i = phi [0, scalar.ph], [i+1, scalar.body] 4056 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4057 // s2 = a[i] 4058 // b[i] = s2 - s1 4059 // br cond, scalar.body, ... 4060 // 4061 // In this example, s1 is a recurrence because it's value depends on the 4062 // previous iteration. In the first phase of vectorization, we created a 4063 // temporary value for s1. We now complete the vectorization and produce the 4064 // shorthand vector IR shown below (for VF = 4, UF = 1). 4065 // 4066 // vector.ph: 4067 // v_init = vector(..., ..., ..., a[-1]) 4068 // br vector.body 4069 // 4070 // vector.body 4071 // i = phi [0, vector.ph], [i+4, vector.body] 4072 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4073 // v2 = a[i, i+1, i+2, i+3]; 4074 // v3 = vector(v1(3), v2(0, 1, 2)) 4075 // b[i, i+1, i+2, i+3] = v2 - v3 4076 // br cond, vector.body, middle.block 4077 // 4078 // middle.block: 4079 // x = v2(3) 4080 // br scalar.ph 4081 // 4082 // scalar.ph: 4083 // s_init = phi [x, middle.block], [a[-1], otherwise] 4084 // br scalar.body 4085 // 4086 // After execution completes the vector loop, we extract the next value of 4087 // the recurrence (x) to use as the initial value in the scalar loop. 4088 4089 // Get the original loop preheader and single loop latch. 4090 auto *Preheader = OrigLoop->getLoopPreheader(); 4091 auto *Latch = OrigLoop->getLoopLatch(); 4092 4093 // Get the initial and previous values of the scalar recurrence. 4094 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4095 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4096 4097 // Create a vector from the initial value. 4098 auto *VectorInit = ScalarInit; 4099 if (VF.isVector()) { 4100 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4101 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4102 VectorInit = Builder.CreateInsertElement( 4103 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4104 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4105 } 4106 4107 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4108 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4109 // We constructed a temporary phi node in the first phase of vectorization. 4110 // This phi node will eventually be deleted. 4111 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4112 4113 // Create a phi node for the new recurrence. The current value will either be 4114 // the initial value inserted into a vector or loop-varying vector value. 4115 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4116 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4117 4118 // Get the vectorized previous value of the last part UF - 1. 
It appears last 4119 // among all unrolled iterations, due to the order of their construction. 4120 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4121 4122 // Find and set the insertion point after the previous value if it is an 4123 // instruction. 4124 BasicBlock::iterator InsertPt; 4125 // Note that the previous value may have been constant-folded so it is not 4126 // guaranteed to be an instruction in the vector loop. 4127 // FIXME: Loop invariant values do not form recurrences. We should deal with 4128 // them earlier. 4129 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4130 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4131 else { 4132 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4133 if (isa<PHINode>(PreviousLastPart)) 4134 // If the previous value is a phi node, we should insert after all the phi 4135 // nodes in the block containing the PHI to avoid breaking basic block 4136 // verification. Note that the basic block may be different to 4137 // LoopVectorBody, in case we predicate the loop. 4138 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4139 else 4140 InsertPt = ++PreviousInst->getIterator(); 4141 } 4142 Builder.SetInsertPoint(&*InsertPt); 4143 4144 // We will construct a vector for the recurrence by combining the values for 4145 // the current and previous iterations. This is the required shuffle mask. 4146 assert(!VF.isScalable()); 4147 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4148 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4149 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4150 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4151 4152 // The vector from which to take the initial value for the current iteration 4153 // (actual or unrolled). Initially, this is the vector phi node. 4154 Value *Incoming = VecPhi; 4155 4156 // Shuffle the current and previous vector and update the vector parts. 4157 for (unsigned Part = 0; Part < UF; ++Part) { 4158 Value *PreviousPart = State.get(PreviousDef, Part); 4159 Value *PhiPart = State.get(PhiDef, Part); 4160 auto *Shuffle = 4161 VF.isVector() 4162 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4163 : Incoming; 4164 PhiPart->replaceAllUsesWith(Shuffle); 4165 cast<Instruction>(PhiPart)->eraseFromParent(); 4166 State.reset(PhiDef, Shuffle, Part); 4167 Incoming = PreviousPart; 4168 } 4169 4170 // Fix the latch value of the new recurrence in the vector loop. 4171 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4172 4173 // Extract the last vector element in the middle block. This will be the 4174 // initial value for the recurrence when jumping to the scalar loop. 4175 auto *ExtractForScalar = Incoming; 4176 if (VF.isVector()) { 4177 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4178 ExtractForScalar = Builder.CreateExtractElement( 4179 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4180 "vector.recur.extract"); 4181 } 4182 // Extract the second last element in the middle block if the 4183 // Phi is used outside the loop. We need to extract the phi itself 4184 // and not the last element (the phi update in the current iteration). This 4185 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4186 // when the scalar loop is not run at all. 
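// For example, with VF = 4 and UF = 1, if the last part of the previous
// value is v2 = a[i, i+1, i+2, i+3] (using the shorthand of the example
// above), the middle block extracts:
//   vector.recur.extract         = v2(3)  ; resume value for the scalar loop
//   vector.recur.extract.for.phi = v2(2)  ; value of the phi itself, for
//                                         ; users outside the loop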
4187 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4188 if (VF.isVector()) 4189 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4190 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4191 "vector.recur.extract.for.phi"); 4192 // When loop is unrolled without vectorizing, initialize 4193 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 4194 // `Incoming`. This is analogous to the vectorized case above: extracting the 4195 // second last element when VF > 1. 4196 else if (UF > 1) 4197 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4198 4199 // Fix the initial value of the original recurrence in the scalar loop. 4200 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4201 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4202 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4203 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4204 Start->addIncoming(Incoming, BB); 4205 } 4206 4207 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4208 Phi->setName("scalar.recur"); 4209 4210 // Finally, fix users of the recurrence outside the loop. The users will need 4211 // either the last value of the scalar recurrence or the last value of the 4212 // vector recurrence we extracted in the middle block. Since the loop is in 4213 // LCSSA form, we just need to find all the phi nodes for the original scalar 4214 // recurrence in the exit block, and then add an edge for the middle block. 4215 // Note that LCSSA does not imply single entry when the original scalar loop 4216 // had multiple exiting edges (as we always run the last iteration in the 4217 // scalar epilogue); in that case, the exiting path through middle will be 4218 // dynamically dead and the value picked for the phi doesn't matter. 4219 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4220 if (any_of(LCSSAPhi.incoming_values(), 4221 [Phi](Value *V) { return V == Phi; })) 4222 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4223 } 4224 4225 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) { 4226 // Get it's reduction variable descriptor. 4227 assert(Legal->isReductionVariable(Phi) && 4228 "Unable to find the reduction variable"); 4229 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4230 4231 RecurKind RK = RdxDesc.getRecurrenceKind(); 4232 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4233 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4234 setDebugLocFromInst(Builder, ReductionStartValue); 4235 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4236 4237 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); 4238 // This is the vector-clone of the value that leaves the loop. 4239 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4240 4241 // Wrap flags are in general invalid after vectorization, clear them. 4242 clearReductionWrapFlags(RdxDesc, State); 4243 4244 // Fix the vector-loop phi. 4245 4246 // Reductions do not have to start at zero. They can start with 4247 // any loop invariant values. 
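// For example (C-style shorthand):
//   int sum = init;              // loop-invariant start value
//   for (int i = 0; i < n; ++i)
//     sum += a[i];
// The start value 'init' must contribute exactly once, so it is merged into
// lane 0 of the identity vector of the first unroll part (see
// widenPHIInstruction); all other lanes and parts start at the identity
// value (0 for an integer add).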
4248 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4249 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4250 4251 for (unsigned Part = 0; Part < UF; ++Part) { 4252 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part); 4253 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part); 4254 cast<PHINode>(VecRdxPhi) 4255 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4256 } 4257 4258 // Before each round, move the insertion point right between 4259 // the PHIs and the values we are going to write. 4260 // This allows us to write both PHINodes and the extractelement 4261 // instructions. 4262 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4263 4264 setDebugLocFromInst(Builder, LoopExitInst); 4265 4266 // If tail is folded by masking, the vector value to leave the loop should be 4267 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4268 // instead of the former. For an inloop reduction the reduction will already 4269 // be predicated, and does not need to be handled here. 4270 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4271 for (unsigned Part = 0; Part < UF; ++Part) { 4272 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4273 Value *Sel = nullptr; 4274 for (User *U : VecLoopExitInst->users()) { 4275 if (isa<SelectInst>(U)) { 4276 assert(!Sel && "Reduction exit feeding two selects"); 4277 Sel = U; 4278 } else 4279 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4280 } 4281 assert(Sel && "Reduction exit feeds no select"); 4282 State.reset(LoopExitInstDef, Sel, Part); 4283 4284 // If the target can create a predicated operator for the reduction at no 4285 // extra cost in the loop (for example a predicated vadd), it can be 4286 // cheaper for the select to remain in the loop than be sunk out of it, 4287 // and so use the select value for the phi instead of the old 4288 // LoopExitValue. 4289 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4290 if (PreferPredicatedReductionSelect || 4291 TTI->preferPredicatedReductionSelect( 4292 RdxDesc.getOpcode(), Phi->getType(), 4293 TargetTransformInfo::ReductionFlags())) { 4294 auto *VecRdxPhi = 4295 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part)); 4296 VecRdxPhi->setIncomingValueForBlock( 4297 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4298 } 4299 } 4300 } 4301 4302 // If the vector reduction can be performed in a smaller type, we truncate 4303 // then extend the loop exit value to enable InstCombine to evaluate the 4304 // entire expression in the smaller type. 4305 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4306 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4307 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4308 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4309 Builder.SetInsertPoint( 4310 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4311 VectorParts RdxParts(UF); 4312 for (unsigned Part = 0; Part < UF; ++Part) { 4313 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4314 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4315 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 4316 : Builder.CreateZExt(Trunc, VecTy); 4317 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4318 UI != RdxParts[Part]->user_end();) 4319 if (*UI != Trunc) { 4320 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4321 RdxParts[Part] = Extnd; 4322 } else { 4323 ++UI; 4324 } 4325 } 4326 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4327 for (unsigned Part = 0; Part < UF; ++Part) { 4328 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4329 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4330 } 4331 } 4332 4333 // Reduce all of the unrolled parts into a single vector. 4334 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4335 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4336 4337 // The middle block terminator has already been assigned a DebugLoc here (the 4338 // OrigLoop's single latch terminator). We want the whole middle block to 4339 // appear to execute on this line because: (a) it is all compiler generated, 4340 // (b) these instructions are always executed after evaluating the latch 4341 // conditional branch, and (c) other passes may add new predecessors which 4342 // terminate on this line. This is the easiest way to ensure we don't 4343 // accidentally cause an extra step back into the loop while debugging. 4344 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4345 { 4346 // Floating-point operations should have some FMF to enable the reduction. 4347 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4348 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4349 for (unsigned Part = 1; Part < UF; ++Part) { 4350 Value *RdxPart = State.get(LoopExitInstDef, Part); 4351 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4352 ReducedPartRdx = Builder.CreateBinOp( 4353 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4354 } else { 4355 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4356 } 4357 } 4358 } 4359 4360 // Create the reduction after the loop. Note that inloop reductions create the 4361 // target reduction in the loop using a Reduction recipe. 4362 if (VF.isVector() && !IsInLoopReductionPhi) { 4363 ReducedPartRdx = 4364 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4365 // If the reduction can be performed in a smaller type, we need to extend 4366 // the reduction to the wider type before we branch to the original loop. 4367 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4368 ReducedPartRdx = 4369 RdxDesc.isSigned() 4370 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4371 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4372 } 4373 4374 // Create a phi node that merges control-flow from the backedge-taken check 4375 // block and the middle block. 4376 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4377 LoopScalarPreHeader->getTerminator()); 4378 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4379 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4380 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4381 4382 // Now, we need to fix the users of the reduction variable 4383 // inside and outside of the scalar remainder loop. 4384 4385 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4386 // in the exit blocks. See comment on analogous loop in 4387 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
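// For example, if the original loop computed a sum that is also used after
// the loop, the exit block contains an LCSSA phi such as (shorthand IR):
//   %sum.lcssa = phi [ %sum, %scalar.body ]
// and the loop below adds the incoming value [ ReducedPartRdx, %middle.block ]
// so that users outside the loop see the vector-computed result when the exit
// is reached directly from the middle block.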
4388 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4389 if (any_of(LCSSAPhi.incoming_values(), 4390 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4391 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4392 4393 // Fix the scalar loop reduction variable with the incoming reduction sum 4394 // from the vector body and from the backedge value. 4395 int IncomingEdgeBlockIdx = 4396 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4397 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4398 // Pick the other block. 4399 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4400 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4401 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4402 } 4403 4404 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4405 VPTransformState &State) { 4406 RecurKind RK = RdxDesc.getRecurrenceKind(); 4407 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4408 return; 4409 4410 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4411 assert(LoopExitInstr && "null loop exit instruction"); 4412 SmallVector<Instruction *, 8> Worklist; 4413 SmallPtrSet<Instruction *, 8> Visited; 4414 Worklist.push_back(LoopExitInstr); 4415 Visited.insert(LoopExitInstr); 4416 4417 while (!Worklist.empty()) { 4418 Instruction *Cur = Worklist.pop_back_val(); 4419 if (isa<OverflowingBinaryOperator>(Cur)) 4420 for (unsigned Part = 0; Part < UF; ++Part) { 4421 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4422 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4423 } 4424 4425 for (User *U : Cur->users()) { 4426 Instruction *UI = cast<Instruction>(U); 4427 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4428 Visited.insert(UI).second) 4429 Worklist.push_back(UI); 4430 } 4431 } 4432 } 4433 4434 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4435 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4436 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4437 // Some phis were already hand updated by the reduction and recurrence 4438 // code above, leave them alone. 4439 continue; 4440 4441 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4442 // Non-instruction incoming values will have only one value. 4443 4444 VPLane Lane = VPLane::getFirstLane(); 4445 if (isa<Instruction>(IncomingValue) && 4446 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4447 VF)) 4448 Lane = VPLane::getLastLaneForVF(VF); 4449 4450 // Can be a loop invariant incoming value or the last scalar value to be 4451 // extracted from the vectorized loop. 4452 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4453 Value *lastIncomingValue = 4454 OrigLoop->isLoopInvariant(IncomingValue) 4455 ? IncomingValue 4456 : State.get(State.Plan->getVPValue(IncomingValue), 4457 VPIteration(UF - 1, Lane)); 4458 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4459 } 4460 } 4461 4462 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4463 // The basic block and loop containing the predicated instruction. 4464 auto *PredBB = PredInst->getParent(); 4465 auto *VectorLoop = LI->getLoopFor(PredBB); 4466 4467 // Initialize a worklist with the operands of the predicated instruction. 4468 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4469 4470 // Holds instructions that we need to analyze again. An instruction may be 4471 // reanalyzed if we don't yet know if we can sink it or not. 
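// Illustrative example (C-style shorthand): for a predicated, scalarized
// operation such as
//   if (cond[i]) a[i] = b[i] / c[i];
// the division is emitted in its own predicated block. Scalarized operands
// that were generated outside that block but are used only by the predicated
// instruction (for example, the address computation for a[i]) can be sunk
// into the block so that they execute only when the predicate is true.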
4472 SmallVector<Instruction *, 8> InstsToReanalyze; 4473 4474 // Returns true if a given use occurs in the predicated block. Phi nodes use 4475 // their operands in their corresponding predecessor blocks. 4476 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4477 auto *I = cast<Instruction>(U.getUser()); 4478 BasicBlock *BB = I->getParent(); 4479 if (auto *Phi = dyn_cast<PHINode>(I)) 4480 BB = Phi->getIncomingBlock( 4481 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4482 return BB == PredBB; 4483 }; 4484 4485 // Iteratively sink the scalarized operands of the predicated instruction 4486 // into the block we created for it. When an instruction is sunk, it's 4487 // operands are then added to the worklist. The algorithm ends after one pass 4488 // through the worklist doesn't sink a single instruction. 4489 bool Changed; 4490 do { 4491 // Add the instructions that need to be reanalyzed to the worklist, and 4492 // reset the changed indicator. 4493 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4494 InstsToReanalyze.clear(); 4495 Changed = false; 4496 4497 while (!Worklist.empty()) { 4498 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4499 4500 // We can't sink an instruction if it is a phi node, is already in the 4501 // predicated block, is not in the loop, or may have side effects. 4502 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4503 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4504 continue; 4505 4506 // It's legal to sink the instruction if all its uses occur in the 4507 // predicated block. Otherwise, there's nothing to do yet, and we may 4508 // need to reanalyze the instruction. 4509 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4510 InstsToReanalyze.push_back(I); 4511 continue; 4512 } 4513 4514 // Move the instruction to the beginning of the predicated block, and add 4515 // it's operands to the worklist. 4516 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4517 Worklist.insert(I->op_begin(), I->op_end()); 4518 4519 // The sinking may have enabled other instructions to be sunk, so we will 4520 // need to iterate. 4521 Changed = true; 4522 } 4523 } while (Changed); 4524 } 4525 4526 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4527 for (PHINode *OrigPhi : OrigPHIsToFix) { 4528 VPWidenPHIRecipe *VPPhi = 4529 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4530 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4531 // Make sure the builder has a valid insert point. 4532 Builder.SetInsertPoint(NewPhi); 4533 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4534 VPValue *Inc = VPPhi->getIncomingValue(i); 4535 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4536 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4537 } 4538 } 4539 } 4540 4541 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4542 VPUser &Operands, unsigned UF, 4543 ElementCount VF, bool IsPtrLoopInvariant, 4544 SmallBitVector &IsIndexLoopInvariant, 4545 VPTransformState &State) { 4546 // Construct a vector GEP by widening the operands of the scalar GEP as 4547 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4548 // results in a vector of pointers when at least one operand of the GEP 4549 // is vector-typed. Thus, to keep the representation compact, we only use 4550 // vector-typed operands for loop-varying values. 
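// For example (shorthand IR, VF = 4; names are illustrative), a scalar GEP
// with a loop-invariant base pointer and a loop-varying index
//   %p = getelementptr inbounds i32, i32* %base, i64 %i
// is widened into a single GEP that produces a vector of pointers, keeping
// the invariant base scalar and widening only the index:
//   %vp = getelementptr inbounds i32, i32* %base, <4 x i64> %vec.ind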
4551 4552 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4553 // If we are vectorizing, but the GEP has only loop-invariant operands, 4554 // the GEP we build (by only using vector-typed operands for 4555 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4556 // produce a vector of pointers, we need to either arbitrarily pick an 4557 // operand to broadcast, or broadcast a clone of the original GEP. 4558 // Here, we broadcast a clone of the original. 4559 // 4560 // TODO: If at some point we decide to scalarize instructions having 4561 // loop-invariant operands, this special case will no longer be 4562 // required. We would add the scalarization decision to 4563 // collectLoopScalars() and teach getVectorValue() to broadcast 4564 // the lane-zero scalar value. 4565 auto *Clone = Builder.Insert(GEP->clone()); 4566 for (unsigned Part = 0; Part < UF; ++Part) { 4567 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4568 State.set(VPDef, EntryPart, Part); 4569 addMetadata(EntryPart, GEP); 4570 } 4571 } else { 4572 // If the GEP has at least one loop-varying operand, we are sure to 4573 // produce a vector of pointers. But if we are only unrolling, we want 4574 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4575 // produce with the code below will be scalar (if VF == 1) or vector 4576 // (otherwise). Note that for the unroll-only case, we still maintain 4577 // values in the vector mapping with initVector, as we do for other 4578 // instructions. 4579 for (unsigned Part = 0; Part < UF; ++Part) { 4580 // The pointer operand of the new GEP. If it's loop-invariant, we 4581 // won't broadcast it. 4582 auto *Ptr = IsPtrLoopInvariant 4583 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4584 : State.get(Operands.getOperand(0), Part); 4585 4586 // Collect all the indices for the new GEP. If any index is 4587 // loop-invariant, we won't broadcast it. 4588 SmallVector<Value *, 4> Indices; 4589 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4590 VPValue *Operand = Operands.getOperand(I); 4591 if (IsIndexLoopInvariant[I - 1]) 4592 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4593 else 4594 Indices.push_back(State.get(Operand, Part)); 4595 } 4596 4597 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4598 // but it should be a vector, otherwise. 4599 auto *NewGEP = 4600 GEP->isInBounds() 4601 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4602 Indices) 4603 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4604 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4605 "NewGEP is not a pointer vector"); 4606 State.set(VPDef, NewGEP, Part); 4607 addMetadata(NewGEP, GEP); 4608 } 4609 } 4610 } 4611 4612 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4613 RecurrenceDescriptor *RdxDesc, 4614 VPValue *StartVPV, VPValue *Def, 4615 VPTransformState &State) { 4616 PHINode *P = cast<PHINode>(PN); 4617 if (EnableVPlanNativePath) { 4618 // Currently we enter here in the VPlan-native path for non-induction 4619 // PHIs where all control flow is uniform. We simply widen these PHIs. 4620 // Create a vector phi with no operands - the vector phi operands will be 4621 // set at the end of vector code generation. 4622 Type *VecTy = (State.VF.isScalar()) 4623 ? 
PN->getType()
4624 : VectorType::get(PN->getType(), State.VF);
4625 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4626 State.set(Def, VecPhi, 0);
4627 OrigPHIsToFix.push_back(P);
4628
4629 return;
4630 }
4631
4632 assert(PN->getParent() == OrigLoop->getHeader() &&
4633 "Non-header phis should have been handled elsewhere");
4634
4635 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4636 // In order to support recurrences we need to be able to vectorize Phi nodes.
4637 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4638 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4639 // this value when we vectorize all of the instructions that use the PHI.
4640 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4641 Value *Iden = nullptr;
4642 bool ScalarPHI =
4643 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4644 Type *VecTy =
4645 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4646
4647 if (RdxDesc) {
4648 assert(Legal->isReductionVariable(P) && StartV &&
4649 "RdxDesc should only be set for reduction variables; in that case "
4650 "a StartV is also required");
4651 RecurKind RK = RdxDesc->getRecurrenceKind();
4652 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4653 // MinMax reductions have the start value as their identity.
4654 if (ScalarPHI) {
4655 Iden = StartV;
4656 } else {
4657 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4658 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4659 StartV = Iden =
4660 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4661 }
4662 } else {
4663 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4664 RK, VecTy->getScalarType());
4665 Iden = IdenC;
4666
4667 if (!ScalarPHI) {
4668 Iden = ConstantVector::getSplat(State.VF, IdenC);
4669 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4670 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4671 Constant *Zero = Builder.getInt32(0);
4672 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4673 }
4674 }
4675 }
4676
4677 for (unsigned Part = 0; Part < State.UF; ++Part) {
4678 // This is phase one of vectorizing PHIs.
4679 Value *EntryPart = PHINode::Create(
4680 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4681 State.set(Def, EntryPart, Part);
4682 if (StartV) {
4683 // Make sure to add the reduction start value only to the
4684 // first unroll part.
4685 Value *StartVal = (Part == 0) ? StartV : Iden;
4686 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4687 }
4688 }
4689 return;
4690 }
4691
4692 assert(!Legal->isReductionVariable(P) &&
4693 "reductions should be handled above");
4694
4695 setDebugLocFromInst(Builder, P);
4696
4697 // This PHINode must be an induction variable.
4698 // Make sure that we know about it.
4699 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4700
4701 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4702 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4703
4704 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4705 // which can be found from the original scalar operations.
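// Illustrative sketch of the pointer-induction case handled below (shorthand
// IR, VF = 4, UF = 1, i32 elements; names follow the ones created below). A
// non-scalarized pointer induction is built as:
//   %pointer.phi = phi i32* [ %start, %vector.ph ], [ %ptr.ind, %latch ]
//   %vector.gep  = getelementptr i32, i32* %pointer.phi,
//                                <4 x i64> (<0, 1, 2, 3> * %step)
//   %ptr.ind     = getelementptr i32, i32* %pointer.phi, i64 (4 * %step)
// The scalarized form instead emits one "next.gep" per demanded lane from the
// normalized induction.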
4706 switch (II.getKind()) { 4707 case InductionDescriptor::IK_NoInduction: 4708 llvm_unreachable("Unknown induction"); 4709 case InductionDescriptor::IK_IntInduction: 4710 case InductionDescriptor::IK_FpInduction: 4711 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4712 case InductionDescriptor::IK_PtrInduction: { 4713 // Handle the pointer induction variable case. 4714 assert(P->getType()->isPointerTy() && "Unexpected type."); 4715 4716 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4717 // This is the normalized GEP that starts counting at zero. 4718 Value *PtrInd = 4719 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4720 // Determine the number of scalars we need to generate for each unroll 4721 // iteration. If the instruction is uniform, we only need to generate the 4722 // first lane. Otherwise, we generate all VF values. 4723 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF) 4724 ? 1 4725 : State.VF.getKnownMinValue(); 4726 for (unsigned Part = 0; Part < UF; ++Part) { 4727 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4728 Constant *Idx = ConstantInt::get( 4729 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue()); 4730 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4731 Value *SclrGep = 4732 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4733 SclrGep->setName("next.gep"); 4734 State.set(Def, SclrGep, VPIteration(Part, Lane)); 4735 } 4736 } 4737 return; 4738 } 4739 assert(isa<SCEVConstant>(II.getStep()) && 4740 "Induction step not a SCEV constant!"); 4741 Type *PhiType = II.getStep()->getType(); 4742 4743 // Build a pointer phi 4744 Value *ScalarStartValue = II.getStartValue(); 4745 Type *ScStValueType = ScalarStartValue->getType(); 4746 PHINode *NewPointerPhi = 4747 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4748 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4749 4750 // A pointer induction, performed by using a gep 4751 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4752 Instruction *InductionLoc = LoopLatch->getTerminator(); 4753 const SCEV *ScalarStep = II.getStep(); 4754 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4755 Value *ScalarStepValue = 4756 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4757 Value *InductionGEP = GetElementPtrInst::Create( 4758 ScStValueType->getPointerElementType(), NewPointerPhi, 4759 Builder.CreateMul( 4760 ScalarStepValue, 4761 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)), 4762 "ptr.ind", InductionLoc); 4763 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4764 4765 // Create UF many actual address geps that use the pointer 4766 // phi as base and a vectorized version of the step value 4767 // (<step*0, ..., step*N>) as offset. 4768 for (unsigned Part = 0; Part < State.UF; ++Part) { 4769 SmallVector<Constant *, 8> Indices; 4770 // Create a vector of consecutive numbers from zero to VF. 
4771 for (unsigned i = 0; i < State.VF.getKnownMinValue(); ++i) 4772 Indices.push_back( 4773 ConstantInt::get(PhiType, i + Part * State.VF.getKnownMinValue())); 4774 Constant *StartOffset = ConstantVector::get(Indices); 4775 4776 Value *GEP = Builder.CreateGEP( 4777 ScStValueType->getPointerElementType(), NewPointerPhi, 4778 Builder.CreateMul(StartOffset, 4779 Builder.CreateVectorSplat( 4780 State.VF.getKnownMinValue(), ScalarStepValue), 4781 "vector.gep")); 4782 State.set(Def, GEP, Part); 4783 } 4784 } 4785 } 4786 } 4787 4788 /// A helper function for checking whether an integer division-related 4789 /// instruction may divide by zero (in which case it must be predicated if 4790 /// executed conditionally in the scalar code). 4791 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4792 /// Non-zero divisors that are non compile-time constants will not be 4793 /// converted into multiplication, so we will still end up scalarizing 4794 /// the division, but can do so w/o predication. 4795 static bool mayDivideByZero(Instruction &I) { 4796 assert((I.getOpcode() == Instruction::UDiv || 4797 I.getOpcode() == Instruction::SDiv || 4798 I.getOpcode() == Instruction::URem || 4799 I.getOpcode() == Instruction::SRem) && 4800 "Unexpected instruction"); 4801 Value *Divisor = I.getOperand(1); 4802 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4803 return !CInt || CInt->isZero(); 4804 } 4805 4806 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4807 VPUser &User, 4808 VPTransformState &State) { 4809 switch (I.getOpcode()) { 4810 case Instruction::Call: 4811 case Instruction::Br: 4812 case Instruction::PHI: 4813 case Instruction::GetElementPtr: 4814 case Instruction::Select: 4815 llvm_unreachable("This instruction is handled by a different recipe."); 4816 case Instruction::UDiv: 4817 case Instruction::SDiv: 4818 case Instruction::SRem: 4819 case Instruction::URem: 4820 case Instruction::Add: 4821 case Instruction::FAdd: 4822 case Instruction::Sub: 4823 case Instruction::FSub: 4824 case Instruction::FNeg: 4825 case Instruction::Mul: 4826 case Instruction::FMul: 4827 case Instruction::FDiv: 4828 case Instruction::FRem: 4829 case Instruction::Shl: 4830 case Instruction::LShr: 4831 case Instruction::AShr: 4832 case Instruction::And: 4833 case Instruction::Or: 4834 case Instruction::Xor: { 4835 // Just widen unops and binops. 4836 setDebugLocFromInst(Builder, &I); 4837 4838 for (unsigned Part = 0; Part < UF; ++Part) { 4839 SmallVector<Value *, 2> Ops; 4840 for (VPValue *VPOp : User.operands()) 4841 Ops.push_back(State.get(VPOp, Part)); 4842 4843 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4844 4845 if (auto *VecOp = dyn_cast<Instruction>(V)) 4846 VecOp->copyIRFlags(&I); 4847 4848 // Use this vector value for all users of the original instruction. 4849 State.set(Def, V, Part); 4850 addMetadata(V, &I); 4851 } 4852 4853 break; 4854 } 4855 case Instruction::ICmp: 4856 case Instruction::FCmp: { 4857 // Widen compares. Generate vector compares. 4858 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4859 auto *Cmp = cast<CmpInst>(&I); 4860 setDebugLocFromInst(Builder, Cmp); 4861 for (unsigned Part = 0; Part < UF; ++Part) { 4862 Value *A = State.get(User.getOperand(0), Part); 4863 Value *B = State.get(User.getOperand(1), Part); 4864 Value *C = nullptr; 4865 if (FCmp) { 4866 // Propagate fast math flags. 
4867 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4868 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4869 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4870 } else { 4871 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4872 } 4873 State.set(Def, C, Part); 4874 addMetadata(C, &I); 4875 } 4876 4877 break; 4878 } 4879 4880 case Instruction::ZExt: 4881 case Instruction::SExt: 4882 case Instruction::FPToUI: 4883 case Instruction::FPToSI: 4884 case Instruction::FPExt: 4885 case Instruction::PtrToInt: 4886 case Instruction::IntToPtr: 4887 case Instruction::SIToFP: 4888 case Instruction::UIToFP: 4889 case Instruction::Trunc: 4890 case Instruction::FPTrunc: 4891 case Instruction::BitCast: { 4892 auto *CI = cast<CastInst>(&I); 4893 setDebugLocFromInst(Builder, CI); 4894 4895 /// Vectorize casts. 4896 Type *DestTy = 4897 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4898 4899 for (unsigned Part = 0; Part < UF; ++Part) { 4900 Value *A = State.get(User.getOperand(0), Part); 4901 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4902 State.set(Def, Cast, Part); 4903 addMetadata(Cast, &I); 4904 } 4905 break; 4906 } 4907 default: 4908 // This instruction is not vectorized by simple widening. 4909 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4910 llvm_unreachable("Unhandled instruction!"); 4911 } // end of switch. 4912 } 4913 4914 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4915 VPUser &ArgOperands, 4916 VPTransformState &State) { 4917 assert(!isa<DbgInfoIntrinsic>(I) && 4918 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4919 setDebugLocFromInst(Builder, &I); 4920 4921 Module *M = I.getParent()->getParent()->getParent(); 4922 auto *CI = cast<CallInst>(&I); 4923 4924 SmallVector<Type *, 4> Tys; 4925 for (Value *ArgOperand : CI->arg_operands()) 4926 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4927 4928 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4929 4930 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4931 // version of the instruction. 4932 // Is it beneficial to perform intrinsic call compared to lib call? 4933 bool NeedToScalarize = false; 4934 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4935 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4936 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4937 assert((UseVectorIntrinsic || !NeedToScalarize) && 4938 "Instruction should be scalarized elsewhere."); 4939 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4940 "Cannot have invalid costs while widening"); 4941 4942 for (unsigned Part = 0; Part < UF; ++Part) { 4943 SmallVector<Value *, 4> Args; 4944 for (auto &I : enumerate(ArgOperands.operands())) { 4945 // Some intrinsics have a scalar argument - don't replace it with a 4946 // vector. 4947 Value *Arg; 4948 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4949 Arg = State.get(I.value(), Part); 4950 else 4951 Arg = State.get(I.value(), VPIteration(0, 0)); 4952 Args.push_back(Arg); 4953 } 4954 4955 Function *VectorF; 4956 if (UseVectorIntrinsic) { 4957 // Use vector version of the intrinsic. 
4958 Type *TysForDecl[] = {CI->getType()}; 4959 if (VF.isVector()) 4960 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4961 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4962 assert(VectorF && "Can't retrieve vector intrinsic."); 4963 } else { 4964 // Use vector version of the function call. 4965 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4966 #ifndef NDEBUG 4967 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4968 "Can't create vector function."); 4969 #endif 4970 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4971 } 4972 SmallVector<OperandBundleDef, 1> OpBundles; 4973 CI->getOperandBundlesAsDefs(OpBundles); 4974 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4975 4976 if (isa<FPMathOperator>(V)) 4977 V->copyFastMathFlags(CI); 4978 4979 State.set(Def, V, Part); 4980 addMetadata(V, &I); 4981 } 4982 } 4983 4984 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4985 VPUser &Operands, 4986 bool InvariantCond, 4987 VPTransformState &State) { 4988 setDebugLocFromInst(Builder, &I); 4989 4990 // The condition can be loop invariant but still defined inside the 4991 // loop. This means that we can't just use the original 'cond' value. 4992 // We have to take the 'vectorized' value and pick the first lane. 4993 // Instcombine will make this a no-op. 4994 auto *InvarCond = InvariantCond 4995 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4996 : nullptr; 4997 4998 for (unsigned Part = 0; Part < UF; ++Part) { 4999 Value *Cond = 5000 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5001 Value *Op0 = State.get(Operands.getOperand(1), Part); 5002 Value *Op1 = State.get(Operands.getOperand(2), Part); 5003 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5004 State.set(VPDef, Sel, Part); 5005 addMetadata(Sel, &I); 5006 } 5007 } 5008 5009 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5010 // We should not collect Scalars more than once per VF. Right now, this 5011 // function is called from collectUniformsAndScalars(), which already does 5012 // this check. Collecting Scalars for VF=1 does not make any sense. 5013 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5014 "This function should not be visited twice for the same VF"); 5015 5016 SmallSetVector<Instruction *, 8> Worklist; 5017 5018 // These sets are used to seed the analysis with pointers used by memory 5019 // accesses that will remain scalar. 5020 SmallSetVector<Instruction *, 8> ScalarPtrs; 5021 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5022 auto *Latch = TheLoop->getLoopLatch(); 5023 5024 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5025 // The pointer operands of loads and stores will be scalar as long as the 5026 // memory access is not a gather or scatter operation. The value operand of a 5027 // store will remain scalar if the store is scalarized. 
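// For example (VF = 4):
//   a[i]    = x;  // consecutive store: widened, so its pointer operand
//                 // remains a scalar use
//   b[c[i]] = x;  // scatter: needs a vector of pointers, so its pointer
//                 // operand is not a scalar use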
5028 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5029 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5030 assert(WideningDecision != CM_Unknown && 5031 "Widening decision should be ready at this moment"); 5032 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5033 if (Ptr == Store->getValueOperand()) 5034 return WideningDecision == CM_Scalarize; 5035 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5036 "Ptr is neither a value or pointer operand"); 5037 return WideningDecision != CM_GatherScatter; 5038 }; 5039 5040 // A helper that returns true if the given value is a bitcast or 5041 // getelementptr instruction contained in the loop. 5042 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5043 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5044 isa<GetElementPtrInst>(V)) && 5045 !TheLoop->isLoopInvariant(V); 5046 }; 5047 5048 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5049 if (!isa<PHINode>(Ptr) || 5050 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5051 return false; 5052 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5053 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5054 return false; 5055 return isScalarUse(MemAccess, Ptr); 5056 }; 5057 5058 // A helper that evaluates a memory access's use of a pointer. If the 5059 // pointer is actually the pointer induction of a loop, it is being 5060 // inserted into Worklist. If the use will be a scalar use, and the 5061 // pointer is only used by memory accesses, we place the pointer in 5062 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5063 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5064 if (isScalarPtrInduction(MemAccess, Ptr)) { 5065 Worklist.insert(cast<Instruction>(Ptr)); 5066 Instruction *Update = cast<Instruction>( 5067 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5068 Worklist.insert(Update); 5069 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5070 << "\n"); 5071 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5072 << "\n"); 5073 return; 5074 } 5075 // We only care about bitcast and getelementptr instructions contained in 5076 // the loop. 5077 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5078 return; 5079 5080 // If the pointer has already been identified as scalar (e.g., if it was 5081 // also identified as uniform), there's nothing to do. 5082 auto *I = cast<Instruction>(Ptr); 5083 if (Worklist.count(I)) 5084 return; 5085 5086 // If the use of the pointer will be a scalar use, and all users of the 5087 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5088 // place the pointer in PossibleNonScalarPtrs. 5089 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5090 return isa<LoadInst>(U) || isa<StoreInst>(U); 5091 })) 5092 ScalarPtrs.insert(I); 5093 else 5094 PossibleNonScalarPtrs.insert(I); 5095 }; 5096 5097 // We seed the scalars analysis with three classes of instructions: (1) 5098 // instructions marked uniform-after-vectorization and (2) bitcast, 5099 // getelementptr and (pointer) phi instructions used by memory accesses 5100 // requiring a scalar use. 5101 // 5102 // (1) Add to the worklist all instructions that have been identified as 5103 // uniform-after-vectorization. 
5104 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5105 5106 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5107 // memory accesses requiring a scalar use. The pointer operands of loads and 5108 // stores will be scalar as long as the memory accesses is not a gather or 5109 // scatter operation. The value operand of a store will remain scalar if the 5110 // store is scalarized. 5111 for (auto *BB : TheLoop->blocks()) 5112 for (auto &I : *BB) { 5113 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5114 evaluatePtrUse(Load, Load->getPointerOperand()); 5115 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5116 evaluatePtrUse(Store, Store->getPointerOperand()); 5117 evaluatePtrUse(Store, Store->getValueOperand()); 5118 } 5119 } 5120 for (auto *I : ScalarPtrs) 5121 if (!PossibleNonScalarPtrs.count(I)) { 5122 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5123 Worklist.insert(I); 5124 } 5125 5126 // Insert the forced scalars. 5127 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5128 // induction variable when the PHI user is scalarized. 5129 auto ForcedScalar = ForcedScalars.find(VF); 5130 if (ForcedScalar != ForcedScalars.end()) 5131 for (auto *I : ForcedScalar->second) 5132 Worklist.insert(I); 5133 5134 // Expand the worklist by looking through any bitcasts and getelementptr 5135 // instructions we've already identified as scalar. This is similar to the 5136 // expansion step in collectLoopUniforms(); however, here we're only 5137 // expanding to include additional bitcasts and getelementptr instructions. 5138 unsigned Idx = 0; 5139 while (Idx != Worklist.size()) { 5140 Instruction *Dst = Worklist[Idx++]; 5141 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5142 continue; 5143 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5144 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5145 auto *J = cast<Instruction>(U); 5146 return !TheLoop->contains(J) || Worklist.count(J) || 5147 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5148 isScalarUse(J, Src)); 5149 })) { 5150 Worklist.insert(Src); 5151 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5152 } 5153 } 5154 5155 // An induction variable will remain scalar if all users of the induction 5156 // variable and induction variable update remain scalar. 5157 for (auto &Induction : Legal->getInductionVars()) { 5158 auto *Ind = Induction.first; 5159 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5160 5161 // If tail-folding is applied, the primary induction variable will be used 5162 // to feed a vector compare. 5163 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5164 continue; 5165 5166 // Determine if all users of the induction variable are scalar after 5167 // vectorization. 5168 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5169 auto *I = cast<Instruction>(U); 5170 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5171 }); 5172 if (!ScalarInd) 5173 continue; 5174 5175 // Determine if all users of the induction variable update instruction are 5176 // scalar after vectorization. 5177 auto ScalarIndUpdate = 5178 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5179 auto *I = cast<Instruction>(U); 5180 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5181 }); 5182 if (!ScalarIndUpdate) 5183 continue; 5184 5185 // The induction variable and its update instruction will remain scalar. 
5186 Worklist.insert(Ind); 5187 Worklist.insert(IndUpdate); 5188 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5189 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5190 << "\n"); 5191 } 5192 5193 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5194 } 5195 5196 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5197 ElementCount VF) { 5198 if (!blockNeedsPredication(I->getParent())) 5199 return false; 5200 switch(I->getOpcode()) { 5201 default: 5202 break; 5203 case Instruction::Load: 5204 case Instruction::Store: { 5205 if (!Legal->isMaskRequired(I)) 5206 return false; 5207 auto *Ptr = getLoadStorePointerOperand(I); 5208 auto *Ty = getMemInstValueType(I); 5209 // We have already decided how to vectorize this instruction, get that 5210 // result. 5211 if (VF.isVector()) { 5212 InstWidening WideningDecision = getWideningDecision(I, VF); 5213 assert(WideningDecision != CM_Unknown && 5214 "Widening decision should be ready at this moment"); 5215 return WideningDecision == CM_Scalarize; 5216 } 5217 const Align Alignment = getLoadStoreAlignment(I); 5218 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5219 isLegalMaskedGather(Ty, Alignment)) 5220 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5221 isLegalMaskedScatter(Ty, Alignment)); 5222 } 5223 case Instruction::UDiv: 5224 case Instruction::SDiv: 5225 case Instruction::SRem: 5226 case Instruction::URem: 5227 return mayDivideByZero(*I); 5228 } 5229 return false; 5230 } 5231 5232 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5233 Instruction *I, ElementCount VF) { 5234 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5235 assert(getWideningDecision(I, VF) == CM_Unknown && 5236 "Decision should not be set yet."); 5237 auto *Group = getInterleavedAccessGroup(I); 5238 assert(Group && "Must have a group."); 5239 5240 // If the instruction's allocated size doesn't equal it's type size, it 5241 // requires padding and will be scalarized. 5242 auto &DL = I->getModule()->getDataLayout(); 5243 auto *ScalarTy = getMemInstValueType(I); 5244 if (hasIrregularType(ScalarTy, DL)) 5245 return false; 5246 5247 // Check if masking is required. 5248 // A Group may need masking for one of two reasons: it resides in a block that 5249 // needs predication, or it was decided to use masking to deal with gaps. 5250 bool PredicatedAccessRequiresMasking = 5251 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5252 bool AccessWithGapsRequiresMasking = 5253 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5254 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5255 return true; 5256 5257 // If masked interleaving is required, we expect that the user/target had 5258 // enabled it, because otherwise it either wouldn't have been created or 5259 // it should have been invalidated by the CostModel. 5260 assert(useMaskedInterleavedAccesses(TTI) && 5261 "Masked interleave-groups for predicated accesses are not enabled."); 5262 5263 auto *Ty = getMemInstValueType(I); 5264 const Align Alignment = getLoadStoreAlignment(I); 5265 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5266 : TTI.isLegalMaskedStore(Ty, Alignment); 5267 } 5268 5269 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5270 Instruction *I, ElementCount VF) { 5271 // Get and ensure we have a valid memory instruction. 
5272 LoadInst *LI = dyn_cast<LoadInst>(I); 5273 StoreInst *SI = dyn_cast<StoreInst>(I); 5274 assert((LI || SI) && "Invalid memory instruction"); 5275 5276 auto *Ptr = getLoadStorePointerOperand(I); 5277 5278 // In order to be widened, the pointer should be consecutive, first of all. 5279 if (!Legal->isConsecutivePtr(Ptr)) 5280 return false; 5281 5282 // If the instruction is a store located in a predicated block, it will be 5283 // scalarized. 5284 if (isScalarWithPredication(I)) 5285 return false; 5286 5287 // If the instruction's allocated size doesn't equal it's type size, it 5288 // requires padding and will be scalarized. 5289 auto &DL = I->getModule()->getDataLayout(); 5290 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5291 if (hasIrregularType(ScalarTy, DL)) 5292 return false; 5293 5294 return true; 5295 } 5296 5297 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5298 // We should not collect Uniforms more than once per VF. Right now, 5299 // this function is called from collectUniformsAndScalars(), which 5300 // already does this check. Collecting Uniforms for VF=1 does not make any 5301 // sense. 5302 5303 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5304 "This function should not be visited twice for the same VF"); 5305 5306 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5307 // not analyze again. Uniforms.count(VF) will return 1. 5308 Uniforms[VF].clear(); 5309 5310 // We now know that the loop is vectorizable! 5311 // Collect instructions inside the loop that will remain uniform after 5312 // vectorization. 5313 5314 // Global values, params and instructions outside of current loop are out of 5315 // scope. 5316 auto isOutOfScope = [&](Value *V) -> bool { 5317 Instruction *I = dyn_cast<Instruction>(V); 5318 return (!I || !TheLoop->contains(I)); 5319 }; 5320 5321 SetVector<Instruction *> Worklist; 5322 BasicBlock *Latch = TheLoop->getLoopLatch(); 5323 5324 // Instructions that are scalar with predication must not be considered 5325 // uniform after vectorization, because that would create an erroneous 5326 // replicating region where only a single instance out of VF should be formed. 5327 // TODO: optimize such seldom cases if found important, see PR40816. 5328 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5329 if (isOutOfScope(I)) { 5330 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5331 << *I << "\n"); 5332 return; 5333 } 5334 if (isScalarWithPredication(I, VF)) { 5335 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5336 << *I << "\n"); 5337 return; 5338 } 5339 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5340 Worklist.insert(I); 5341 }; 5342 5343 // Start with the conditional branch. If the branch condition is an 5344 // instruction contained in the loop that is only used by the branch, it is 5345 // uniform. 5346 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5347 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5348 addToWorklistIfAllowed(Cmp); 5349 5350 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5351 InstWidening WideningDecision = getWideningDecision(I, VF); 5352 assert(WideningDecision != CM_Unknown && 5353 "Widening decision should be ready at this moment"); 5354 5355 // A uniform memory op is itself uniform. We exclude uniform stores 5356 // here as they demand the last lane, not the first one. 
5357 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5358 assert(WideningDecision == CM_Scalarize); 5359 return true; 5360 } 5361 5362 return (WideningDecision == CM_Widen || 5363 WideningDecision == CM_Widen_Reverse || 5364 WideningDecision == CM_Interleave); 5365 }; 5366 5367 5368 // Returns true if Ptr is the pointer operand of a memory access instruction 5369 // I, and I is known to not require scalarization. 5370 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5371 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5372 }; 5373 5374 // Holds a list of values which are known to have at least one uniform use. 5375 // Note that there may be other uses which aren't uniform. A "uniform use" 5376 // here is something which only demands lane 0 of the unrolled iterations; 5377 // it does not imply that all lanes produce the same value (e.g. this is not 5378 // the usual meaning of uniform) 5379 SmallPtrSet<Value *, 8> HasUniformUse; 5380 5381 // Scan the loop for instructions which are either a) known to have only 5382 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5383 for (auto *BB : TheLoop->blocks()) 5384 for (auto &I : *BB) { 5385 // If there's no pointer operand, there's nothing to do. 5386 auto *Ptr = getLoadStorePointerOperand(&I); 5387 if (!Ptr) 5388 continue; 5389 5390 // A uniform memory op is itself uniform. We exclude uniform stores 5391 // here as they demand the last lane, not the first one. 5392 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5393 addToWorklistIfAllowed(&I); 5394 5395 if (isUniformDecision(&I, VF)) { 5396 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5397 HasUniformUse.insert(Ptr); 5398 } 5399 } 5400 5401 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5402 // demanding) users. Since loops are assumed to be in LCSSA form, this 5403 // disallows uses outside the loop as well. 5404 for (auto *V : HasUniformUse) { 5405 if (isOutOfScope(V)) 5406 continue; 5407 auto *I = cast<Instruction>(V); 5408 auto UsersAreMemAccesses = 5409 llvm::all_of(I->users(), [&](User *U) -> bool { 5410 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5411 }); 5412 if (UsersAreMemAccesses) 5413 addToWorklistIfAllowed(I); 5414 } 5415 5416 // Expand Worklist in topological order: whenever a new instruction 5417 // is added , its users should be already inside Worklist. It ensures 5418 // a uniform instruction will only be used by uniform instructions. 5419 unsigned idx = 0; 5420 while (idx != Worklist.size()) { 5421 Instruction *I = Worklist[idx++]; 5422 5423 for (auto OV : I->operand_values()) { 5424 // isOutOfScope operands cannot be uniform instructions. 5425 if (isOutOfScope(OV)) 5426 continue; 5427 // First order recurrence Phi's should typically be considered 5428 // non-uniform. 5429 auto *OP = dyn_cast<PHINode>(OV); 5430 if (OP && Legal->isFirstOrderRecurrence(OP)) 5431 continue; 5432 // If all the users of the operand are uniform, then add the 5433 // operand into the uniform worklist. 5434 auto *OI = cast<Instruction>(OV); 5435 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5436 auto *J = cast<Instruction>(U); 5437 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5438 })) 5439 addToWorklistIfAllowed(OI); 5440 } 5441 } 5442 5443 // For an instruction to be added into Worklist above, all its users inside 5444 // the loop should also be in Worklist. 
However, this condition cannot be 5445 // true for phi nodes that form a cyclic dependence. We must process phi 5446 // nodes separately. An induction variable will remain uniform if all users 5447 // of the induction variable and induction variable update remain uniform. 5448 // The code below handles both pointer and non-pointer induction variables. 5449 for (auto &Induction : Legal->getInductionVars()) { 5450 auto *Ind = Induction.first; 5451 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5452 5453 // Determine if all users of the induction variable are uniform after 5454 // vectorization. 5455 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5456 auto *I = cast<Instruction>(U); 5457 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5458 isVectorizedMemAccessUse(I, Ind); 5459 }); 5460 if (!UniformInd) 5461 continue; 5462 5463 // Determine if all users of the induction variable update instruction are 5464 // uniform after vectorization. 5465 auto UniformIndUpdate = 5466 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5467 auto *I = cast<Instruction>(U); 5468 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5469 isVectorizedMemAccessUse(I, IndUpdate); 5470 }); 5471 if (!UniformIndUpdate) 5472 continue; 5473 5474 // The induction variable and its update instruction will remain uniform. 5475 addToWorklistIfAllowed(Ind); 5476 addToWorklistIfAllowed(IndUpdate); 5477 } 5478 5479 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5480 } 5481 5482 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5483 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5484 5485 if (Legal->getRuntimePointerChecking()->Need) { 5486 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5487 "runtime pointer checks needed. Enable vectorization of this " 5488 "loop with '#pragma clang loop vectorize(enable)' when " 5489 "compiling with -Os/-Oz", 5490 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5491 return true; 5492 } 5493 5494 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5495 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5496 "runtime SCEV checks needed. Enable vectorization of this " 5497 "loop with '#pragma clang loop vectorize(enable)' when " 5498 "compiling with -Os/-Oz", 5499 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5500 return true; 5501 } 5502 5503 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5504 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5505 reportVectorizationFailure("Runtime stride check for small trip count", 5506 "runtime stride == 1 checks needed. Enable vectorization of " 5507 "this loop without such check by compiling with -Os/-Oz", 5508 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5509 return true; 5510 } 5511 5512 return false; 5513 } 5514 5515 Optional<ElementCount> 5516 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5517 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5518 // TODO: It may by useful to do since it's still likely to be dynamically 5519 // uniform if the target can skip. 5520 reportVectorizationFailure( 5521 "Not inserting runtime ptr check for divergent target", 5522 "runtime pointer checks needed. 
      Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
    if (runtimeChecksRequired())
      return None;

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return None;
  }

  // Now try tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
  assert(!MaxVF.isScalable() &&
         "Scalable vectors do not yet support tail folding");
  assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
         "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC =
      UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we choose.
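  // Illustrative example (values assumed, not computed here): for a trip
  // count of 64 with MaxVF = 8 and UserIC = 2, MaxVFtimesIC is 16 and
  // 64 urem 16 == 0, so no scalar tail remains and MaxVF can be returned
  // directly; a trip count of 70 would leave a remainder of 6 and fall
  // through to the tail-folding logic below.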
5602 ScalarEvolution *SE = PSE.getSE(); 5603 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5604 const SCEV *ExitCount = SE->getAddExpr( 5605 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5606 const SCEV *Rem = SE->getURemExpr( 5607 SE->applyLoopGuards(ExitCount, TheLoop), 5608 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5609 if (Rem->isZero()) { 5610 // Accept MaxVF if we do not have a tail. 5611 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5612 return MaxVF; 5613 } 5614 5615 // If we don't know the precise trip count, or if the trip count that we 5616 // found modulo the vectorization factor is not zero, try to fold the tail 5617 // by masking. 5618 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5619 if (Legal->prepareToFoldTailByMasking()) { 5620 FoldTailByMasking = true; 5621 return MaxVF; 5622 } 5623 5624 // If there was a tail-folding hint/switch, but we can't fold the tail by 5625 // masking, fallback to a vectorization with a scalar epilogue. 5626 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5627 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5628 "scalar epilogue instead.\n"); 5629 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5630 return MaxVF; 5631 } 5632 5633 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5634 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5635 return None; 5636 } 5637 5638 if (TC == 0) { 5639 reportVectorizationFailure( 5640 "Unable to calculate the loop count due to complex control flow", 5641 "unable to calculate the loop count due to complex control flow", 5642 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5643 return None; 5644 } 5645 5646 reportVectorizationFailure( 5647 "Cannot optimize for size and vectorize at the same time.", 5648 "cannot optimize for size and vectorize at the same time. " 5649 "Enable vectorization of this loop with '#pragma clang loop " 5650 "vectorize(enable)' when compiling with -Os/-Oz", 5651 "NoTailLoopWithOptForSize", ORE, TheLoop); 5652 return None; 5653 } 5654 5655 ElementCount 5656 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5657 ElementCount UserVF) { 5658 bool IgnoreScalableUserVF = UserVF.isScalable() && 5659 !TTI.supportsScalableVectors() && 5660 !ForceTargetSupportsScalableVectors; 5661 if (IgnoreScalableUserVF) { 5662 LLVM_DEBUG( 5663 dbgs() << "LV: Ignoring VF=" << UserVF 5664 << " because target does not support scalable vectors.\n"); 5665 ORE->emit([&]() { 5666 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5667 TheLoop->getStartLoc(), 5668 TheLoop->getHeader()) 5669 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5670 << " because target does not support scalable vectors."; 5671 }); 5672 } 5673 5674 // Beyond this point two scenarios are handled. If UserVF isn't specified 5675 // then a suitable VF is chosen. If UserVF is specified and there are 5676 // dependencies, check if it's legal. However, if a UserVF is specified and 5677 // there are no dependencies, then there's nothing to do. 5678 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5679 if (!canVectorizeReductions(UserVF)) { 5680 reportVectorizationFailure( 5681 "LV: Scalable vectorization not supported for the reduction " 5682 "operations found in this loop. 
Using fixed-width " 5683 "vectorization instead.", 5684 "Scalable vectorization not supported for the reduction operations " 5685 "found in this loop. Using fixed-width vectorization instead.", 5686 "ScalableVFUnfeasible", ORE, TheLoop); 5687 return computeFeasibleMaxVF( 5688 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5689 } 5690 5691 if (Legal->isSafeForAnyVectorWidth()) 5692 return UserVF; 5693 } 5694 5695 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5696 unsigned SmallestType, WidestType; 5697 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5698 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5699 5700 // Get the maximum safe dependence distance in bits computed by LAA. 5701 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5702 // the memory accesses that is most restrictive (involved in the smallest 5703 // dependence distance). 5704 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5705 5706 // If the user vectorization factor is legally unsafe, clamp it to a safe 5707 // value. Otherwise, return as is. 5708 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5709 unsigned MaxSafeElements = 5710 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5711 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5712 5713 if (UserVF.isScalable()) { 5714 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5715 5716 // Scale VF by vscale before checking if it's safe. 5717 MaxSafeVF = ElementCount::getScalable( 5718 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5719 5720 if (MaxSafeVF.isZero()) { 5721 // The dependence distance is too small to use scalable vectors, 5722 // fallback on fixed. 5723 LLVM_DEBUG( 5724 dbgs() 5725 << "LV: Max legal vector width too small, scalable vectorization " 5726 "unfeasible. Using fixed-width vectorization instead.\n"); 5727 ORE->emit([&]() { 5728 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5729 TheLoop->getStartLoc(), 5730 TheLoop->getHeader()) 5731 << "Max legal vector width too small, scalable vectorization " 5732 << "unfeasible. Using fixed-width vectorization instead."; 5733 }); 5734 return computeFeasibleMaxVF( 5735 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5736 } 5737 } 5738 5739 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5740 5741 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5742 return UserVF; 5743 5744 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5745 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5746 << ".\n"); 5747 ORE->emit([&]() { 5748 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5749 TheLoop->getStartLoc(), 5750 TheLoop->getHeader()) 5751 << "User-specified vectorization factor " 5752 << ore::NV("UserVectorizationFactor", UserVF) 5753 << " is unsafe, clamping to maximum safe vectorization factor " 5754 << ore::NV("VectorizationFactor", MaxSafeVF); 5755 }); 5756 return MaxSafeVF; 5757 } 5758 5759 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5760 5761 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5762 // Note that both WidestRegister and WidestType may not be a powers of 2. 
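  // For example (illustrative numbers only): with 256-bit vector registers
  // clamped to a 96-bit maximum safe dependence distance and a widest scalar
  // type of 32 bits, 96 / 32 = 3 elements, which PowerOf2Floor rounds down to
  // a fixed MaxVectorSize of 2.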
5763 auto MaxVectorSize = 5764 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); 5765 5766 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5767 << " / " << WidestType << " bits.\n"); 5768 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5769 << WidestRegister << " bits.\n"); 5770 5771 assert(MaxVectorSize.getFixedValue() <= WidestRegister && 5772 "Did not expect to pack so many elements" 5773 " into one vector!"); 5774 if (MaxVectorSize.getFixedValue() == 0) { 5775 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5776 return ElementCount::getFixed(1); 5777 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && 5778 isPowerOf2_32(ConstTripCount)) { 5779 // We need to clamp the VF to be the ConstTripCount. There is no point in 5780 // choosing a higher viable VF as done in the loop below. 5781 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5782 << ConstTripCount << "\n"); 5783 return ElementCount::getFixed(ConstTripCount); 5784 } 5785 5786 ElementCount MaxVF = MaxVectorSize; 5787 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5788 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5789 // Collect all viable vectorization factors larger than the default MaxVF 5790 // (i.e. MaxVectorSize). 5791 SmallVector<ElementCount, 8> VFs; 5792 auto MaxVectorSizeMaxBW = 5793 ElementCount::getFixed(WidestRegister / SmallestType); 5794 for (ElementCount VS = MaxVectorSize * 2; 5795 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) 5796 VFs.push_back(VS); 5797 5798 // For each VF calculate its register usage. 5799 auto RUs = calculateRegisterUsage(VFs); 5800 5801 // Select the largest VF which doesn't require more registers than existing 5802 // ones. 5803 for (int i = RUs.size() - 1; i >= 0; --i) { 5804 bool Selected = true; 5805 for (auto &pair : RUs[i].MaxLocalUsers) { 5806 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5807 if (pair.second > TargetNumRegisters) 5808 Selected = false; 5809 } 5810 if (Selected) { 5811 MaxVF = VFs[i]; 5812 break; 5813 } 5814 } 5815 if (ElementCount MinVF = 5816 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { 5817 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5818 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5819 << ") with target's minimum: " << MinVF << '\n'); 5820 MaxVF = MinVF; 5821 } 5822 } 5823 } 5824 return MaxVF; 5825 } 5826 5827 VectorizationFactor 5828 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5829 // FIXME: This can be fixed for scalable vectors later, because at this stage 5830 // the LoopVectorizer will only consider vectorizing a loop with scalable 5831 // vectors when the loop has a hint to enable vectorization for a given VF. 5832 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5833 5834 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5835 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5836 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5837 5838 auto Width = ElementCount::getFixed(1); 5839 const float ScalarCost = *ExpectedCost.getValue(); 5840 float Cost = ScalarCost; 5841 5842 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5843 if (ForceVectorization && MaxVF.isVector()) { 5844 // Ignore scalar width, because the user explicitly wants vectorization. 
5845 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5846 // evaluation. 5847 Cost = std::numeric_limits<float>::max(); 5848 } 5849 5850 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 5851 i *= 2) { 5852 // Notice that the vector loop needs to be executed less times, so 5853 // we need to divide the cost of the vector loops by the width of 5854 // the vector elements. 5855 VectorizationCostTy C = expectedCost(i); 5856 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5857 float VectorCost = *C.first.getValue() / (float)i.getFixedValue(); 5858 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5859 << " costs: " << (int)VectorCost << ".\n"); 5860 if (!C.second && !ForceVectorization) { 5861 LLVM_DEBUG( 5862 dbgs() << "LV: Not considering vector loop of width " << i 5863 << " because it will not generate any vector instructions.\n"); 5864 continue; 5865 } 5866 5867 // If profitable add it to ProfitableVF list. 5868 if (VectorCost < ScalarCost) { 5869 ProfitableVFs.push_back(VectorizationFactor( 5870 {i, (unsigned)VectorCost})); 5871 } 5872 5873 if (VectorCost < Cost) { 5874 Cost = VectorCost; 5875 Width = i; 5876 } 5877 } 5878 5879 if (!EnableCondStoresVectorization && NumPredStores) { 5880 reportVectorizationFailure("There are conditional stores.", 5881 "store that is conditionally executed prevents vectorization", 5882 "ConditionalStore", ORE, TheLoop); 5883 Width = ElementCount::getFixed(1); 5884 Cost = ScalarCost; 5885 } 5886 5887 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs() 5888 << "LV: Vectorization seems to be not beneficial, " 5889 << "but was forced by a user.\n"); 5890 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5891 VectorizationFactor Factor = {Width, 5892 (unsigned)(Width.getKnownMinValue() * Cost)}; 5893 return Factor; 5894 } 5895 5896 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5897 const Loop &L, ElementCount VF) const { 5898 // Cross iteration phis such as reductions need special handling and are 5899 // currently unsupported. 5900 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5901 return Legal->isFirstOrderRecurrence(&Phi) || 5902 Legal->isReductionVariable(&Phi); 5903 })) 5904 return false; 5905 5906 // Phis with uses outside of the loop require special handling and are 5907 // currently unsupported. 5908 for (auto &Entry : Legal->getInductionVars()) { 5909 // Look for uses of the value of the induction at the last iteration. 5910 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5911 for (User *U : PostInc->users()) 5912 if (!L.contains(cast<Instruction>(U))) 5913 return false; 5914 // Look for uses of penultimate value of the induction. 5915 for (User *U : Entry.first->users()) 5916 if (!L.contains(cast<Instruction>(U))) 5917 return false; 5918 } 5919 5920 // Induction variables that are widened require special handling that is 5921 // currently not supported. 
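  // For instance (an illustrative case, not an exhaustive rule): an induction
  // variable that is only used to form scalarized addresses typically remains
  // scalar after vectorization and does not disqualify the loop, whereas one
  // whose vector of lane values is consumed directly would be widened and
  // make the loop ineligible here.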
5922 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5923 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5924 this->isProfitableToScalarize(Entry.first, VF)); 5925 })) 5926 return false; 5927 5928 return true; 5929 } 5930 5931 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5932 const ElementCount VF) const { 5933 // FIXME: We need a much better cost-model to take different parameters such 5934 // as register pressure, code size increase and cost of extra branches into 5935 // account. For now we apply a very crude heuristic and only consider loops 5936 // with vectorization factors larger than a certain value. 5937 // We also consider epilogue vectorization unprofitable for targets that don't 5938 // consider interleaving beneficial (eg. MVE). 5939 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5940 return false; 5941 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5942 return true; 5943 return false; 5944 } 5945 5946 VectorizationFactor 5947 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5948 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5949 VectorizationFactor Result = VectorizationFactor::Disabled(); 5950 if (!EnableEpilogueVectorization) { 5951 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5952 return Result; 5953 } 5954 5955 if (!isScalarEpilogueAllowed()) { 5956 LLVM_DEBUG( 5957 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5958 "allowed.\n";); 5959 return Result; 5960 } 5961 5962 // FIXME: This can be fixed for scalable vectors later, because at this stage 5963 // the LoopVectorizer will only consider vectorizing a loop with scalable 5964 // vectors when the loop has a hint to enable vectorization for a given VF. 5965 if (MainLoopVF.isScalable()) { 5966 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5967 "yet supported.\n"); 5968 return Result; 5969 } 5970 5971 // Not really a cost consideration, but check for unsupported cases here to 5972 // simplify the logic. 
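  // After these checks, the selection below reuses the main cost model's
  // results: it picks the cheapest already-computed profitable VF that is
  // strictly smaller than the main loop VF and for which a VPlan exists.
  // As a sketch (assumed numbers): with a main loop VF of 8 and ProfitableVFs
  // containing {VF=2, cost 10} and {VF=4, cost 6}, the epilogue would be
  // vectorized with VF = 4.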
5973 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5974 LLVM_DEBUG( 5975 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5976 "not a supported candidate.\n";); 5977 return Result; 5978 } 5979 5980 if (EpilogueVectorizationForceVF > 1) { 5981 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5982 if (LVP.hasPlanWithVFs( 5983 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5984 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5985 else { 5986 LLVM_DEBUG( 5987 dbgs() 5988 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5989 return Result; 5990 } 5991 } 5992 5993 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5994 TheLoop->getHeader()->getParent()->hasMinSize()) { 5995 LLVM_DEBUG( 5996 dbgs() 5997 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5998 return Result; 5999 } 6000 6001 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6002 return Result; 6003 6004 for (auto &NextVF : ProfitableVFs) 6005 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6006 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 6007 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6008 Result = NextVF; 6009 6010 if (Result != VectorizationFactor::Disabled()) 6011 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6012 << Result.Width.getFixedValue() << "\n";); 6013 return Result; 6014 } 6015 6016 std::pair<unsigned, unsigned> 6017 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6018 unsigned MinWidth = -1U; 6019 unsigned MaxWidth = 8; 6020 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6021 6022 // For each block. 6023 for (BasicBlock *BB : TheLoop->blocks()) { 6024 // For each instruction in the loop. 6025 for (Instruction &I : BB->instructionsWithoutDebug()) { 6026 Type *T = I.getType(); 6027 6028 // Skip ignored values. 6029 if (ValuesToIgnore.count(&I)) 6030 continue; 6031 6032 // Only examine Loads, Stores and PHINodes. 6033 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6034 continue; 6035 6036 // Examine PHI nodes that are reduction variables. Update the type to 6037 // account for the recurrence type. 6038 if (auto *PN = dyn_cast<PHINode>(&I)) { 6039 if (!Legal->isReductionVariable(PN)) 6040 continue; 6041 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6042 if (PreferInLoopReductions || 6043 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6044 RdxDesc.getRecurrenceType(), 6045 TargetTransformInfo::ReductionFlags())) 6046 continue; 6047 T = RdxDesc.getRecurrenceType(); 6048 } 6049 6050 // Examine the stored values. 6051 if (auto *ST = dyn_cast<StoreInst>(&I)) 6052 T = ST->getValueOperand()->getType(); 6053 6054 // Ignore loaded pointer types and stored pointer types that are not 6055 // vectorizable. 6056 // 6057 // FIXME: The check here attempts to predict whether a load or store will 6058 // be vectorized. We only know this for certain after a VF has 6059 // been selected. Here, we assume that if an access can be 6060 // vectorized, it will be. We should also look at extending this 6061 // optimization to non-pointer types. 
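      // As an example (illustrative): a loop that loads i8 values and stores
      // i32 results ends up with MinWidth = 8 and MaxWidth = 32; the widest
      // type later bounds the default MaxVectorSize and the smallest type
      // bounds the "maximize bandwidth" candidate VFs.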
6062 // 6063 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6064 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6065 continue; 6066 6067 MinWidth = std::min(MinWidth, 6068 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6069 MaxWidth = std::max(MaxWidth, 6070 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6071 } 6072 } 6073 6074 return {MinWidth, MaxWidth}; 6075 } 6076 6077 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6078 unsigned LoopCost) { 6079 // -- The interleave heuristics -- 6080 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6081 // There are many micro-architectural considerations that we can't predict 6082 // at this level. For example, frontend pressure (on decode or fetch) due to 6083 // code size, or the number and capabilities of the execution ports. 6084 // 6085 // We use the following heuristics to select the interleave count: 6086 // 1. If the code has reductions, then we interleave to break the cross 6087 // iteration dependency. 6088 // 2. If the loop is really small, then we interleave to reduce the loop 6089 // overhead. 6090 // 3. We don't interleave if we think that we will spill registers to memory 6091 // due to the increased register pressure. 6092 6093 if (!isScalarEpilogueAllowed()) 6094 return 1; 6095 6096 // We used the distance for the interleave count. 6097 if (Legal->getMaxSafeDepDistBytes() != -1U) 6098 return 1; 6099 6100 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6101 const bool HasReductions = !Legal->getReductionVars().empty(); 6102 // Do not interleave loops with a relatively small known or estimated trip 6103 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6104 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6105 // because with the above conditions interleaving can expose ILP and break 6106 // cross iteration dependences for reductions. 6107 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6108 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6109 return 1; 6110 6111 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6112 // We divide by these constants so assume that we have at least one 6113 // instruction that uses at least one register. 6114 for (auto& pair : R.MaxLocalUsers) { 6115 pair.second = std::max(pair.second, 1U); 6116 } 6117 6118 // We calculate the interleave count using the following formula. 6119 // Subtract the number of loop invariants from the number of available 6120 // registers. These registers are used by all of the interleaved instances. 6121 // Next, divide the remaining registers by the number of registers that is 6122 // required by the loop, in order to estimate how many parallel instances 6123 // fit without causing spills. All of this is rounded down if necessary to be 6124 // a power of two. We want power of two interleave count to simplify any 6125 // addressing operations or alignment considerations. 6126 // We also want power of two interleave counts to ensure that the induction 6127 // variable of the vector loop wraps to zero, when tail is folded by masking; 6128 // this currently happens when OptForSize, in which case IC is set to 1 above. 
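  // Worked example (illustrative register counts): with 32 registers in a
  // class, 2 loop-invariant values and a maximum local usage of 5 registers,
  // the estimate is (32 - 2) / 5 = 6, rounded down to an interleave count of
  // 4; when the induction-variable heuristic is enabled it becomes
  // (32 - 2 - 1) / max(1, 5 - 1) = 7, which also rounds down to 4.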
6129 unsigned IC = UINT_MAX; 6130 6131 for (auto& pair : R.MaxLocalUsers) { 6132 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6133 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6134 << " registers of " 6135 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6136 if (VF.isScalar()) { 6137 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6138 TargetNumRegisters = ForceTargetNumScalarRegs; 6139 } else { 6140 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6141 TargetNumRegisters = ForceTargetNumVectorRegs; 6142 } 6143 unsigned MaxLocalUsers = pair.second; 6144 unsigned LoopInvariantRegs = 0; 6145 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6146 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6147 6148 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6149 // Don't count the induction variable as interleaved. 6150 if (EnableIndVarRegisterHeur) { 6151 TmpIC = 6152 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6153 std::max(1U, (MaxLocalUsers - 1))); 6154 } 6155 6156 IC = std::min(IC, TmpIC); 6157 } 6158 6159 // Clamp the interleave ranges to reasonable counts. 6160 unsigned MaxInterleaveCount = 6161 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6162 6163 // Check if the user has overridden the max. 6164 if (VF.isScalar()) { 6165 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6166 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6167 } else { 6168 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6169 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6170 } 6171 6172 // If trip count is known or estimated compile time constant, limit the 6173 // interleave count to be less than the trip count divided by VF, provided it 6174 // is at least 1. 6175 // 6176 // For scalable vectors we can't know if interleaving is beneficial. It may 6177 // not be beneficial for small loops if none of the lanes in the second vector 6178 // iterations is enabled. However, for larger loops, there is likely to be a 6179 // similar benefit as for fixed-width vectors. For now, we choose to leave 6180 // the InterleaveCount as if vscale is '1', although if some information about 6181 // the vector is known (e.g. min vector size), we can make a better decision. 6182 if (BestKnownTC) { 6183 MaxInterleaveCount = 6184 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6185 // Make sure MaxInterleaveCount is greater than 0. 6186 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6187 } 6188 6189 assert(MaxInterleaveCount > 0 && 6190 "Maximum interleave count must be greater than 0"); 6191 6192 // Clamp the calculated IC to be between the 1 and the max interleave count 6193 // that the target and trip count allows. 6194 if (IC > MaxInterleaveCount) 6195 IC = MaxInterleaveCount; 6196 else 6197 // Make sure IC is greater than 0. 6198 IC = std::max(1u, IC); 6199 6200 assert(IC > 0 && "Interleave count must be greater than 0."); 6201 6202 // If we did not calculate the cost for VF (because the user selected the VF) 6203 // then we calculate the cost of VF here. 6204 if (LoopCost == 0) { 6205 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6206 LoopCost = *expectedCost(VF).first.getValue(); 6207 } 6208 6209 assert(LoopCost && "Non-zero loop cost expected"); 6210 6211 // Interleave if we vectorized this loop and there is a reduction that could 6212 // benefit from interleaving. 
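  // Interleaving a vectorized reduction keeps IC independent partial results
  // (e.g. IC separate vector accumulators for a sum) that are only combined
  // after the loop, which shortens the cross-iteration dependence chain.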
6213 if (VF.isVector() && HasReductions) { 6214 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6215 return IC; 6216 } 6217 6218 // Note that if we've already vectorized the loop we will have done the 6219 // runtime check and so interleaving won't require further checks. 6220 bool InterleavingRequiresRuntimePointerCheck = 6221 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6222 6223 // We want to interleave small loops in order to reduce the loop overhead and 6224 // potentially expose ILP opportunities. 6225 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6226 << "LV: IC is " << IC << '\n' 6227 << "LV: VF is " << VF << '\n'); 6228 const bool AggressivelyInterleaveReductions = 6229 TTI.enableAggressiveInterleaving(HasReductions); 6230 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6231 // We assume that the cost overhead is 1 and we use the cost model 6232 // to estimate the cost of the loop and interleave until the cost of the 6233 // loop overhead is about 5% of the cost of the loop. 6234 unsigned SmallIC = 6235 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6236 6237 // Interleave until store/load ports (estimated by max interleave count) are 6238 // saturated. 6239 unsigned NumStores = Legal->getNumStores(); 6240 unsigned NumLoads = Legal->getNumLoads(); 6241 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6242 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6243 6244 // If we have a scalar reduction (vector reductions are already dealt with 6245 // by this point), we can increase the critical path length if the loop 6246 // we're interleaving is inside another loop. Limit, by default to 2, so the 6247 // critical path only gets increased by one reduction operation. 6248 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6249 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6250 SmallIC = std::min(SmallIC, F); 6251 StoresIC = std::min(StoresIC, F); 6252 LoadsIC = std::min(LoadsIC, F); 6253 } 6254 6255 if (EnableLoadStoreRuntimeInterleave && 6256 std::max(StoresIC, LoadsIC) > SmallIC) { 6257 LLVM_DEBUG( 6258 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6259 return std::max(StoresIC, LoadsIC); 6260 } 6261 6262 // If there are scalar reductions and TTI has enabled aggressive 6263 // interleaving for reductions, we will interleave to expose ILP. 6264 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6265 AggressivelyInterleaveReductions) { 6266 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6267 // Interleave no less than SmallIC but not as aggressive as the normal IC 6268 // to satisfy the rare situation when resources are too limited. 6269 return std::max(IC / 2, SmallIC); 6270 } else { 6271 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6272 return SmallIC; 6273 } 6274 } 6275 6276 // Interleave if this is a large loop (small loops are already dealt with by 6277 // this point) that could benefit from interleaving. 6278 if (AggressivelyInterleaveReductions) { 6279 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6280 return IC; 6281 } 6282 6283 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6284 return 1; 6285 } 6286 6287 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6288 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6289 // This function calculates the register usage by measuring the highest number 6290 // of values that are alive at a single location. 
  // Obviously, this is a very rough estimation. We scan the loop in
  // topological order and assign a number to each instruction. We use RPO to
  // ensure that defs are met before their users. We assume that each
  // instruction that has in-loop users starts an interval. We record every
  // time that an in-loop value is used, so we have a list of the first and
  // last occurrences of each instruction. Next, we transpose this data
  // structure into a multi map that holds the list of intervals that *end* at
  // a specific location. This multi map allows us to perform a linear search.
  // We scan the instructions linearly and record each time that a new
  // interval starts, by placing it in a set. If we find this value in the
  // multi-map then we remove it from the set. The max register usage is the
  // maximum size of the set. We also search for instructions that are defined
  // outside the loop, but are used inside the loop. We need this number
  // separately from the max-interval usage number because, when we unroll,
  // loop-invariant values do not take any more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps an index to its instruction.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
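  // For example (illustrative and target-dependent): for Ty = i32 and VF = 4
  // the query is made for <4 x i32>, which a target with 128-bit vector
  // registers would typically report as one register; token types and other
  // invalid vector element types are treated as using no registers.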
6366 const auto &TTICapture = TTI; 6367 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6368 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6369 return 0U; 6370 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6371 }; 6372 6373 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6374 Instruction *I = IdxToInstr[i]; 6375 6376 // Remove all of the instructions that end at this location. 6377 InstrList &List = TransposeEnds[i]; 6378 for (Instruction *ToRemove : List) 6379 OpenIntervals.erase(ToRemove); 6380 6381 // Ignore instructions that are never used within the loop. 6382 if (!Ends.count(I)) 6383 continue; 6384 6385 // Skip ignored values. 6386 if (ValuesToIgnore.count(I)) 6387 continue; 6388 6389 // For each VF find the maximum usage of registers. 6390 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6391 // Count the number of live intervals. 6392 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6393 6394 if (VFs[j].isScalar()) { 6395 for (auto Inst : OpenIntervals) { 6396 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6397 if (RegUsage.find(ClassID) == RegUsage.end()) 6398 RegUsage[ClassID] = 1; 6399 else 6400 RegUsage[ClassID] += 1; 6401 } 6402 } else { 6403 collectUniformsAndScalars(VFs[j]); 6404 for (auto Inst : OpenIntervals) { 6405 // Skip ignored values for VF > 1. 6406 if (VecValuesToIgnore.count(Inst)) 6407 continue; 6408 if (isScalarAfterVectorization(Inst, VFs[j])) { 6409 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6410 if (RegUsage.find(ClassID) == RegUsage.end()) 6411 RegUsage[ClassID] = 1; 6412 else 6413 RegUsage[ClassID] += 1; 6414 } else { 6415 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6416 if (RegUsage.find(ClassID) == RegUsage.end()) 6417 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6418 else 6419 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6420 } 6421 } 6422 } 6423 6424 for (auto& pair : RegUsage) { 6425 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6426 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6427 else 6428 MaxUsages[j][pair.first] = pair.second; 6429 } 6430 } 6431 6432 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6433 << OpenIntervals.size() << '\n'); 6434 6435 // Add the current instruction to the list of open intervals. 6436 OpenIntervals.insert(I); 6437 } 6438 6439 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6440 SmallMapVector<unsigned, unsigned, 4> Invariant; 6441 6442 for (auto Inst : LoopInvariants) { 6443 unsigned Usage = 6444 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6445 unsigned ClassID = 6446 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6447 if (Invariant.find(ClassID) == Invariant.end()) 6448 Invariant[ClassID] = Usage; 6449 else 6450 Invariant[ClassID] += Usage; 6451 } 6452 6453 LLVM_DEBUG({ 6454 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6455 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6456 << " item\n"; 6457 for (const auto &pair : MaxUsages[i]) { 6458 dbgs() << "LV(REG): RegisterClass: " 6459 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6460 << " registers\n"; 6461 } 6462 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6463 << " item\n"; 6464 for (const auto &pair : Invariant) { 6465 dbgs() << "LV(REG): RegisterClass: " 6466 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6467 << " registers\n"; 6468 } 6469 }); 6470 6471 RU.LoopInvariantRegs = Invariant; 6472 RU.MaxLocalUsers = MaxUsages[i]; 6473 RUs[i] = RU; 6474 } 6475 6476 return RUs; 6477 } 6478 6479 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6480 // TODO: Cost model for emulated masked load/store is completely 6481 // broken. This hack guides the cost model to use an artificially 6482 // high enough value to practically disable vectorization with such 6483 // operations, except where previously deployed legality hack allowed 6484 // using very low cost values. This is to avoid regressions coming simply 6485 // from moving "masked load/store" check from legality to cost model. 6486 // Masked Load/Gather emulation was previously never allowed. 6487 // Limited number of Masked Store/Scatter emulation was allowed. 6488 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6489 return isa<LoadInst>(I) || 6490 (isa<StoreInst>(I) && 6491 NumPredStores > NumberOfStoresToPredicate); 6492 } 6493 6494 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6495 // If we aren't vectorizing the loop, or if we've already collected the 6496 // instructions to scalarize, there's nothing to do. Collection may already 6497 // have occurred if we have a user-selected VF and are now computing the 6498 // expected cost for interleaving. 6499 if (VF.isScalar() || VF.isZero() || 6500 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6501 return; 6502 6503 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6504 // not profitable to scalarize any instructions, the presence of VF in the 6505 // map will indicate that we've analyzed it already. 6506 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6507 6508 // Find all the instructions that are scalar with predication in the loop and 6509 // determine if it would be better to not if-convert the blocks they are in. 6510 // If so, we also record the instructions to scalarize. 6511 for (BasicBlock *BB : TheLoop->blocks()) { 6512 if (!blockNeedsPredication(BB)) 6513 continue; 6514 for (Instruction &I : *BB) 6515 if (isScalarWithPredication(&I)) { 6516 ScalarCostsTy ScalarCosts; 6517 // Do not apply discount logic if hacked cost is needed 6518 // for emulated masked memrefs. 6519 if (!useEmulatedMaskMemRefHack(&I) && 6520 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6521 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6522 // Remember that BB will remain after vectorization. 
6523 PredicatedBBsAfterVectorization.insert(BB); 6524 } 6525 } 6526 } 6527 6528 int LoopVectorizationCostModel::computePredInstDiscount( 6529 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6530 assert(!isUniformAfterVectorization(PredInst, VF) && 6531 "Instruction marked uniform-after-vectorization will be predicated"); 6532 6533 // Initialize the discount to zero, meaning that the scalar version and the 6534 // vector version cost the same. 6535 InstructionCost Discount = 0; 6536 6537 // Holds instructions to analyze. The instructions we visit are mapped in 6538 // ScalarCosts. Those instructions are the ones that would be scalarized if 6539 // we find that the scalar version costs less. 6540 SmallVector<Instruction *, 8> Worklist; 6541 6542 // Returns true if the given instruction can be scalarized. 6543 auto canBeScalarized = [&](Instruction *I) -> bool { 6544 // We only attempt to scalarize instructions forming a single-use chain 6545 // from the original predicated block that would otherwise be vectorized. 6546 // Although not strictly necessary, we give up on instructions we know will 6547 // already be scalar to avoid traversing chains that are unlikely to be 6548 // beneficial. 6549 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6550 isScalarAfterVectorization(I, VF)) 6551 return false; 6552 6553 // If the instruction is scalar with predication, it will be analyzed 6554 // separately. We ignore it within the context of PredInst. 6555 if (isScalarWithPredication(I)) 6556 return false; 6557 6558 // If any of the instruction's operands are uniform after vectorization, 6559 // the instruction cannot be scalarized. This prevents, for example, a 6560 // masked load from being scalarized. 6561 // 6562 // We assume we will only emit a value for lane zero of an instruction 6563 // marked uniform after vectorization, rather than VF identical values. 6564 // Thus, if we scalarize an instruction that uses a uniform, we would 6565 // create uses of values corresponding to the lanes we aren't emitting code 6566 // for. This behavior can be changed by allowing getScalarValue to clone 6567 // the lane zero values for uniforms rather than asserting. 6568 for (Use &U : I->operands()) 6569 if (auto *J = dyn_cast<Instruction>(U.get())) 6570 if (isUniformAfterVectorization(J, VF)) 6571 return false; 6572 6573 // Otherwise, we can scalarize the instruction. 6574 return true; 6575 }; 6576 6577 // Compute the expected cost discount from scalarizing the entire expression 6578 // feeding the predicated instruction. We currently only consider expressions 6579 // that are single-use instruction chains. 6580 Worklist.push_back(PredInst); 6581 while (!Worklist.empty()) { 6582 Instruction *I = Worklist.pop_back_val(); 6583 6584 // If we've already analyzed the instruction, there's nothing to do. 6585 if (ScalarCosts.find(I) != ScalarCosts.end()) 6586 continue; 6587 6588 // Compute the cost of the vector instruction. Note that this cost already 6589 // includes the scalarization overhead of the predicated instruction. 6590 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6591 6592 // Compute the cost of the scalarized instruction. This cost is the cost of 6593 // the instruction as if it wasn't if-converted and instead remained in the 6594 // predicated block. We will scale this cost by block probability after 6595 // computing the scalarization overhead. 
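    // As a sketch with assumed numbers: for VF = 4, a scalar cost of 1 per
    // lane gives 4, halved to 2 by the block-probability scaling below
    // (assuming the usual 50% chance of executing the predicated block); if
    // the if-converted vector form of the same instruction costs 5, the chain
    // contributes a positive discount of 3 and scalarization is preferred.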
6596 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6597 InstructionCost ScalarCost = 6598 VF.getKnownMinValue() * 6599 getInstructionCost(I, ElementCount::getFixed(1)).first; 6600 6601 // Compute the scalarization overhead of needed insertelement instructions 6602 // and phi nodes. 6603 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6604 ScalarCost += TTI.getScalarizationOverhead( 6605 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6606 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6607 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6608 ScalarCost += 6609 VF.getKnownMinValue() * 6610 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6611 } 6612 6613 // Compute the scalarization overhead of needed extractelement 6614 // instructions. For each of the instruction's operands, if the operand can 6615 // be scalarized, add it to the worklist; otherwise, account for the 6616 // overhead. 6617 for (Use &U : I->operands()) 6618 if (auto *J = dyn_cast<Instruction>(U.get())) { 6619 assert(VectorType::isValidElementType(J->getType()) && 6620 "Instruction has non-scalar type"); 6621 if (canBeScalarized(J)) 6622 Worklist.push_back(J); 6623 else if (needsExtract(J, VF)) { 6624 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6625 ScalarCost += TTI.getScalarizationOverhead( 6626 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6627 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6628 } 6629 } 6630 6631 // Scale the total scalar cost by block probability. 6632 ScalarCost /= getReciprocalPredBlockProb(); 6633 6634 // Compute the discount. A non-negative discount means the vector version 6635 // of the instruction costs more, and scalarizing would be beneficial. 6636 Discount += VectorCost - ScalarCost; 6637 ScalarCosts[I] = ScalarCost; 6638 } 6639 6640 return *Discount.getValue(); 6641 } 6642 6643 LoopVectorizationCostModel::VectorizationCostTy 6644 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6645 VectorizationCostTy Cost; 6646 6647 // For each block. 6648 for (BasicBlock *BB : TheLoop->blocks()) { 6649 VectorizationCostTy BlockCost; 6650 6651 // For each instruction in the old loop. 6652 for (Instruction &I : BB->instructionsWithoutDebug()) { 6653 // Skip ignored values. 6654 if (ValuesToIgnore.count(&I) || 6655 (VF.isVector() && VecValuesToIgnore.count(&I))) 6656 continue; 6657 6658 VectorizationCostTy C = getInstructionCost(&I, VF); 6659 6660 // Check if we should override the cost. 6661 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6662 C.first = InstructionCost(ForceTargetInstructionCost); 6663 6664 BlockCost.first += C.first; 6665 BlockCost.second |= C.second; 6666 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6667 << " for VF " << VF << " For instruction: " << I 6668 << '\n'); 6669 } 6670 6671 // If we are vectorizing a predicated block, it will have been 6672 // if-converted. This means that the block's instructions (aside from 6673 // stores and instructions that may divide by zero) will now be 6674 // unconditionally executed. For the scalar case, we may not always execute 6675 // the predicated block, if it is an if-else block. Thus, scale the block's 6676 // cost by the probability of executing it. blockNeedsPredication from 6677 // Legal is used so as to not include all blocks in tail folded loops. 
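    // E.g. (assumed numbers): a predicated block whose instructions sum to a
    // scalar cost of 10 is accounted as 5 here, reflecting the assumption
    // that such a block executes on roughly half of the iterations.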
6678 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6679 BlockCost.first /= getReciprocalPredBlockProb(); 6680 6681 Cost.first += BlockCost.first; 6682 Cost.second |= BlockCost.second; 6683 } 6684 6685 return Cost; 6686 } 6687 6688 /// Gets Address Access SCEV after verifying that the access pattern 6689 /// is loop invariant except the induction variable dependence. 6690 /// 6691 /// This SCEV can be sent to the Target in order to estimate the address 6692 /// calculation cost. 6693 static const SCEV *getAddressAccessSCEV( 6694 Value *Ptr, 6695 LoopVectorizationLegality *Legal, 6696 PredicatedScalarEvolution &PSE, 6697 const Loop *TheLoop) { 6698 6699 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6700 if (!Gep) 6701 return nullptr; 6702 6703 // We are looking for a gep with all loop invariant indices except for one 6704 // which should be an induction variable. 6705 auto SE = PSE.getSE(); 6706 unsigned NumOperands = Gep->getNumOperands(); 6707 for (unsigned i = 1; i < NumOperands; ++i) { 6708 Value *Opd = Gep->getOperand(i); 6709 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6710 !Legal->isInductionVariable(Opd)) 6711 return nullptr; 6712 } 6713 6714 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6715 return PSE.getSCEV(Ptr); 6716 } 6717 6718 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6719 return Legal->hasStride(I->getOperand(0)) || 6720 Legal->hasStride(I->getOperand(1)); 6721 } 6722 6723 InstructionCost 6724 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6725 ElementCount VF) { 6726 assert(VF.isVector() && 6727 "Scalarization cost of instruction implies vectorization."); 6728 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6729 Type *ValTy = getMemInstValueType(I); 6730 auto SE = PSE.getSE(); 6731 6732 unsigned AS = getLoadStoreAddressSpace(I); 6733 Value *Ptr = getLoadStorePointerOperand(I); 6734 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6735 6736 // Figure out whether the access is strided and get the stride value 6737 // if it's known in compile time 6738 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6739 6740 // Get the cost of the scalar memory instruction and address computation. 6741 InstructionCost Cost = 6742 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6743 6744 // Don't pass *I here, since it is scalar but will actually be part of a 6745 // vectorized loop where the user of it is a vectorized instruction. 6746 const Align Alignment = getLoadStoreAlignment(I); 6747 Cost += VF.getKnownMinValue() * 6748 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6749 AS, TTI::TCK_RecipThroughput); 6750 6751 // Get the overhead of the extractelement and insertelement instructions 6752 // we might create due to scalarization. 6753 Cost += getScalarizationOverhead(I, VF); 6754 6755 // If we have a predicated load/store, it will need extra i1 extracts and 6756 // conditional branches, but may not be executed for each vector lane. Scale 6757 // the cost by the probability of executing the predicated block. 
6758 if (isPredicatedInst(I)) { 6759 Cost /= getReciprocalPredBlockProb(); 6760 6761 // Add the cost of an i1 extract and a branch 6762 auto *Vec_i1Ty = 6763 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6764 Cost += TTI.getScalarizationOverhead( 6765 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6766 /*Insert=*/false, /*Extract=*/true); 6767 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6768 6769 if (useEmulatedMaskMemRefHack(I)) 6770 // Artificially setting to a high enough value to practically disable 6771 // vectorization with such operations. 6772 Cost = 3000000; 6773 } 6774 6775 return Cost; 6776 } 6777 6778 InstructionCost 6779 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6780 ElementCount VF) { 6781 Type *ValTy = getMemInstValueType(I); 6782 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6783 Value *Ptr = getLoadStorePointerOperand(I); 6784 unsigned AS = getLoadStoreAddressSpace(I); 6785 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6786 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6787 6788 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6789 "Stride should be 1 or -1 for consecutive memory access"); 6790 const Align Alignment = getLoadStoreAlignment(I); 6791 InstructionCost Cost = 0; 6792 if (Legal->isMaskRequired(I)) 6793 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6794 CostKind); 6795 else 6796 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6797 CostKind, I); 6798 6799 bool Reverse = ConsecutiveStride < 0; 6800 if (Reverse) 6801 Cost += 6802 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6803 return Cost; 6804 } 6805 6806 InstructionCost 6807 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6808 ElementCount VF) { 6809 assert(Legal->isUniformMemOp(*I)); 6810 6811 Type *ValTy = getMemInstValueType(I); 6812 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6813 const Align Alignment = getLoadStoreAlignment(I); 6814 unsigned AS = getLoadStoreAddressSpace(I); 6815 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6816 if (isa<LoadInst>(I)) { 6817 return TTI.getAddressComputationCost(ValTy) + 6818 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6819 CostKind) + 6820 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6821 } 6822 StoreInst *SI = cast<StoreInst>(I); 6823 6824 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6825 return TTI.getAddressComputationCost(ValTy) + 6826 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6827 CostKind) + 6828 (isLoopInvariantStoreValue 6829 ? 
0
6830 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6831 VF.getKnownMinValue() - 1));
6832 }
6833
6834 InstructionCost
6835 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6836 ElementCount VF) {
6837 Type *ValTy = getMemInstValueType(I);
6838 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6839 const Align Alignment = getLoadStoreAlignment(I);
6840 const Value *Ptr = getLoadStorePointerOperand(I);
6841
6842 return TTI.getAddressComputationCost(VectorTy) +
6843 TTI.getGatherScatterOpCost(
6844 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6845 TargetTransformInfo::TCK_RecipThroughput, I);
6846 }
6847
6848 InstructionCost
6849 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6850 ElementCount VF) {
6851 // TODO: Once we have support for interleaving with scalable vectors
6852 // we can calculate the cost properly here.
6853 if (VF.isScalable())
6854 return InstructionCost::getInvalid();
6855
6856 Type *ValTy = getMemInstValueType(I);
6857 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6858 unsigned AS = getLoadStoreAddressSpace(I);
6859
6860 auto Group = getInterleavedAccessGroup(I);
6861 assert(Group && "Fail to get an interleaved access group.");
6862
6863 unsigned InterleaveFactor = Group->getFactor();
6864 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6865
6866 // Holds the indices of existing members in an interleaved load group.
6867 // An interleaved store group doesn't need this as it doesn't allow gaps.
6868 SmallVector<unsigned, 4> Indices;
6869 if (isa<LoadInst>(I)) {
6870 for (unsigned i = 0; i < InterleaveFactor; i++)
6871 if (Group->getMember(i))
6872 Indices.push_back(i);
6873 }
6874
6875 // Calculate the cost of the whole interleaved group.
6876 bool UseMaskForGaps =
6877 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6878 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6879 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6880 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6881
6882 if (Group->isReverse()) {
6883 // TODO: Add support for reversed masked interleaved access.
6884 assert(!Legal->isMaskRequired(I) &&
6885 "Reverse masked interleaved access not supported.");
6886 Cost +=
6887 Group->getNumMembers() *
6888 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6889 }
6890 return Cost;
6891 }
6892
6893 InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
6894 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6895 // Early exit for no inloop reductions.
6896 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6897 return InstructionCost::getInvalid();
6898 auto *VectorTy = cast<VectorType>(Ty);
6899
6900 // We are looking for one of the following patterns and its minimal acceptable cost:
6901 // reduce(mul(ext(A), ext(B))) or
6902 // reduce(mul(A, B)) or
6903 // reduce(ext(A)) or
6904 // reduce(A).
6905 // The basic idea is that we walk down the tree to do that, finding the root
6906 // reduction instruction in InLoopReductionImmediateChains. From there we find
6907 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6908 // of the components. If the reduction cost is lower, then we return it for the
6909 // reduction instruction and 0 for the other instructions in the pattern. If
6910 // it is not, we return an invalid cost specifying the original cost method
6911 // should be used.
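// As an illustration, a hypothetical scalar loop body this walk is meant to
// match (names and types are examples only, not taken from this file):
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul   = mul nsw i32 %a.ext, %b.ext
//   %sum   = add i32 %phi, %mul   ; the in-loop reduction add
// When costing %sum we may return the target's extended multiply-add
// reduction cost; costing %a.ext, %b.ext or %mul then returns 0, since their
// cost is folded into the reduction.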
6912 Instruction *RetI = I; 6913 if ((RetI->getOpcode() == Instruction::SExt || 6914 RetI->getOpcode() == Instruction::ZExt)) { 6915 if (!RetI->hasOneUser()) 6916 return InstructionCost::getInvalid(); 6917 RetI = RetI->user_back(); 6918 } 6919 if (RetI->getOpcode() == Instruction::Mul && 6920 RetI->user_back()->getOpcode() == Instruction::Add) { 6921 if (!RetI->hasOneUser()) 6922 return InstructionCost::getInvalid(); 6923 RetI = RetI->user_back(); 6924 } 6925 6926 // Test if the found instruction is a reduction, and if not return an invalid 6927 // cost specifying the parent to use the original cost modelling. 6928 if (!InLoopReductionImmediateChains.count(RetI)) 6929 return InstructionCost::getInvalid(); 6930 6931 // Find the reduction this chain is a part of and calculate the basic cost of 6932 // the reduction on its own. 6933 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6934 Instruction *ReductionPhi = LastChain; 6935 while (!isa<PHINode>(ReductionPhi)) 6936 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6937 6938 RecurrenceDescriptor RdxDesc = 6939 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6940 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6941 VectorTy, false, CostKind); 6942 6943 // Get the operand that was not the reduction chain and match it to one of the 6944 // patterns, returning the better cost if it is found. 6945 Instruction *RedOp = RetI->getOperand(1) == LastChain 6946 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6947 : dyn_cast<Instruction>(RetI->getOperand(1)); 6948 6949 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6950 6951 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6952 !TheLoop->isLoopInvariant(RedOp)) { 6953 bool IsUnsigned = isa<ZExtInst>(RedOp); 6954 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6955 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6956 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6957 CostKind); 6958 6959 unsigned ExtCost = 6960 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6961 TTI::CastContextHint::None, CostKind, RedOp); 6962 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6963 return I == RetI ? *RedCost.getValue() : 0; 6964 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 6965 Instruction *Mul = RedOp; 6966 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 6967 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 6968 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 6969 Op0->getOpcode() == Op1->getOpcode() && 6970 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6971 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6972 bool IsUnsigned = isa<ZExtInst>(Op0); 6973 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6974 // reduce(mul(ext, ext)) 6975 unsigned ExtCost = 6976 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 6977 TTI::CastContextHint::None, CostKind, Op0); 6978 InstructionCost MulCost = 6979 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6980 6981 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6982 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6983 CostKind); 6984 6985 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 6986 return I == RetI ? 
*RedCost.getValue() : 0; 6987 } else { 6988 InstructionCost MulCost = 6989 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6990 6991 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6992 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6993 CostKind); 6994 6995 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6996 return I == RetI ? *RedCost.getValue() : 0; 6997 } 6998 } 6999 7000 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7001 } 7002 7003 InstructionCost 7004 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7005 ElementCount VF) { 7006 // Calculate scalar cost only. Vectorization cost should be ready at this 7007 // moment. 7008 if (VF.isScalar()) { 7009 Type *ValTy = getMemInstValueType(I); 7010 const Align Alignment = getLoadStoreAlignment(I); 7011 unsigned AS = getLoadStoreAddressSpace(I); 7012 7013 return TTI.getAddressComputationCost(ValTy) + 7014 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7015 TTI::TCK_RecipThroughput, I); 7016 } 7017 return getWideningCost(I, VF); 7018 } 7019 7020 LoopVectorizationCostModel::VectorizationCostTy 7021 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7022 ElementCount VF) { 7023 // If we know that this instruction will remain uniform, check the cost of 7024 // the scalar version. 7025 if (isUniformAfterVectorization(I, VF)) 7026 VF = ElementCount::getFixed(1); 7027 7028 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7029 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7030 7031 // Forced scalars do not have any scalarization overhead. 7032 auto ForcedScalar = ForcedScalars.find(VF); 7033 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7034 auto InstSet = ForcedScalar->second; 7035 if (InstSet.count(I)) 7036 return VectorizationCostTy( 7037 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7038 VF.getKnownMinValue()), 7039 false); 7040 } 7041 7042 Type *VectorTy; 7043 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7044 7045 bool TypeNotScalarized = 7046 VF.isVector() && VectorTy->isVectorTy() && 7047 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7048 return VectorizationCostTy(C, TypeNotScalarized); 7049 } 7050 7051 InstructionCost 7052 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7053 ElementCount VF) { 7054 7055 if (VF.isScalable()) 7056 return InstructionCost::getInvalid(); 7057 7058 if (VF.isScalar()) 7059 return 0; 7060 7061 InstructionCost Cost = 0; 7062 Type *RetTy = ToVectorTy(I->getType(), VF); 7063 if (!RetTy->isVoidTy() && 7064 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7065 Cost += TTI.getScalarizationOverhead( 7066 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7067 true, false); 7068 7069 // Some targets keep addresses scalar. 7070 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7071 return Cost; 7072 7073 // Some targets support efficient element stores. 7074 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7075 return Cost; 7076 7077 // Collect operands to consider. 7078 CallInst *CI = dyn_cast<CallInst>(I); 7079 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7080 7081 // Skip operands that do not require extraction/scalarization and do not incur 7082 // any overhead. 
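// For example (hypothetical): a scalarized operation with a <4 x i32> operand
// needs one extractelement per lane for that operand, whereas operands that
// stay scalar are filtered out below and add no extraction cost.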
7083 SmallVector<Type *> Tys; 7084 for (auto *V : filterExtractingOperands(Ops, VF)) 7085 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7086 return Cost + TTI.getOperandsScalarizationOverhead( 7087 filterExtractingOperands(Ops, VF), Tys); 7088 } 7089 7090 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7091 if (VF.isScalar()) 7092 return; 7093 NumPredStores = 0; 7094 for (BasicBlock *BB : TheLoop->blocks()) { 7095 // For each instruction in the old loop. 7096 for (Instruction &I : *BB) { 7097 Value *Ptr = getLoadStorePointerOperand(&I); 7098 if (!Ptr) 7099 continue; 7100 7101 // TODO: We should generate better code and update the cost model for 7102 // predicated uniform stores. Today they are treated as any other 7103 // predicated store (see added test cases in 7104 // invariant-store-vectorization.ll). 7105 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7106 NumPredStores++; 7107 7108 if (Legal->isUniformMemOp(I)) { 7109 // TODO: Avoid replicating loads and stores instead of 7110 // relying on instcombine to remove them. 7111 // Load: Scalar load + broadcast 7112 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7113 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7114 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7115 continue; 7116 } 7117 7118 // We assume that widening is the best solution when possible. 7119 if (memoryInstructionCanBeWidened(&I, VF)) { 7120 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7121 int ConsecutiveStride = 7122 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7123 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7124 "Expected consecutive stride."); 7125 InstWidening Decision = 7126 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7127 setWideningDecision(&I, VF, Decision, Cost); 7128 continue; 7129 } 7130 7131 // Choose between Interleaving, Gather/Scatter or Scalarization. 7132 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7133 unsigned NumAccesses = 1; 7134 if (isAccessInterleaved(&I)) { 7135 auto Group = getInterleavedAccessGroup(&I); 7136 assert(Group && "Fail to get an interleaved access group."); 7137 7138 // Make one decision for the whole group. 7139 if (getWideningDecision(&I, VF) != CM_Unknown) 7140 continue; 7141 7142 NumAccesses = Group->getNumMembers(); 7143 if (interleavedAccessCanBeWidened(&I, VF)) 7144 InterleaveCost = getInterleaveGroupCost(&I, VF); 7145 } 7146 7147 InstructionCost GatherScatterCost = 7148 isLegalGatherOrScatter(&I) 7149 ? getGatherScatterCost(&I, VF) * NumAccesses 7150 : InstructionCost::getInvalid(); 7151 7152 InstructionCost ScalarizationCost = 7153 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses 7154 : InstructionCost::getInvalid(); 7155 7156 // Choose better solution for the current VF, 7157 // write down this decision and use it during vectorization. 7158 InstructionCost Cost; 7159 InstWidening Decision; 7160 if (InterleaveCost <= GatherScatterCost && 7161 InterleaveCost < ScalarizationCost) { 7162 Decision = CM_Interleave; 7163 Cost = InterleaveCost; 7164 } else if (GatherScatterCost < ScalarizationCost) { 7165 Decision = CM_GatherScatter; 7166 Cost = GatherScatterCost; 7167 } else { 7168 assert(!VF.isScalable() && 7169 "We cannot yet scalarise for scalable vectors"); 7170 Decision = CM_Scalarize; 7171 Cost = ScalarizationCost; 7172 } 7173 // If the instructions belongs to an interleave group, the whole group 7174 // receives the same decision. 
The whole group receives the cost, but 7175 // the cost will actually be assigned to one instruction. 7176 if (auto Group = getInterleavedAccessGroup(&I)) 7177 setWideningDecision(Group, VF, Decision, Cost); 7178 else 7179 setWideningDecision(&I, VF, Decision, Cost); 7180 } 7181 } 7182 7183 // Make sure that any load of address and any other address computation 7184 // remains scalar unless there is gather/scatter support. This avoids 7185 // inevitable extracts into address registers, and also has the benefit of 7186 // activating LSR more, since that pass can't optimize vectorized 7187 // addresses. 7188 if (TTI.prefersVectorizedAddressing()) 7189 return; 7190 7191 // Start with all scalar pointer uses. 7192 SmallPtrSet<Instruction *, 8> AddrDefs; 7193 for (BasicBlock *BB : TheLoop->blocks()) 7194 for (Instruction &I : *BB) { 7195 Instruction *PtrDef = 7196 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7197 if (PtrDef && TheLoop->contains(PtrDef) && 7198 getWideningDecision(&I, VF) != CM_GatherScatter) 7199 AddrDefs.insert(PtrDef); 7200 } 7201 7202 // Add all instructions used to generate the addresses. 7203 SmallVector<Instruction *, 4> Worklist; 7204 append_range(Worklist, AddrDefs); 7205 while (!Worklist.empty()) { 7206 Instruction *I = Worklist.pop_back_val(); 7207 for (auto &Op : I->operands()) 7208 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7209 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7210 AddrDefs.insert(InstOp).second) 7211 Worklist.push_back(InstOp); 7212 } 7213 7214 for (auto *I : AddrDefs) { 7215 if (isa<LoadInst>(I)) { 7216 // Setting the desired widening decision should ideally be handled in 7217 // by cost functions, but since this involves the task of finding out 7218 // if the loaded register is involved in an address computation, it is 7219 // instead changed here when we know this is the case. 7220 InstWidening Decision = getWideningDecision(I, VF); 7221 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7222 // Scalarize a widened load of address. 7223 setWideningDecision( 7224 I, VF, CM_Scalarize, 7225 (VF.getKnownMinValue() * 7226 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7227 else if (auto Group = getInterleavedAccessGroup(I)) { 7228 // Scalarize an interleave group of address loads. 7229 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7230 if (Instruction *Member = Group->getMember(I)) 7231 setWideningDecision( 7232 Member, VF, CM_Scalarize, 7233 (VF.getKnownMinValue() * 7234 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7235 } 7236 } 7237 } else 7238 // Make sure I gets scalarized and a cost estimate without 7239 // scalarization overhead. 7240 ForcedScalars[VF].insert(I); 7241 } 7242 } 7243 7244 InstructionCost 7245 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7246 Type *&VectorTy) { 7247 Type *RetTy = I->getType(); 7248 if (canTruncateToMinimalBitwidth(I, VF)) 7249 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7250 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7251 auto SE = PSE.getSE(); 7252 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7253 7254 // TODO: We need to estimate the cost of intrinsic calls. 7255 switch (I->getOpcode()) { 7256 case Instruction::GetElementPtr: 7257 // We mark this instruction as zero-cost because the cost of GEPs in 7258 // vectorized code depends on whether the corresponding memory instruction 7259 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7260 // instruction cost. 7261 return 0; 7262 case Instruction::Br: { 7263 // In cases of scalarized and predicated instructions, there will be VF 7264 // predicated blocks in the vectorized loop. Each branch around these 7265 // blocks requires also an extract of its vector compare i1 element. 7266 bool ScalarPredicatedBB = false; 7267 BranchInst *BI = cast<BranchInst>(I); 7268 if (VF.isVector() && BI->isConditional() && 7269 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7270 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7271 ScalarPredicatedBB = true; 7272 7273 if (ScalarPredicatedBB) { 7274 // Return cost for branches around scalarized and predicated blocks. 7275 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7276 auto *Vec_i1Ty = 7277 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7278 return (TTI.getScalarizationOverhead( 7279 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7280 false, true) + 7281 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7282 VF.getKnownMinValue())); 7283 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7284 // The back-edge branch will remain, as will all scalar branches. 7285 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7286 else 7287 // This branch will be eliminated by if-conversion. 7288 return 0; 7289 // Note: We currently assume zero cost for an unconditional branch inside 7290 // a predicated block since it will become a fall-through, although we 7291 // may decide in the future to call TTI for all branches. 7292 } 7293 case Instruction::PHI: { 7294 auto *Phi = cast<PHINode>(I); 7295 7296 // First-order recurrences are replaced by vector shuffles inside the loop. 7297 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7298 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7299 return TTI.getShuffleCost( 7300 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7301 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7302 7303 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7304 // converted into select instructions. We require N - 1 selects per phi 7305 // node, where N is the number of incoming values. 7306 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7307 return (Phi->getNumIncomingValues() - 1) * 7308 TTI.getCmpSelInstrCost( 7309 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7310 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7311 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7312 7313 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7314 } 7315 case Instruction::UDiv: 7316 case Instruction::SDiv: 7317 case Instruction::URem: 7318 case Instruction::SRem: 7319 // If we have a predicated instruction, it may not be executed for each 7320 // vector lane. Get the scalarization cost and scale this amount by the 7321 // probability of executing the predicated block. If the instruction is not 7322 // predicated, we fall through to the next case. 7323 if (VF.isVector() && isScalarWithPredication(I)) { 7324 InstructionCost Cost = 0; 7325 7326 // These instructions have a non-void type, so account for the phi nodes 7327 // that we will create. This cost is likely to be zero. The phi node 7328 // cost, if any, should be scaled by the block probability because it 7329 // models a copy at the end of each predicated block. 
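// For illustration only (hypothetical costs): with VF = 4, a phi cost of 0, a
// scalar sdiv cost of 20 and a scalarization overhead of 8, the estimate
// below is (4 * 0 + 4 * 20 + 8) / 2 = 44, assuming the predicated block
// executes half the time.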
7330 Cost += VF.getKnownMinValue() * 7331 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7332 7333 // The cost of the non-predicated instruction. 7334 Cost += VF.getKnownMinValue() * 7335 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7336 7337 // The cost of insertelement and extractelement instructions needed for 7338 // scalarization. 7339 Cost += getScalarizationOverhead(I, VF); 7340 7341 // Scale the cost by the probability of executing the predicated blocks. 7342 // This assumes the predicated block for each vector lane is equally 7343 // likely. 7344 return Cost / getReciprocalPredBlockProb(); 7345 } 7346 LLVM_FALLTHROUGH; 7347 case Instruction::Add: 7348 case Instruction::FAdd: 7349 case Instruction::Sub: 7350 case Instruction::FSub: 7351 case Instruction::Mul: 7352 case Instruction::FMul: 7353 case Instruction::FDiv: 7354 case Instruction::FRem: 7355 case Instruction::Shl: 7356 case Instruction::LShr: 7357 case Instruction::AShr: 7358 case Instruction::And: 7359 case Instruction::Or: 7360 case Instruction::Xor: { 7361 // Since we will replace the stride by 1 the multiplication should go away. 7362 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7363 return 0; 7364 7365 // Detect reduction patterns 7366 InstructionCost RedCost; 7367 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7368 .isValid()) 7369 return RedCost; 7370 7371 // Certain instructions can be cheaper to vectorize if they have a constant 7372 // second vector operand. One example of this are shifts on x86. 7373 Value *Op2 = I->getOperand(1); 7374 TargetTransformInfo::OperandValueProperties Op2VP; 7375 TargetTransformInfo::OperandValueKind Op2VK = 7376 TTI.getOperandInfo(Op2, Op2VP); 7377 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7378 Op2VK = TargetTransformInfo::OK_UniformValue; 7379 7380 SmallVector<const Value *, 4> Operands(I->operand_values()); 7381 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7382 return N * TTI.getArithmeticInstrCost( 7383 I->getOpcode(), VectorTy, CostKind, 7384 TargetTransformInfo::OK_AnyValue, 7385 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7386 } 7387 case Instruction::FNeg: { 7388 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7389 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7390 return N * TTI.getArithmeticInstrCost( 7391 I->getOpcode(), VectorTy, CostKind, 7392 TargetTransformInfo::OK_AnyValue, 7393 TargetTransformInfo::OK_AnyValue, 7394 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7395 I->getOperand(0), I); 7396 } 7397 case Instruction::Select: { 7398 SelectInst *SI = cast<SelectInst>(I); 7399 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7400 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7401 Type *CondTy = SI->getCondition()->getType(); 7402 if (!ScalarCond) 7403 CondTy = VectorType::get(CondTy, VF); 7404 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7405 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7406 } 7407 case Instruction::ICmp: 7408 case Instruction::FCmp: { 7409 Type *ValTy = I->getOperand(0)->getType(); 7410 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7411 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7412 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7413 VectorTy = ToVectorTy(ValTy, VF); 7414 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7415 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7416 } 7417 case Instruction::Store: 7418 case Instruction::Load: { 7419 ElementCount Width = VF; 7420 if (Width.isVector()) { 7421 InstWidening Decision = getWideningDecision(I, Width); 7422 assert(Decision != CM_Unknown && 7423 "CM decision should be taken at this point"); 7424 if (Decision == CM_Scalarize) 7425 Width = ElementCount::getFixed(1); 7426 } 7427 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7428 return getMemoryInstructionCost(I, VF); 7429 } 7430 case Instruction::ZExt: 7431 case Instruction::SExt: 7432 case Instruction::FPToUI: 7433 case Instruction::FPToSI: 7434 case Instruction::FPExt: 7435 case Instruction::PtrToInt: 7436 case Instruction::IntToPtr: 7437 case Instruction::SIToFP: 7438 case Instruction::UIToFP: 7439 case Instruction::Trunc: 7440 case Instruction::FPTrunc: 7441 case Instruction::BitCast: { 7442 // Computes the CastContextHint from a Load/Store instruction. 7443 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7444 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7445 "Expected a load or a store!"); 7446 7447 if (VF.isScalar() || !TheLoop->contains(I)) 7448 return TTI::CastContextHint::Normal; 7449 7450 switch (getWideningDecision(I, VF)) { 7451 case LoopVectorizationCostModel::CM_GatherScatter: 7452 return TTI::CastContextHint::GatherScatter; 7453 case LoopVectorizationCostModel::CM_Interleave: 7454 return TTI::CastContextHint::Interleave; 7455 case LoopVectorizationCostModel::CM_Scalarize: 7456 case LoopVectorizationCostModel::CM_Widen: 7457 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7458 : TTI::CastContextHint::Normal; 7459 case LoopVectorizationCostModel::CM_Widen_Reverse: 7460 return TTI::CastContextHint::Reversed; 7461 case LoopVectorizationCostModel::CM_Unknown: 7462 llvm_unreachable("Instr did not go through cost modelling?"); 7463 } 7464 7465 llvm_unreachable("Unhandled case!"); 7466 }; 7467 7468 unsigned Opcode = I->getOpcode(); 7469 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7470 // For Trunc, the context is the only user, which must be a StoreInst. 
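// For example (hypothetical IR): for
//   %t = trunc i32 %x to i16
//   store i16 %t, i16* %p
// the hint is derived from the widening decision of the store, whereas for
//   %l = load i8, i8* %q
//   %e = zext i8 %l to i32
// it is derived from the widening decision of the load.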
7471 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7472 if (I->hasOneUse()) 7473 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7474 CCH = ComputeCCH(Store); 7475 } 7476 // For Z/Sext, the context is the operand, which must be a LoadInst. 7477 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7478 Opcode == Instruction::FPExt) { 7479 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7480 CCH = ComputeCCH(Load); 7481 } 7482 7483 // We optimize the truncation of induction variables having constant 7484 // integer steps. The cost of these truncations is the same as the scalar 7485 // operation. 7486 if (isOptimizableIVTruncate(I, VF)) { 7487 auto *Trunc = cast<TruncInst>(I); 7488 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7489 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7490 } 7491 7492 // Detect reduction patterns 7493 InstructionCost RedCost; 7494 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7495 .isValid()) 7496 return RedCost; 7497 7498 Type *SrcScalarTy = I->getOperand(0)->getType(); 7499 Type *SrcVecTy = 7500 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7501 if (canTruncateToMinimalBitwidth(I, VF)) { 7502 // This cast is going to be shrunk. This may remove the cast or it might 7503 // turn it into slightly different cast. For example, if MinBW == 16, 7504 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7505 // 7506 // Calculate the modified src and dest types. 7507 Type *MinVecTy = VectorTy; 7508 if (Opcode == Instruction::Trunc) { 7509 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7510 VectorTy = 7511 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7512 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7513 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7514 VectorTy = 7515 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7516 } 7517 } 7518 7519 unsigned N; 7520 if (isScalarAfterVectorization(I, VF)) { 7521 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7522 N = VF.getKnownMinValue(); 7523 } else 7524 N = 1; 7525 return N * 7526 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7527 } 7528 case Instruction::Call: { 7529 bool NeedToScalarize; 7530 CallInst *CI = cast<CallInst>(I); 7531 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7532 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7533 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7534 return std::min(CallCost, IntrinsicCost); 7535 } 7536 return CallCost; 7537 } 7538 case Instruction::ExtractValue: 7539 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7540 default: 7541 // The cost of executing VF copies of the scalar instruction. This opcode 7542 // is unknown. Assume that it is the same as 'mul'. 7543 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7544 Instruction::Mul, VectorTy, CostKind) + 7545 getScalarizationOverhead(I, VF); 7546 } // end of switch. 
7547 } 7548 7549 char LoopVectorize::ID = 0; 7550 7551 static const char lv_name[] = "Loop Vectorization"; 7552 7553 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7554 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7555 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7556 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7557 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7558 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7559 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7560 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7561 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7562 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7563 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7564 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7565 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7566 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7567 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7568 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7569 7570 namespace llvm { 7571 7572 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7573 7574 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7575 bool VectorizeOnlyWhenForced) { 7576 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7577 } 7578 7579 } // end namespace llvm 7580 7581 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7582 // Check if the pointer operand of a load or store instruction is 7583 // consecutive. 7584 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7585 return Legal->isConsecutivePtr(Ptr); 7586 return false; 7587 } 7588 7589 void LoopVectorizationCostModel::collectValuesToIgnore() { 7590 // Ignore ephemeral values. 7591 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7592 7593 // Ignore type-promoting instructions we identified during reduction 7594 // detection. 7595 for (auto &Reduction : Legal->getReductionVars()) { 7596 RecurrenceDescriptor &RedDes = Reduction.second; 7597 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7598 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7599 } 7600 // Ignore type-casting instructions we identified during induction 7601 // detection. 7602 for (auto &Induction : Legal->getInductionVars()) { 7603 InductionDescriptor &IndDes = Induction.second; 7604 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7605 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7606 } 7607 } 7608 7609 void LoopVectorizationCostModel::collectInLoopReductions() { 7610 for (auto &Reduction : Legal->getReductionVars()) { 7611 PHINode *Phi = Reduction.first; 7612 RecurrenceDescriptor &RdxDesc = Reduction.second; 7613 7614 // We don't collect reductions that are type promoted (yet). 7615 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7616 continue; 7617 7618 // If the target would prefer this reduction to happen "in-loop", then we 7619 // want to record it as such. 7620 unsigned Opcode = RdxDesc.getOpcode(); 7621 if (!PreferInLoopReductions && 7622 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7623 TargetTransformInfo::ReductionFlags())) 7624 continue; 7625 7626 // Check that we can correctly put the reductions into the loop, by 7627 // finding the chain of operations that leads from the phi to the loop 7628 // exit value. 
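// For example (a hypothetical integer add reduction):
//   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
//   %val = load i32, i32* %ptr
//   %sum.next = add i32 %sum, %val
// the chain from the phi to the exit value is { %sum.next }, so this
// reduction can be performed in-loop.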
7629 SmallVector<Instruction *, 4> ReductionOperations =
7630 RdxDesc.getReductionOpChain(Phi, TheLoop);
7631 bool InLoop = !ReductionOperations.empty();
7632 if (InLoop) {
7633 InLoopReductionChains[Phi] = ReductionOperations;
7634 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7635 Instruction *LastChain = Phi;
7636 for (auto *I : ReductionOperations) {
7637 InLoopReductionImmediateChains[I] = LastChain;
7638 LastChain = I;
7639 }
7640 }
7641 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7642 << " reduction for phi: " << *Phi << "\n");
7643 }
7644 }
7645
7646 // TODO: we could return a pair of values that specify the max VF and
7647 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7648 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7649 // doesn't have a cost model that can choose which plan to execute if
7650 // more than one is generated.
7651 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7652 LoopVectorizationCostModel &CM) {
7653 unsigned WidestType;
7654 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7655 return WidestVectorRegBits / WidestType;
7656 }
7657
7658 VectorizationFactor
7659 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7660 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7661 ElementCount VF = UserVF;
7662 // Outer loop handling: outer loops may require CFG and instruction level
7663 // transformations before even evaluating whether vectorization is profitable.
7664 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7665 // the vectorization pipeline.
7666 if (!OrigLoop->isInnermost()) {
7667 // If the user doesn't provide a vectorization factor, determine a
7668 // reasonable one.
7669 if (UserVF.isZero()) {
7670 VF = ElementCount::getFixed(
7671 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7672 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7673
7674 // Make sure we have a VF > 1 for stress testing.
7675 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7676 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7677 << "overriding computed VF.\n");
7678 VF = ElementCount::getFixed(4);
7679 }
7680 }
7681 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7682 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7683 "VF needs to be a power of two");
7684 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7685 << "VF " << VF << " to build VPlans.\n");
7686 buildVPlans(VF, VF);
7687
7688 // For VPlan build stress testing, we bail out after VPlan construction.
7689 if (VPlanBuildStressTest)
7690 return VectorizationFactor::Disabled();
7691
7692 return {VF, 0 /*Cost*/};
7693 }
7694
7695 LLVM_DEBUG(
7696 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7697 "VPlan-native path.\n");
7698 return VectorizationFactor::Disabled();
7699 }
7700
7701 Optional<VectorizationFactor>
7702 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7703 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7704 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7705 if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
7706 return None;
7707
7708 // Invalidate interleave groups if all blocks of the loop will be predicated.
7709 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7710 !useMaskedInterleavedAccesses(*TTI)) { 7711 LLVM_DEBUG( 7712 dbgs() 7713 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7714 "which requires masked-interleaved support.\n"); 7715 if (CM.InterleaveInfo.invalidateGroups()) 7716 // Invalidating interleave groups also requires invalidating all decisions 7717 // based on them, which includes widening decisions and uniform and scalar 7718 // values. 7719 CM.invalidateCostModelingDecisions(); 7720 } 7721 7722 ElementCount MaxVF = MaybeMaxVF.getValue(); 7723 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7724 7725 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7726 if (!UserVF.isZero() && 7727 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7728 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7729 // VFs here, this should be reverted to only use legal UserVFs once the 7730 // loop below supports scalable VFs. 7731 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7732 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7733 << " VF " << VF << ".\n"); 7734 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7735 "VF needs to be a power of two"); 7736 // Collect the instructions (and their associated costs) that will be more 7737 // profitable to scalarize. 7738 CM.selectUserVectorizationFactor(VF); 7739 CM.collectInLoopReductions(); 7740 buildVPlansWithVPRecipes(VF, VF); 7741 LLVM_DEBUG(printPlans(dbgs())); 7742 return {{VF, 0}}; 7743 } 7744 7745 assert(!MaxVF.isScalable() && 7746 "Scalable vectors not yet supported beyond this point"); 7747 7748 for (ElementCount VF = ElementCount::getFixed(1); 7749 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7750 // Collect Uniform and Scalar instructions after vectorization with VF. 7751 CM.collectUniformsAndScalars(VF); 7752 7753 // Collect the instructions (and their associated costs) that will be more 7754 // profitable to scalarize. 7755 if (VF.isVector()) 7756 CM.collectInstsToScalarize(VF); 7757 } 7758 7759 CM.collectInLoopReductions(); 7760 7761 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7762 LLVM_DEBUG(printPlans(dbgs())); 7763 if (MaxVF.isScalar()) 7764 return VectorizationFactor::Disabled(); 7765 7766 // Select the optimal vectorization factor. 7767 return CM.selectVectorizationFactor(MaxVF); 7768 } 7769 7770 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7771 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7772 << '\n'); 7773 BestVF = VF; 7774 BestUF = UF; 7775 7776 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7777 return !Plan->hasVF(VF); 7778 }); 7779 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7780 } 7781 7782 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7783 DominatorTree *DT) { 7784 // Perform the actual loop transformation. 7785 7786 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
7787 assert(BestVF.hasValue() && "Vectorization Factor is missing");
7788 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7789
7790 VPTransformState State{
7791 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
7792 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7793 State.TripCount = ILV.getOrCreateTripCount(nullptr);
7794 State.CanonicalIV = ILV.Induction;
7795
7796 ILV.printDebugTracesAtStart();
7797
7798 //===------------------------------------------------===//
7799 //
7800 // Notice: any optimizations or new instructions that go
7801 // into the code below should also be implemented in
7802 // the cost-model.
7803 //
7804 //===------------------------------------------------===//
7805
7806 // 2. Copy and widen instructions from the old loop into the new loop.
7807 VPlans.front()->execute(&State);
7808
7809 // 3. Fix the vectorized code: take care of header phis, live-outs,
7810 // predication, updating analyses.
7811 ILV.fixVectorizedLoop(State);
7812
7813 ILV.printDebugTracesAtEnd();
7814 }
7815
7816 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7817 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7818 for (const auto &Plan : VPlans)
7819 if (PrintVPlansInDotFormat)
7820 Plan->printDOT(O);
7821 else
7822 Plan->print(O);
7823 }
7824 #endif
7825
7826 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7827 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7828
7829 // We create new control-flow for the vectorized loop, so the original exit
7830 // conditions will be dead after vectorization if they are only used by the
7831 // terminator.
7832 SmallVector<BasicBlock*> ExitingBlocks;
7833 OrigLoop->getExitingBlocks(ExitingBlocks);
7834 for (auto *BB : ExitingBlocks) {
7835 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7836 if (!Cmp || !Cmp->hasOneUse())
7837 continue;
7838
7839 // TODO: we should introduce a getUniqueExitingBlocks on Loop
7840 if (!DeadInstructions.insert(Cmp).second)
7841 continue;
7842
7843 // An operand of the icmp is often a dead trunc, used by IndUpdate.
7844 // TODO: can recurse through operands in general
7845 for (Value *Op : Cmp->operands()) {
7846 if (isa<TruncInst>(Op) && Op->hasOneUse())
7847 DeadInstructions.insert(cast<Instruction>(Op));
7848 }
7849 }
7850
7851 // We create new "steps" for induction variable updates to which the original
7852 // induction variables map. An original update instruction will be dead if
7853 // all its users except the induction variable are dead.
7854 auto *Latch = OrigLoop->getLoopLatch();
7855 for (auto &Induction : Legal->getInductionVars()) {
7856 PHINode *Ind = Induction.first;
7857 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7858
7859 // If the tail is to be folded by masking, the primary induction variable,
7860 // if it exists, isn't dead: it will be used for masking. Don't kill it.
7861 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7862 continue;
7863
7864 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7865 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7866 }))
7867 DeadInstructions.insert(IndUpdate);
7868
7869 // We also record as "Dead" the type-casting instructions we had identified
7870 // during induction analysis.
We don't need any handling for them in the 7871 // vectorized loop because we have proven that, under a proper runtime 7872 // test guarding the vectorized loop, the value of the phi, and the casted 7873 // value of the phi, are the same. The last instruction in this casting chain 7874 // will get its scalar/vector/widened def from the scalar/vector/widened def 7875 // of the respective phi node. Any other casts in the induction def-use chain 7876 // have no other uses outside the phi update chain, and will be ignored. 7877 InductionDescriptor &IndDes = Induction.second; 7878 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7879 DeadInstructions.insert(Casts.begin(), Casts.end()); 7880 } 7881 } 7882 7883 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7884 7885 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7886 7887 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7888 Instruction::BinaryOps BinOp) { 7889 // When unrolling and the VF is 1, we only need to add a simple scalar. 7890 Type *Ty = Val->getType(); 7891 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7892 7893 if (Ty->isFloatingPointTy()) { 7894 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7895 7896 // Floating-point operations inherit FMF via the builder's flags. 7897 Value *MulOp = Builder.CreateFMul(C, Step); 7898 return Builder.CreateBinOp(BinOp, Val, MulOp); 7899 } 7900 Constant *C = ConstantInt::get(Ty, StartIdx); 7901 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7902 } 7903 7904 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7905 SmallVector<Metadata *, 4> MDs; 7906 // Reserve first location for self reference to the LoopID metadata node. 7907 MDs.push_back(nullptr); 7908 bool IsUnrollMetadata = false; 7909 MDNode *LoopID = L->getLoopID(); 7910 if (LoopID) { 7911 // First find existing loop unrolling disable metadata. 7912 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7913 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7914 if (MD) { 7915 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7916 IsUnrollMetadata = 7917 S && S->getString().startswith("llvm.loop.unroll.disable"); 7918 } 7919 MDs.push_back(LoopID->getOperand(i)); 7920 } 7921 } 7922 7923 if (!IsUnrollMetadata) { 7924 // Add runtime unroll disable metadata. 7925 LLVMContext &Context = L->getHeader()->getContext(); 7926 SmallVector<Metadata *, 1> DisableOperands; 7927 DisableOperands.push_back( 7928 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7929 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7930 MDs.push_back(DisableNode); 7931 MDNode *NewLoopID = MDNode::get(Context, MDs); 7932 // Set operand 0 to refer to the loop id itself. 7933 NewLoopID->replaceOperandWith(0, NewLoopID); 7934 L->setLoopID(NewLoopID); 7935 } 7936 } 7937 7938 //===--------------------------------------------------------------------===// 7939 // EpilogueVectorizerMainLoop 7940 //===--------------------------------------------------------------------===// 7941 7942 /// This function is partially responsible for generating the control flow 7943 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
7944 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7945 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7946 Loop *Lp = createVectorLoopSkeleton(""); 7947 7948 // Generate the code to check the minimum iteration count of the vector 7949 // epilogue (see below). 7950 EPI.EpilogueIterationCountCheck = 7951 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7952 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7953 7954 // Generate the code to check any assumptions that we've made for SCEV 7955 // expressions. 7956 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 7957 7958 // Generate the code that checks at runtime if arrays overlap. We put the 7959 // checks into a separate block to make the more common case of few elements 7960 // faster. 7961 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7962 7963 // Generate the iteration count check for the main loop, *after* the check 7964 // for the epilogue loop, so that the path-length is shorter for the case 7965 // that goes directly through the vector epilogue. The longer-path length for 7966 // the main loop is compensated for, by the gain from vectorizing the larger 7967 // trip count. Note: the branch will get updated later on when we vectorize 7968 // the epilogue. 7969 EPI.MainLoopIterationCountCheck = 7970 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7971 7972 // Generate the induction variable. 7973 OldInduction = Legal->getPrimaryInduction(); 7974 Type *IdxTy = Legal->getWidestInductionType(); 7975 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7976 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7977 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7978 EPI.VectorTripCount = CountRoundDown; 7979 Induction = 7980 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7981 getDebugLocFromInstOrOperands(OldInduction)); 7982 7983 // Skip induction resume value creation here because they will be created in 7984 // the second pass. If we created them here, they wouldn't be used anyway, 7985 // because the vplan in the second pass still contains the inductions from the 7986 // original loop. 7987 7988 return completeLoopSkeleton(Lp, OrigLoopID); 7989 } 7990 7991 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7992 LLVM_DEBUG({ 7993 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7994 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 7995 << ", Main Loop UF:" << EPI.MainLoopUF 7996 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 7997 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7998 }); 7999 } 8000 8001 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8002 DEBUG_WITH_TYPE(VerboseDebug, { 8003 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8004 }); 8005 } 8006 8007 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8008 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8009 assert(L && "Expected valid Loop."); 8010 assert(Bypass && "Expected valid bypass basic block."); 8011 unsigned VFactor = 8012 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8013 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8014 Value *Count = getOrCreateTripCount(L); 8015 // Reuse existing vector loop preheader for TC checks. 8016 // Note that new preheader block is generated for vector loop. 
8017 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8018 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8019 8020 // Generate code to check if the loop's trip count is less than VF * UF of the 8021 // main vector loop. 8022 auto P = 8023 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8024 8025 Value *CheckMinIters = Builder.CreateICmp( 8026 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8027 "min.iters.check"); 8028 8029 if (!ForEpilogue) 8030 TCCheckBlock->setName("vector.main.loop.iter.check"); 8031 8032 // Create new preheader for vector loop. 8033 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8034 DT, LI, nullptr, "vector.ph"); 8035 8036 if (ForEpilogue) { 8037 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8038 DT->getNode(Bypass)->getIDom()) && 8039 "TC check is expected to dominate Bypass"); 8040 8041 // Update dominator for Bypass & LoopExit. 8042 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8043 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8044 8045 LoopBypassBlocks.push_back(TCCheckBlock); 8046 8047 // Save the trip count so we don't have to regenerate it in the 8048 // vec.epilog.iter.check. This is safe to do because the trip count 8049 // generated here dominates the vector epilog iter check. 8050 EPI.TripCount = Count; 8051 } 8052 8053 ReplaceInstWithInst( 8054 TCCheckBlock->getTerminator(), 8055 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8056 8057 return TCCheckBlock; 8058 } 8059 8060 //===--------------------------------------------------------------------===// 8061 // EpilogueVectorizerEpilogueLoop 8062 //===--------------------------------------------------------------------===// 8063 8064 /// This function is partially responsible for generating the control flow 8065 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8066 BasicBlock * 8067 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8068 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8069 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8070 8071 // Now, compare the remaining count and if there aren't enough iterations to 8072 // execute the vectorized epilogue skip to the scalar part. 8073 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8074 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8075 LoopVectorPreHeader = 8076 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8077 LI, nullptr, "vec.epilog.ph"); 8078 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8079 VecEpilogueIterationCountCheck); 8080 8081 // Adjust the control flow taking the state info from the main loop 8082 // vectorization into account. 
8083 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8084 "expected this to be saved from the previous pass."); 8085 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8086 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8087 8088 DT->changeImmediateDominator(LoopVectorPreHeader, 8089 EPI.MainLoopIterationCountCheck); 8090 8091 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8092 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8093 8094 if (EPI.SCEVSafetyCheck) 8095 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8096 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8097 if (EPI.MemSafetyCheck) 8098 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8099 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8100 8101 DT->changeImmediateDominator( 8102 VecEpilogueIterationCountCheck, 8103 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8104 8105 DT->changeImmediateDominator(LoopScalarPreHeader, 8106 EPI.EpilogueIterationCountCheck); 8107 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8108 8109 // Keep track of bypass blocks, as they feed start values to the induction 8110 // phis in the scalar loop preheader. 8111 if (EPI.SCEVSafetyCheck) 8112 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8113 if (EPI.MemSafetyCheck) 8114 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8115 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8116 8117 // Generate a resume induction for the vector epilogue and put it in the 8118 // vector epilogue preheader 8119 Type *IdxTy = Legal->getWidestInductionType(); 8120 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8121 LoopVectorPreHeader->getFirstNonPHI()); 8122 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8123 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8124 EPI.MainLoopIterationCountCheck); 8125 8126 // Generate the induction variable. 8127 OldInduction = Legal->getPrimaryInduction(); 8128 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8129 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8130 Value *StartIdx = EPResumeVal; 8131 Induction = 8132 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8133 getDebugLocFromInstOrOperands(OldInduction)); 8134 8135 // Generate induction resume values. These variables save the new starting 8136 // indexes for the scalar loop. They are used to test if there are any tail 8137 // iterations left once the vector loop has completed. 8138 // Note that when the vectorized epilogue is skipped due to iteration count 8139 // check, then the resume value for the induction variable comes from 8140 // the trip count of the main vector loop, hence passing the AdditionalBypass 8141 // argument. 
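// For illustration only (hypothetical counts): with a trip count of 1003, a
// main loop VF * UF of 16 and an epilogue VF * UF of 4, the main loop covers
// 992 iterations and the epilogue 8 more, so the scalar loop resumes at 1000.
// If instead only 2 iterations remained after the main loop, the epilogue
// would be skipped and the scalar loop would resume at the main loop's vector
// trip count.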
8142 createInductionResumeValues(Lp, CountRoundDown,
8143 {VecEpilogueIterationCountCheck,
8144 EPI.VectorTripCount} /* AdditionalBypass */);
8145
8146 AddRuntimeUnrollDisableMetaData(Lp);
8147 return completeLoopSkeleton(Lp, OrigLoopID);
8148 }
8149
8150 BasicBlock *
8151 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8152 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8153
8154 assert(EPI.TripCount &&
8155 "Expected trip count to have been saved in the first pass.");
8156 assert(
8157 (!isa<Instruction>(EPI.TripCount) ||
8158 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8159 "saved trip count does not dominate insertion point.");
8160 Value *TC = EPI.TripCount;
8161 IRBuilder<> Builder(Insert->getTerminator());
8162 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8163
8164 // Generate code to check if the remaining iteration count is less than
8165 // VF * UF of the vector epilogue loop.
8166 auto P =
8167 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8168
8169 Value *CheckMinIters = Builder.CreateICmp(
8170 P, Count,
8171 ConstantInt::get(Count->getType(),
8172 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
8173 "min.epilog.iters.check");
8174
8175 ReplaceInstWithInst(
8176 Insert->getTerminator(),
8177 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8178
8179 LoopBypassBlocks.push_back(Insert);
8180 return Insert;
8181 }
8182
8183 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8184 LLVM_DEBUG({
8185 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8186 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
8187 << ", Main Loop UF:" << EPI.MainLoopUF
8188 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8189 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8190 });
8191 }
8192
8193 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8194 DEBUG_WITH_TYPE(VerboseDebug, {
8195 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
8196 });
8197 }
8198
8199 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8200 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8201 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8202 bool PredicateAtRangeStart = Predicate(Range.Start);
8203
8204 for (ElementCount TmpVF = Range.Start * 2;
8205 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8206 if (Predicate(TmpVF) != PredicateAtRangeStart) {
8207 Range.End = TmpVF;
8208 break;
8209 }
8210
8211 return PredicateAtRangeStart;
8212 }
8213
8214 /// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
8215 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8216 /// of VFs starting at a given VF and extending it as much as possible. Each
8217 /// vectorization decision can potentially shorten this sub-range during
8218 /// buildVPlan().
8219 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8220 ElementCount MaxVF) {
8221 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8222 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8223 VFRange SubRange = {VF, MaxVFPlusOne};
8224 VPlans.push_back(buildVPlan(SubRange));
8225 VF = SubRange.End;
8226 }
8227 }
8228
8229 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8230 VPlanPtr &Plan) {
8231 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8232
8233 // Look for cached value.
8234 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8235 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8236 if (ECEntryIt != EdgeMaskCache.end()) 8237 return ECEntryIt->second; 8238 8239 VPValue *SrcMask = createBlockInMask(Src, Plan); 8240 8241 // The terminator has to be a branch inst! 8242 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8243 assert(BI && "Unexpected terminator found"); 8244 8245 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8246 return EdgeMaskCache[Edge] = SrcMask; 8247 8248 // If source is an exiting block, we know the exit edge is dynamically dead 8249 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8250 // adding uses of an otherwise potentially dead instruction. 8251 if (OrigLoop->isLoopExiting(Src)) 8252 return EdgeMaskCache[Edge] = SrcMask; 8253 8254 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8255 assert(EdgeMask && "No Edge Mask found for condition"); 8256 8257 if (BI->getSuccessor(0) != Dst) 8258 EdgeMask = Builder.createNot(EdgeMask); 8259 8260 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8261 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8262 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8263 // The select version does not introduce new UB if SrcMask is false and 8264 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8265 VPValue *False = Plan->getOrAddVPValue( 8266 ConstantInt::getFalse(BI->getCondition()->getType())); 8267 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8268 } 8269 8270 return EdgeMaskCache[Edge] = EdgeMask; 8271 } 8272 8273 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8274 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8275 8276 // Look for cached value. 8277 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8278 if (BCEntryIt != BlockMaskCache.end()) 8279 return BCEntryIt->second; 8280 8281 // All-one mask is modelled as no-mask following the convention for masked 8282 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8283 VPValue *BlockMask = nullptr; 8284 8285 if (OrigLoop->getHeader() == BB) { 8286 if (!CM.blockNeedsPredication(BB)) 8287 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8288 8289 // Create the block in mask as the first non-phi instruction in the block. 8290 VPBuilder::InsertPointGuard Guard(Builder); 8291 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8292 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8293 8294 // Introduce the early-exit compare IV <= BTC to form header block mask. 8295 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8296 // Start by constructing the desired canonical IV. 
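// A minimal illustration of why BTC is used (editor's note): for an i8
// induction that runs for all 256 values, TC wraps to 0, so the compare
// "%iv <u %tc" would deactivate every lane, whereas BTC == 255 and
// "%iv <=u 255" keeps exactly the intended lanes active.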
8297 VPValue *IV = nullptr; 8298 if (Legal->getPrimaryInduction()) 8299 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8300 else { 8301 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8302 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8303 IV = IVRecipe->getVPValue(); 8304 } 8305 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8306 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8307 8308 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8309 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8310 // as a second argument, we only pass the IV here and extract the 8311 // tripcount from the transform state where codegen of the VP instructions 8312 // happen. 8313 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8314 } else { 8315 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8316 } 8317 return BlockMaskCache[BB] = BlockMask; 8318 } 8319 8320 // This is the block mask. We OR all incoming edges. 8321 for (auto *Predecessor : predecessors(BB)) { 8322 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8323 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8324 return BlockMaskCache[BB] = EdgeMask; 8325 8326 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8327 BlockMask = EdgeMask; 8328 continue; 8329 } 8330 8331 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8332 } 8333 8334 return BlockMaskCache[BB] = BlockMask; 8335 } 8336 8337 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8338 VPlanPtr &Plan) { 8339 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8340 "Must be called with either a load or store"); 8341 8342 auto willWiden = [&](ElementCount VF) -> bool { 8343 if (VF.isScalar()) 8344 return false; 8345 LoopVectorizationCostModel::InstWidening Decision = 8346 CM.getWideningDecision(I, VF); 8347 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8348 "CM decision should be taken at this point."); 8349 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8350 return true; 8351 if (CM.isScalarAfterVectorization(I, VF) || 8352 CM.isProfitableToScalarize(I, VF)) 8353 return false; 8354 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8355 }; 8356 8357 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8358 return nullptr; 8359 8360 VPValue *Mask = nullptr; 8361 if (Legal->isMaskRequired(I)) 8362 Mask = createBlockInMask(I->getParent(), Plan); 8363 8364 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8365 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8366 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8367 8368 StoreInst *Store = cast<StoreInst>(I); 8369 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8370 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8371 } 8372 8373 VPWidenIntOrFpInductionRecipe * 8374 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8375 // Check if this is an integer or fp induction. If so, build the recipe that 8376 // produces its scalar and vector values. 
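// For example (editor's sketch, assuming VF = 4, UF = 1 and a step of 1),
// the widened integer induction materializes roughly as
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %ph ],
//                                 [ %vec.ind.next, %vector.body ]
//   %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
// alongside the scalar steps needed by any scalarized users.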
8377 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8378 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8379 II.getKind() == InductionDescriptor::IK_FpInduction) { 8380 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8381 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8382 return new VPWidenIntOrFpInductionRecipe( 8383 Phi, Start, Casts.empty() ? nullptr : Casts.front()); 8384 } 8385 8386 return nullptr; 8387 } 8388 8389 VPWidenIntOrFpInductionRecipe * 8390 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8391 VPlan &Plan) const { 8392 // Optimize the special case where the source is a constant integer 8393 // induction variable. Notice that we can only optimize the 'trunc' case 8394 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8395 // (c) other casts depend on pointer size. 8396 8397 // Determine whether \p K is a truncation based on an induction variable that 8398 // can be optimized. 8399 auto isOptimizableIVTruncate = 8400 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8401 return [=](ElementCount VF) -> bool { 8402 return CM.isOptimizableIVTruncate(K, VF); 8403 }; 8404 }; 8405 8406 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8407 isOptimizableIVTruncate(I), Range)) { 8408 8409 InductionDescriptor II = 8410 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8411 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8412 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8413 Start, nullptr, I); 8414 } 8415 return nullptr; 8416 } 8417 8418 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8419 // If all incoming values are equal, the incoming VPValue can be used directly 8420 // instead of creating a new VPBlendRecipe. 8421 Value *FirstIncoming = Phi->getIncomingValue(0); 8422 if (all_of(Phi->incoming_values(), [FirstIncoming](const Value *Inc) { 8423 return FirstIncoming == Inc; 8424 })) { 8425 return Plan->getOrAddVPValue(Phi->getIncomingValue(0)); 8426 } 8427 8428 // We know that all PHIs in non-header blocks are converted into selects, so 8429 // we don't have to worry about the insertion order and we can just use the 8430 // builder. At this point we generate the predication tree. There may be 8431 // duplications since this is a simple recursive scan, but future 8432 // optimizations will clean it up. 
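// For example (editor's sketch), a phi merging an if-then-else
//   %p = phi i32 [ %a, %if.then ], [ %b, %if.else ]
// becomes a BLEND recipe with operands (%a, mask(if.then), %b,
// mask(if.else)); VPBlendRecipe::execute later lowers this to a chain of
// selects on the edge masks.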
8433 SmallVector<VPValue *, 2> Operands; 8434 unsigned NumIncoming = Phi->getNumIncomingValues(); 8435 8436 for (unsigned In = 0; In < NumIncoming; In++) { 8437 VPValue *EdgeMask = 8438 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8439 assert((EdgeMask || NumIncoming == 1) && 8440 "Multiple predecessors with one having a full mask"); 8441 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8442 if (EdgeMask) 8443 Operands.push_back(EdgeMask); 8444 } 8445 return toVPRecipeResult(new VPBlendRecipe(Phi, Operands)); 8446 } 8447 8448 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8449 VPlan &Plan) const { 8450 8451 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8452 [this, CI](ElementCount VF) { 8453 return CM.isScalarWithPredication(CI, VF); 8454 }, 8455 Range); 8456 8457 if (IsPredicated) 8458 return nullptr; 8459 8460 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8461 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8462 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8463 ID == Intrinsic::pseudoprobe || 8464 ID == Intrinsic::experimental_noalias_scope_decl)) 8465 return nullptr; 8466 8467 auto willWiden = [&](ElementCount VF) -> bool { 8468 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8469 // The following case may be scalarized depending on the VF. 8470 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8471 // version of the instruction. 8472 // Is it beneficial to perform intrinsic call compared to lib call? 8473 bool NeedToScalarize = false; 8474 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8475 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8476 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8477 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8478 "Cannot have invalid costs while widening"); 8479 return UseVectorIntrinsic || !NeedToScalarize; 8480 }; 8481 8482 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8483 return nullptr; 8484 8485 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8486 } 8487 8488 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8489 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8490 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8491 // Instruction should be widened, unless it is scalar after vectorization, 8492 // scalarization is profitable or it is predicated. 
8493 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8494 return CM.isScalarAfterVectorization(I, VF) || 8495 CM.isProfitableToScalarize(I, VF) || 8496 CM.isScalarWithPredication(I, VF); 8497 }; 8498 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8499 Range); 8500 } 8501 8502 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8503 auto IsVectorizableOpcode = [](unsigned Opcode) { 8504 switch (Opcode) { 8505 case Instruction::Add: 8506 case Instruction::And: 8507 case Instruction::AShr: 8508 case Instruction::BitCast: 8509 case Instruction::FAdd: 8510 case Instruction::FCmp: 8511 case Instruction::FDiv: 8512 case Instruction::FMul: 8513 case Instruction::FNeg: 8514 case Instruction::FPExt: 8515 case Instruction::FPToSI: 8516 case Instruction::FPToUI: 8517 case Instruction::FPTrunc: 8518 case Instruction::FRem: 8519 case Instruction::FSub: 8520 case Instruction::ICmp: 8521 case Instruction::IntToPtr: 8522 case Instruction::LShr: 8523 case Instruction::Mul: 8524 case Instruction::Or: 8525 case Instruction::PtrToInt: 8526 case Instruction::SDiv: 8527 case Instruction::Select: 8528 case Instruction::SExt: 8529 case Instruction::Shl: 8530 case Instruction::SIToFP: 8531 case Instruction::SRem: 8532 case Instruction::Sub: 8533 case Instruction::Trunc: 8534 case Instruction::UDiv: 8535 case Instruction::UIToFP: 8536 case Instruction::URem: 8537 case Instruction::Xor: 8538 case Instruction::ZExt: 8539 return true; 8540 } 8541 return false; 8542 }; 8543 8544 if (!IsVectorizableOpcode(I->getOpcode())) 8545 return nullptr; 8546 8547 // Success: widen this instruction. 8548 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8549 } 8550 8551 VPBasicBlock *VPRecipeBuilder::handleReplication( 8552 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8553 VPlanPtr &Plan) { 8554 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8555 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8556 Range); 8557 8558 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8559 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8560 Range); 8561 8562 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8563 IsUniform, IsPredicated); 8564 setRecipe(I, Recipe); 8565 Plan->addVPValue(I, Recipe); 8566 8567 // Find if I uses a predicated instruction. If so, it will use its scalar 8568 // value. Avoid hoisting the insert-element which packs the scalar value into 8569 // a vector value, as that happens iff all users use the vector value. 8570 for (VPValue *Op : Recipe->operands()) { 8571 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8572 if (!PredR) 8573 continue; 8574 auto *RepR = 8575 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8576 assert(RepR->isPredicated() && 8577 "expected Replicate recipe to be predicated"); 8578 RepR->setAlsoPack(false); 8579 } 8580 8581 // Finalize the recipe for Instr, first if it is not predicated. 8582 if (!IsPredicated) { 8583 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8584 VPBB->appendRecipe(Recipe); 8585 return VPBB; 8586 } 8587 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8588 assert(VPBB->getSuccessors().empty() && 8589 "VPBB has successors when handling predicated replication."); 8590 // Record predicated instructions for above packing optimizations. 
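// The call below wraps the recipe in a triangular if-then region of the
// following shape (editor's sketch; names follow the "pred.<opcode>"
// convention used in createReplicateRegion):
//
//   pred.store.entry             ; BranchOnMask(BlockInMask)
//      |        \
//      |    pred.store.if        ; the replicated, predicated instruction
//      |        /
//   pred.store.continue          ; optional PredInstPHI for non-void results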
8591 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8592 VPBlockUtils::insertBlockAfter(Region, VPBB); 8593 auto *RegSucc = new VPBasicBlock(); 8594 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8595 return RegSucc; 8596 } 8597 8598 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8599 VPRecipeBase *PredRecipe, 8600 VPlanPtr &Plan) { 8601 // Instructions marked for predication are replicated and placed under an 8602 // if-then construct to prevent side-effects. 8603 8604 // Generate recipes to compute the block mask for this region. 8605 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8606 8607 // Build the triangular if-then region. 8608 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8609 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8610 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8611 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8612 auto *PHIRecipe = Instr->getType()->isVoidTy() 8613 ? nullptr 8614 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8615 if (PHIRecipe) { 8616 Plan->removeVPValueFor(Instr); 8617 Plan->addVPValue(Instr, PHIRecipe); 8618 } 8619 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8620 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8621 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8622 8623 // Note: first set Entry as region entry and then connect successors starting 8624 // from it in order, to propagate the "parent" of each VPBasicBlock. 8625 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8626 VPBlockUtils::connectBlocks(Pred, Exit); 8627 8628 return Region; 8629 } 8630 8631 VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8632 VFRange &Range, 8633 VPlanPtr &Plan) { 8634 // First, check for specific widening recipes that deal with calls, memory 8635 // operations, inductions and Phi nodes. 
8636 if (auto *CI = dyn_cast<CallInst>(Instr))
8637 return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan));
8638
8639 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8640 return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan));
8641
8642 VPRecipeBase *Recipe;
8643 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8644 if (Phi->getParent() != OrigLoop->getHeader())
8645 return tryToBlend(Phi, Plan);
8646 if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
8647 return toVPRecipeResult(Recipe);
8648
8649 if (Legal->isReductionVariable(Phi)) {
8650 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8651 VPValue *StartV =
8652 Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
8653 return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
8654 }
8655
8656 return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
8657 }
8658
8659 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8660 cast<TruncInst>(Instr), Range, *Plan)))
8661 return toVPRecipeResult(Recipe);
8662
8663 if (!shouldWiden(Instr, Range))
8664 return nullptr;
8665
8666 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8667 return toVPRecipeResult(new VPWidenGEPRecipe(
8668 GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop));
8669
8670 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8671 bool InvariantCond =
8672 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8673 return toVPRecipeResult(new VPWidenSelectRecipe(
8674 *SI, Plan->mapToVPValues(SI->operands()), InvariantCond));
8675 }
8676
8677 return toVPRecipeResult(tryToWiden(Instr, *Plan));
8678 }
8679
8680 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8681 ElementCount MaxVF) {
8682 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8683
8684 // Collect instructions from the original loop that will become trivially dead
8685 // in the vectorized loop. We don't need to vectorize these instructions. For
8686 // example, original induction update instructions can become dead because we
8687 // separately emit induction "steps" when generating code for the new loop.
8688 // Similarly, we create a new latch condition when setting up the structure
8689 // of the new loop, so the old one can become dead.
8690 SmallPtrSet<Instruction *, 4> DeadInstructions;
8691 collectTriviallyDeadInstructions(DeadInstructions);
8692
8693 // Add assume instructions we need to drop to DeadInstructions, to prevent
8694 // them from being added to the VPlan.
8695 // TODO: We only need to drop assumes in blocks that get flattened. If the
8696 // control flow is preserved, we should keep them.
8697 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8698 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8699
8700 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8701 // Dead instructions do not need sinking. Remove them from SinkAfter.
8702 for (Instruction *I : DeadInstructions)
8703 SinkAfter.erase(I);
8704
8705 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8706 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8707 VFRange SubRange = {VF, MaxVFPlusOne};
8708 VPlans.push_back(
8709 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8710 VF = SubRange.End;
8711 }
8712 }
8713
8714 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8715 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8716 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8717
8718 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8719
8720 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8721
8722 // ---------------------------------------------------------------------------
8723 // Pre-construction: record ingredients whose recipes we'll need to further
8724 // process after constructing the initial VPlan.
8725 // ---------------------------------------------------------------------------
8726
8727 // Mark instructions we'll need to sink later and their targets as
8728 // ingredients whose recipe we'll need to record.
8729 for (auto &Entry : SinkAfter) {
8730 RecipeBuilder.recordRecipeOf(Entry.first);
8731 RecipeBuilder.recordRecipeOf(Entry.second);
8732 }
8733 for (auto &Reduction : CM.getInLoopReductionChains()) {
8734 PHINode *Phi = Reduction.first;
8735 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8736 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8737
8738 RecipeBuilder.recordRecipeOf(Phi);
8739 for (auto &R : ReductionOperations) {
8740 RecipeBuilder.recordRecipeOf(R);
8741 // For min/max reductions, where we have a pair of icmp/select, we also
8742 // need to record the ICmp recipe, so it can be removed later.
8743 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8744 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8745 }
8746 }
8747
8748 // For each interleave group which is relevant for this (possibly trimmed)
8749 // Range, add it to the set of groups to be later applied to the VPlan and add
8750 // placeholders for its members' Recipes which we'll be replacing with a
8751 // single VPInterleaveRecipe.
8752 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8753 auto applyIG = [IG, this](ElementCount VF) -> bool {
8754 return (VF.isVector() && // Query is illegal for VF == 1
8755 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8756 LoopVectorizationCostModel::CM_Interleave);
8757 };
8758 if (!getDecisionAndClampRange(applyIG, Range))
8759 continue;
8760 InterleaveGroups.insert(IG);
8761 for (unsigned i = 0; i < IG->getFactor(); i++)
8762 if (Instruction *Member = IG->getMember(i))
8763 RecipeBuilder.recordRecipeOf(Member);
8764 }
8765
8766 // ---------------------------------------------------------------------------
8767 // Build initial VPlan: Scan the body of the loop in a topological order to
8768 // visit each basic block after having visited its predecessor basic blocks.
8769 // ---------------------------------------------------------------------------
8770
8771 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8772 auto Plan = std::make_unique<VPlan>();
8773 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8774 Plan->setEntry(VPBB);
8775
8776 // Scan the body of the loop in a topological order to visit each basic block
8777 // after having visited its predecessor basic blocks.
8778 LoopBlocksDFS DFS(OrigLoop);
8779 DFS.perform(LI);
8780
8781 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8782 // Relevant instructions from basic block BB will be grouped into VPRecipe
8783 // ingredients and fill a new VPBasicBlock.
8784 unsigned VPBBsForBB = 0;
8785 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8786 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8787 VPBB = FirstVPBBForBB;
8788 Builder.setInsertPoint(VPBB);
8789
8790 // Introduce each ingredient into VPlan.
8791 // TODO: Model and preserve debug intrinsics in VPlan.
8792 for (Instruction &I : BB->instructionsWithoutDebug()) {
8793 Instruction *Instr = &I;
8794
8795 // First filter out irrelevant instructions, to ensure no recipes are
8796 // built for them.
8797 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8798 continue;
8799
8800 if (auto RecipeOrValue =
8801 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8802 // If Instr can be simplified to an existing VPValue, use it.
8803 if (RecipeOrValue.is<VPValue *>()) {
8804 Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
8805 continue;
8806 }
8807 // Otherwise, add the new recipe.
8808 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8809 for (auto *Def : Recipe->definedValues()) {
8810 auto *UV = Def->getUnderlyingValue();
8811 Plan->addVPValue(UV, Def);
8812 }
8813
8814 RecipeBuilder.setRecipe(Instr, Recipe);
8815 VPBB->appendRecipe(Recipe);
8816 continue;
8817 }
8818
8819 // Otherwise, if all widening options failed, Instruction is to be
8820 // replicated. This may create a successor for VPBB.
8821 VPBasicBlock *NextVPBB =
8822 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8823 if (NextVPBB != VPBB) {
8824 VPBB = NextVPBB;
8825 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8826 : "");
8827 }
8828 }
8829 }
8830
8831 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8832 // may also be empty, such as the last one, VPBB, reflecting original
8833 // basic-blocks with no recipes.
8834 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8835 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8836 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8837 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8838 delete PreEntry;
8839
8840 // ---------------------------------------------------------------------------
8841 // Transform initial VPlan: Apply previously taken decisions, in order, to
8842 // bring the VPlan to its final state.
8843 // ---------------------------------------------------------------------------
8844
8845 // Apply Sink-After legal constraints.
8846 for (auto &Entry : SinkAfter) {
8847 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8848 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8849 // If the target is in a replication region, make sure to move Sink to the
8850 // block after it, not into the replication region itself.
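// Editor's illustration: if Target lives in a replicate region
//   pred.x.entry -> pred.x.if -> pred.x.continue -> <successor>
// then Sink is placed at the start of <successor> rather than next to
// Target, since recipes inside the region are executed per lane under the
// region's mask.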
8851 if (auto *Region = 8852 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8853 if (Region->isReplicator()) { 8854 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8855 VPBasicBlock *NextBlock = 8856 cast<VPBasicBlock>(Region->getSuccessors().front()); 8857 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8858 continue; 8859 } 8860 } 8861 Sink->moveAfter(Target); 8862 } 8863 8864 // Interleave memory: for each Interleave Group we marked earlier as relevant 8865 // for this VPlan, replace the Recipes widening its memory instructions with a 8866 // single VPInterleaveRecipe at its insertion point. 8867 for (auto IG : InterleaveGroups) { 8868 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8869 RecipeBuilder.getRecipe(IG->getInsertPos())); 8870 SmallVector<VPValue *, 4> StoredValues; 8871 for (unsigned i = 0; i < IG->getFactor(); ++i) 8872 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8873 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8874 8875 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8876 Recipe->getMask()); 8877 VPIG->insertBefore(Recipe); 8878 unsigned J = 0; 8879 for (unsigned i = 0; i < IG->getFactor(); ++i) 8880 if (Instruction *Member = IG->getMember(i)) { 8881 if (!Member->getType()->isVoidTy()) { 8882 VPValue *OriginalV = Plan->getVPValue(Member); 8883 Plan->removeVPValueFor(Member); 8884 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8885 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8886 J++; 8887 } 8888 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8889 } 8890 } 8891 8892 // Adjust the recipes for any inloop reductions. 8893 if (Range.Start.isVector()) 8894 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8895 8896 // Finally, if tail is folded by masking, introduce selects between the phi 8897 // and the live-out instruction of each reduction, at the end of the latch. 8898 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8899 Builder.setInsertPoint(VPBB); 8900 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8901 for (auto &Reduction : Legal->getReductionVars()) { 8902 if (CM.isInLoopReduction(Reduction.first)) 8903 continue; 8904 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8905 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8906 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8907 } 8908 } 8909 8910 std::string PlanName; 8911 raw_string_ostream RSO(PlanName); 8912 ElementCount VF = Range.Start; 8913 Plan->addVF(VF); 8914 RSO << "Initial VPlan for VF={" << VF; 8915 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8916 Plan->addVF(VF); 8917 RSO << "," << VF; 8918 } 8919 RSO << "},UF>=1"; 8920 RSO.flush(); 8921 Plan->setName(PlanName); 8922 8923 return Plan; 8924 } 8925 8926 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8927 // Outer loop handling: They may require CFG and instruction level 8928 // transformations before even evaluating whether vectorization is profitable. 8929 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8930 // the vectorization pipeline. 
8931 assert(!OrigLoop->isInnermost());
8932 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8933
8934 // Create new empty VPlan
8935 auto Plan = std::make_unique<VPlan>();
8936
8937 // Build hierarchical CFG
8938 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8939 HCFGBuilder.buildHierarchicalCFG();
8940
8941 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8942 VF *= 2)
8943 Plan->addVF(VF);
8944
8945 if (EnableVPlanPredication) {
8946 VPlanPredicator VPP(*Plan);
8947 VPP.predicate();
8948
8949 // Avoid running transformation to recipes until masked code generation in
8950 // VPlan-native path is in place.
8951 return Plan;
8952 }
8953
8954 SmallPtrSet<Instruction *, 1> DeadInstructions;
8955 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
8956 Legal->getInductionVars(),
8957 DeadInstructions, *PSE.getSE());
8958 return Plan;
8959 }
8960
8961 // Adjust the recipes for any inloop reductions. The chain of instructions
8962 // leading from the loop exit instr to the phi needs to be converted to
8963 // reductions, with one operand being vector and the other being the scalar
8964 // reduction chain.
8965 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8966 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8967 for (auto &Reduction : CM.getInLoopReductionChains()) {
8968 PHINode *Phi = Reduction.first;
8969 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8970 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8971
8972 // ReductionOperations are ordered top-down from the phi's use to the
8973 // LoopExitValue. We keep track of the previous item (the Chain) to tell
8974 // which of the two operands will remain scalar and which will be reduced.
8975 // For minmax the chain will be the select instructions.
8976 Instruction *Chain = Phi;
8977 for (Instruction *R : ReductionOperations) {
8978 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8979 RecurKind Kind = RdxDesc.getRecurrenceKind();
8980
8981 VPValue *ChainOp = Plan->getVPValue(Chain);
8982 unsigned FirstOpId;
8983 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8984 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8985 "Expected to replace a VPWidenSelectSC");
8986 FirstOpId = 1;
8987 } else {
8988 assert(isa<VPWidenRecipe>(WidenRecipe) &&
8989 "Expected to replace a VPWidenSC");
8990 FirstOpId = 0;
8991 }
8992 unsigned VecOpId =
8993 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8994 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8995
8996 auto *CondOp = CM.foldTailByMasking()
8997 ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8998 : nullptr; 8999 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9000 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9001 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9002 Plan->removeVPValueFor(R); 9003 Plan->addVPValue(R, RedRecipe); 9004 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9005 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9006 WidenRecipe->eraseFromParent(); 9007 9008 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9009 VPRecipeBase *CompareRecipe = 9010 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9011 assert(isa<VPWidenRecipe>(CompareRecipe) && 9012 "Expected to replace a VPWidenSC"); 9013 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9014 "Expected no remaining users"); 9015 CompareRecipe->eraseFromParent(); 9016 } 9017 Chain = R; 9018 } 9019 } 9020 } 9021 9022 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9023 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9024 VPSlotTracker &SlotTracker) const { 9025 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9026 IG->getInsertPos()->printAsOperand(O, false); 9027 O << ", "; 9028 getAddr()->printAsOperand(O, SlotTracker); 9029 VPValue *Mask = getMask(); 9030 if (Mask) { 9031 O << ", "; 9032 Mask->printAsOperand(O, SlotTracker); 9033 } 9034 for (unsigned i = 0; i < IG->getFactor(); ++i) 9035 if (Instruction *I = IG->getMember(i)) 9036 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9037 } 9038 #endif 9039 9040 void VPWidenCallRecipe::execute(VPTransformState &State) { 9041 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9042 *this, State); 9043 } 9044 9045 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9046 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9047 this, *this, InvariantCond, State); 9048 } 9049 9050 void VPWidenRecipe::execute(VPTransformState &State) { 9051 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9052 } 9053 9054 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9055 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9056 *this, State.UF, State.VF, IsPtrLoopInvariant, 9057 IsIndexLoopInvariant, State); 9058 } 9059 9060 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9061 assert(!State.Instance && "Int or FP induction being replicated."); 9062 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9063 getTruncInst(), getVPValue(0), 9064 getCastValue(), State); 9065 } 9066 9067 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9068 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9069 getStartValue(), this, State); 9070 } 9071 9072 void VPBlendRecipe::execute(VPTransformState &State) { 9073 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9074 // We know that all PHIs in non-header blocks are converted into 9075 // selects, so we don't have to worry about the insertion order and we 9076 // can just use the builder. 9077 // At this point we generate the predication tree. There may be 9078 // duplications since this is a simple recursive scan, but future 9079 // optimizations will clean it up. 
9080 9081 unsigned NumIncoming = getNumIncomingValues(); 9082 9083 // Generate a sequence of selects of the form: 9084 // SELECT(Mask3, In3, 9085 // SELECT(Mask2, In2, 9086 // SELECT(Mask1, In1, 9087 // In0))) 9088 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9089 // are essentially undef are taken from In0. 9090 InnerLoopVectorizer::VectorParts Entry(State.UF); 9091 for (unsigned In = 0; In < NumIncoming; ++In) { 9092 for (unsigned Part = 0; Part < State.UF; ++Part) { 9093 // We might have single edge PHIs (blocks) - use an identity 9094 // 'select' for the first PHI operand. 9095 Value *In0 = State.get(getIncomingValue(In), Part); 9096 if (In == 0) 9097 Entry[Part] = In0; // Initialize with the first incoming value. 9098 else { 9099 // Select between the current value and the previous incoming edge 9100 // based on the incoming mask. 9101 Value *Cond = State.get(getMask(In), Part); 9102 Entry[Part] = 9103 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9104 } 9105 } 9106 } 9107 for (unsigned Part = 0; Part < State.UF; ++Part) 9108 State.set(this, Entry[Part], Part); 9109 } 9110 9111 void VPInterleaveRecipe::execute(VPTransformState &State) { 9112 assert(!State.Instance && "Interleave group being replicated."); 9113 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9114 getStoredValues(), getMask()); 9115 } 9116 9117 void VPReductionRecipe::execute(VPTransformState &State) { 9118 assert(!State.Instance && "Reduction being replicated."); 9119 for (unsigned Part = 0; Part < State.UF; ++Part) { 9120 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9121 Value *NewVecOp = State.get(getVecOp(), Part); 9122 if (VPValue *Cond = getCondOp()) { 9123 Value *NewCond = State.get(Cond, Part); 9124 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9125 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9126 Kind, VecTy->getElementType()); 9127 Constant *IdenVec = 9128 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9129 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9130 NewVecOp = Select; 9131 } 9132 Value *NewRed = 9133 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9134 Value *PrevInChain = State.get(getChainOp(), Part); 9135 Value *NextInChain; 9136 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9137 NextInChain = 9138 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9139 NewRed, PrevInChain); 9140 } else { 9141 NextInChain = State.Builder.CreateBinOp( 9142 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9143 PrevInChain); 9144 } 9145 State.set(this, NextInChain, Part); 9146 } 9147 } 9148 9149 void VPReplicateRecipe::execute(VPTransformState &State) { 9150 if (State.Instance) { // Generate a single instance. 9151 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9152 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9153 *State.Instance, IsPredicated, State); 9154 // Insert scalar instance packing it into a vector. 9155 if (AlsoPack && State.VF.isVector()) { 9156 // If we're constructing lane 0, initialize to start from poison. 
9157 if (State.Instance->Lane.isFirstLane()) {
9158 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9159 Value *Poison = PoisonValue::get(
9160 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9161 State.set(this, Poison, State.Instance->Part);
9162 }
9163 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9164 }
9165 return;
9166 }
9167
9168 // Generate scalar instances for all VF lanes of all UF parts, unless the
9169 // instruction is uniform, in which case generate only the first lane for each
9170 // of the UF parts.
9171 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9172 assert((!State.VF.isScalable() || IsUniform) &&
9173 "Can't scalarize a scalable vector");
9174 for (unsigned Part = 0; Part < State.UF; ++Part)
9175 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9176 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9177 VPIteration(Part, Lane), IsPredicated,
9178 State);
9179 }
9180
9181 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9182 assert(State.Instance && "Branch on Mask works only on single instance.");
9183
9184 unsigned Part = State.Instance->Part;
9185 unsigned Lane = State.Instance->Lane.getKnownLane();
9186
9187 Value *ConditionBit = nullptr;
9188 VPValue *BlockInMask = getMask();
9189 if (BlockInMask) {
9190 ConditionBit = State.get(BlockInMask, Part);
9191 if (ConditionBit->getType()->isVectorTy())
9192 ConditionBit = State.Builder.CreateExtractElement(
9193 ConditionBit, State.Builder.getInt32(Lane));
9194 } else // Block in mask is all-one.
9195 ConditionBit = State.Builder.getTrue();
9196
9197 // Replace the temporary unreachable terminator with a new conditional branch,
9198 // whose two destinations will be set later when they are created.
9199 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9200 assert(isa<UnreachableInst>(CurrentTerminator) &&
9201 "Expected to replace unreachable terminator with conditional branch.");
9202 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9203 CondBr->setSuccessor(0, nullptr);
9204 ReplaceInstWithInst(CurrentTerminator, CondBr);
9205 }
9206
9207 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9208 assert(State.Instance && "Predicated instruction PHI works per instance.");
9209 Instruction *ScalarPredInst =
9210 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9211 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9212 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9213 assert(PredicatingBB && "Predicated block has no single predecessor.");
9214 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9215 "operand must be VPReplicateRecipe");
9216
9217 // By current pack/unpack logic we need to generate only a single phi node: if
9218 // a vector value for the predicated instruction exists at this point it means
9219 // the instruction has vector users only, and a phi for the vector value is
9220 // needed. In this case the recipe of the predicated instruction is marked to
9221 // also do that packing, thereby "hoisting" the insert-element sequence.
9222 // Otherwise, a phi node for the scalar value is needed.
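// Sketch of the two shapes (editor's note; names illustrative):
//   ; vector users only - merge the packed vector built in the "if" block
//   %vphi = phi <4 x i32> [ %prev.vec, %pred.x.entry ],
//                         [ %vec.with.new.lane, %pred.x.if ]
//   ; scalar users - merge the scalar result, poison when the lane is off
//   %sphi = phi i32 [ poison, %pred.x.entry ], [ %scalar.res, %pred.x.if ]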
9223 unsigned Part = State.Instance->Part; 9224 if (State.hasVectorValue(getOperand(0), Part)) { 9225 Value *VectorValue = State.get(getOperand(0), Part); 9226 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9227 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9228 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9229 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9230 if (State.hasVectorValue(this, Part)) 9231 State.reset(this, VPhi, Part); 9232 else 9233 State.set(this, VPhi, Part); 9234 // NOTE: Currently we need to update the value of the operand, so the next 9235 // predicated iteration inserts its generated value in the correct vector. 9236 State.reset(getOperand(0), VPhi, Part); 9237 } else { 9238 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9239 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9240 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9241 PredicatingBB); 9242 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9243 if (State.hasScalarValue(this, *State.Instance)) 9244 State.reset(this, Phi, *State.Instance); 9245 else 9246 State.set(this, Phi, *State.Instance); 9247 // NOTE: Currently we need to update the value of the operand, so the next 9248 // predicated iteration inserts its generated value in the correct vector. 9249 State.reset(getOperand(0), Phi, *State.Instance); 9250 } 9251 } 9252 9253 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9254 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9255 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9256 StoredValue ? nullptr : getVPValue(), 9257 getAddr(), StoredValue, getMask()); 9258 } 9259 9260 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9261 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9262 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9263 // for predication. 9264 static ScalarEpilogueLowering getScalarEpilogueLowering( 9265 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9266 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9267 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9268 LoopVectorizationLegality &LVL) { 9269 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9270 // don't look at hints or options, and don't request a scalar epilogue. 9271 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9272 // LoopAccessInfo (due to code dependency and not being able to reliably get 9273 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9274 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9275 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9276 // back to the old way and vectorize with versioning when forced. See D81345.) 
9277 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9278 PGSOQueryType::IRPass) && 9279 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9280 return CM_ScalarEpilogueNotAllowedOptSize; 9281 9282 // 2) If set, obey the directives 9283 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9284 switch (PreferPredicateOverEpilogue) { 9285 case PreferPredicateTy::ScalarEpilogue: 9286 return CM_ScalarEpilogueAllowed; 9287 case PreferPredicateTy::PredicateElseScalarEpilogue: 9288 return CM_ScalarEpilogueNotNeededUsePredicate; 9289 case PreferPredicateTy::PredicateOrDontVectorize: 9290 return CM_ScalarEpilogueNotAllowedUsePredicate; 9291 }; 9292 } 9293 9294 // 3) If set, obey the hints 9295 switch (Hints.getPredicate()) { 9296 case LoopVectorizeHints::FK_Enabled: 9297 return CM_ScalarEpilogueNotNeededUsePredicate; 9298 case LoopVectorizeHints::FK_Disabled: 9299 return CM_ScalarEpilogueAllowed; 9300 }; 9301 9302 // 4) if the TTI hook indicates this is profitable, request predication. 9303 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9304 LVL.getLAI())) 9305 return CM_ScalarEpilogueNotNeededUsePredicate; 9306 9307 return CM_ScalarEpilogueAllowed; 9308 } 9309 9310 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9311 // If Values have been set for this Def return the one relevant for \p Part. 9312 if (hasVectorValue(Def, Part)) 9313 return Data.PerPartOutput[Def][Part]; 9314 9315 if (!hasScalarValue(Def, {Part, 0})) { 9316 Value *IRV = Def->getLiveInIRValue(); 9317 Value *B = ILV->getBroadcastInstrs(IRV); 9318 set(Def, B, Part); 9319 return B; 9320 } 9321 9322 Value *ScalarValue = get(Def, {Part, 0}); 9323 // If we aren't vectorizing, we can just copy the scalar map values over 9324 // to the vector map. 9325 if (VF.isScalar()) { 9326 set(Def, ScalarValue, Part); 9327 return ScalarValue; 9328 } 9329 9330 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9331 bool IsUniform = RepR && RepR->isUniform(); 9332 9333 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9334 // Check if there is a scalar value for the selected lane. 9335 if (!hasScalarValue(Def, {Part, LastLane})) { 9336 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9337 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9338 "unexpected recipe found to be invariant"); 9339 IsUniform = true; 9340 LastLane = 0; 9341 } 9342 9343 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9344 9345 // Set the insert point after the last scalarized instruction. This 9346 // ensures the insertelement sequence will directly follow the scalar 9347 // definitions. 9348 auto OldIP = Builder.saveIP(); 9349 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9350 Builder.SetInsertPoint(&*NewIP); 9351 9352 // However, if we are vectorizing, we need to construct the vector values. 9353 // If the value is known to be uniform after vectorization, we can just 9354 // broadcast the scalar value corresponding to lane zero for each unroll 9355 // iteration. Otherwise, we construct the vector values using 9356 // insertelement instructions. Since the resulting vectors are stored in 9357 // State, we will only generate the insertelements once. 9358 Value *VectorValue = nullptr; 9359 if (IsUniform) { 9360 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9361 set(Def, VectorValue, Part); 9362 } else { 9363 // Initialize packing with insertelements to start from undef. 
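// e.g. for VF = 4 (editor's sketch) the vector value is rebuilt lane by
// lane from the scalar definitions, on top of a poison base:
//   %p0 = insertelement <4 x i32> poison, i32 %s0, i32 0
//   %p1 = insertelement <4 x i32> %p0,    i32 %s1, i32 1
//   ... and so on up to lane VF-1.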
9364 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9365 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9366 set(Def, Undef, Part); 9367 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9368 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9369 VectorValue = get(Def, Part); 9370 } 9371 Builder.restoreIP(OldIP); 9372 return VectorValue; 9373 } 9374 9375 // Process the loop in the VPlan-native vectorization path. This path builds 9376 // VPlan upfront in the vectorization pipeline, which allows to apply 9377 // VPlan-to-VPlan transformations from the very beginning without modifying the 9378 // input LLVM IR. 9379 static bool processLoopInVPlanNativePath( 9380 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9381 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9382 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9383 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9384 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9385 9386 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9387 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9388 return false; 9389 } 9390 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9391 Function *F = L->getHeader()->getParent(); 9392 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9393 9394 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9395 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9396 9397 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9398 &Hints, IAI); 9399 // Use the planner for outer loop vectorization. 9400 // TODO: CM is not used at this point inside the planner. Turn CM into an 9401 // optional argument if we don't need it in the future. 9402 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9403 9404 // Get user vectorization factor. 9405 ElementCount UserVF = Hints.getWidth(); 9406 9407 // Plan how to best vectorize, return the best VF and its cost. 9408 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9409 9410 // If we are stress testing VPlan builds, do not attempt to generate vector 9411 // code. Masked vector code generation support will follow soon. 9412 // Also, do not attempt to vectorize if no vector code will be produced. 9413 if (VPlanBuildStressTest || EnableVPlanPredication || 9414 VectorizationFactor::Disabled() == VF) 9415 return false; 9416 9417 LVP.setBestPlan(VF.Width, 1); 9418 9419 { 9420 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9421 F->getParent()->getDataLayout()); 9422 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9423 &CM, BFI, PSI, Checks); 9424 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9425 << L->getHeader()->getParent()->getName() << "\"\n"); 9426 LVP.executePlan(LB, DT); 9427 } 9428 9429 // Mark the loop as already vectorized to avoid vectorizing again. 9430 Hints.setAlreadyVectorized(); 9431 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9432 return true; 9433 } 9434 9435 // Emit a remark if there are stores to floats that required a floating point 9436 // extension. If the vectorized loop was generated with floating point there 9437 // will be a performance penalty from the conversion overhead and the change in 9438 // the vector width. 
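// A typical case (editor's illustration):
//   float F[N], G[N]; double Scale;
//   for (...) F[i] = G[i] * Scale;   // G[i] is fpext'ed to double, so the
//                                    // loop vectorizes at the double width
//                                    // and each float store needs a trunc.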
9439 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9440 SmallVector<Instruction *, 4> Worklist; 9441 for (BasicBlock *BB : L->getBlocks()) { 9442 for (Instruction &Inst : *BB) { 9443 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9444 if (S->getValueOperand()->getType()->isFloatTy()) 9445 Worklist.push_back(S); 9446 } 9447 } 9448 } 9449 9450 // Traverse the floating point stores upwards searching, for floating point 9451 // conversions. 9452 SmallPtrSet<const Instruction *, 4> Visited; 9453 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9454 while (!Worklist.empty()) { 9455 auto *I = Worklist.pop_back_val(); 9456 if (!L->contains(I)) 9457 continue; 9458 if (!Visited.insert(I).second) 9459 continue; 9460 9461 // Emit a remark if the floating point store required a floating 9462 // point conversion. 9463 // TODO: More work could be done to identify the root cause such as a 9464 // constant or a function return type and point the user to it. 9465 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9466 ORE->emit([&]() { 9467 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9468 I->getDebugLoc(), L->getHeader()) 9469 << "floating point conversion changes vector width. " 9470 << "Mixed floating point precision requires an up/down " 9471 << "cast that will negatively impact performance."; 9472 }); 9473 9474 for (Use &Op : I->operands()) 9475 if (auto *OpI = dyn_cast<Instruction>(Op)) 9476 Worklist.push_back(OpI); 9477 } 9478 } 9479 9480 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9481 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9482 !EnableLoopInterleaving), 9483 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9484 !EnableLoopVectorization) {} 9485 9486 bool LoopVectorizePass::processLoop(Loop *L) { 9487 assert((EnableVPlanNativePath || L->isInnermost()) && 9488 "VPlan-native path is not enabled. Only process inner loops."); 9489 9490 #ifndef NDEBUG 9491 const std::string DebugLocStr = getDebugLocString(L); 9492 #endif /* NDEBUG */ 9493 9494 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9495 << L->getHeader()->getParent()->getName() << "\" from " 9496 << DebugLocStr << "\n"); 9497 9498 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9499 9500 LLVM_DEBUG( 9501 dbgs() << "LV: Loop hints:" 9502 << " force=" 9503 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9504 ? "disabled" 9505 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9506 ? "enabled" 9507 : "?")) 9508 << " width=" << Hints.getWidth() 9509 << " unroll=" << Hints.getInterleave() << "\n"); 9510 9511 // Function containing loop 9512 Function *F = L->getHeader()->getParent(); 9513 9514 // Looking at the diagnostic output is the only way to determine if a loop 9515 // was vectorized (other than looking at the IR or machine code), so it 9516 // is important to generate an optimization remark for each loop. Most of 9517 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9518 // generated as OptimizationRemark and OptimizationRemarkMissed are 9519 // less verbose reporting vectorized loops and unvectorized loops that may 9520 // benefit from vectorization, respectively. 9521 9522 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9523 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9524 return false; 9525 } 9526 9527 PredicatedScalarEvolution PSE(*SE, *L); 9528 9529 // Check if it is legal to vectorize the loop. 
9530 LoopVectorizationRequirements Requirements(*ORE); 9531 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9532 &Requirements, &Hints, DB, AC, BFI, PSI); 9533 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9534 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9535 Hints.emitRemarkWithHints(); 9536 return false; 9537 } 9538 9539 // Check the function attributes and profiles to find out if this function 9540 // should be optimized for size. 9541 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9542 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9543 9544 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9545 // here. They may require CFG and instruction level transformations before 9546 // even evaluating whether vectorization is profitable. Since we cannot modify 9547 // the incoming IR, we need to build VPlan upfront in the vectorization 9548 // pipeline. 9549 if (!L->isInnermost()) 9550 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9551 ORE, BFI, PSI, Hints); 9552 9553 assert(L->isInnermost() && "Inner loop expected."); 9554 9555 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9556 // count by optimizing for size, to minimize overheads. 9557 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9558 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9559 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9560 << "This loop is worth vectorizing only if no scalar " 9561 << "iteration overheads are incurred."); 9562 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9563 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9564 else { 9565 LLVM_DEBUG(dbgs() << "\n"); 9566 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9567 } 9568 } 9569 9570 // Check the function attributes to see if implicit floats are allowed. 9571 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9572 // an integer loop and the vector instructions selected are purely integer 9573 // vector instructions? 9574 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9575 reportVectorizationFailure( 9576 "Can't vectorize when the NoImplicitFloat attribute is used", 9577 "loop not vectorized due to NoImplicitFloat attribute", 9578 "NoImplicitFloat", ORE, L); 9579 Hints.emitRemarkWithHints(); 9580 return false; 9581 } 9582 9583 // Check if the target supports potentially unsafe FP vectorization. 9584 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9585 // for the target we're vectorizing for, to make sure none of the 9586 // additional fp-math flags can help. 9587 if (Hints.isPotentiallyUnsafe() && 9588 TTI->isFPVectorizationPotentiallyUnsafe()) { 9589 reportVectorizationFailure( 9590 "Potentially unsafe FP op prevents vectorization", 9591 "loop not vectorized due to unsafe FP support.", 9592 "UnsafeFP", ORE, L); 9593 Hints.emitRemarkWithHints(); 9594 return false; 9595 } 9596 9597 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9598 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9599 9600 // If an override option has been passed in for interleaved accesses, use it. 9601 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9602 UseInterleaved = EnableInterleavedMemAccesses; 9603 9604 // Analyze interleaved memory accesses. 
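// For example (editor's note), the two loads in
//   ... = A[2 * i] + A[2 * i + 1];
// form an interleave group of factor 2 that can be vectorized as a single
// wide load followed by shufflevectors to split out the even and odd lanes.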
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get the user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, returning the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
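    // These are OptimizationRemarkMissed remarks, so users typically see them
    // with -Rpass-missed=loop-vectorize (clang) or
    // -pass-remarks-missed=loop-vectorize (opt).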
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not to
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided not to vectorize the loop, then interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // The second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
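        // EPI's main-loop fields are re-pointed at the epilogue VF/UF below so
        // that the shared codegen paths operate on the epilogue's factors
        // during this second pass.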
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
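
  // (Simplified form provides a preheader, a single backedge and dedicated
  // exit blocks, which the vectorizer's loop-skeleton construction relies on.)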

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,
                                      SE,  TLI, TTI,     nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}