//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
// and that predication is preferred; this enum lists the choices. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
Mostly " 273 "useful for getting consistent testing.")); 274 275 static cl::opt<bool> ForceTargetSupportsScalableVectors( 276 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 277 cl::desc( 278 "Pretend that scalable vectors are supported, even if the target does " 279 "not support them. This flag should only be used for testing.")); 280 281 static cl::opt<unsigned> SmallLoopCost( 282 "small-loop-cost", cl::init(20), cl::Hidden, 283 cl::desc( 284 "The cost of a loop that is considered 'small' by the interleaver.")); 285 286 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 287 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 288 cl::desc("Enable the use of the block frequency analysis to access PGO " 289 "heuristics minimizing code growth in cold regions and being more " 290 "aggressive in hot regions.")); 291 292 // Runtime interleave loops for load/store throughput. 293 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 294 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 295 cl::desc( 296 "Enable runtime interleaving until load/store ports are saturated")); 297 298 /// Interleave small loops with scalar reductions. 299 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 300 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 301 cl::desc("Enable interleaving for loops with small iteration counts that " 302 "contain scalar reductions to expose ILP.")); 303 304 /// The number of stores in a loop that are allowed to need predication. 305 static cl::opt<unsigned> NumberOfStoresToPredicate( 306 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 307 cl::desc("Max number of stores to be predicated behind an if.")); 308 309 static cl::opt<bool> EnableIndVarRegisterHeur( 310 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 311 cl::desc("Count the induction variable only once when interleaving")); 312 313 static cl::opt<bool> EnableCondStoresVectorization( 314 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 315 cl::desc("Enable if predication of stores during vectorization.")); 316 317 static cl::opt<unsigned> MaxNestedScalarReductionIC( 318 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 319 cl::desc("The maximum interleave count to use when interleaving a scalar " 320 "reduction in a nested loop.")); 321 322 static cl::opt<bool> 323 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 324 cl::Hidden, 325 cl::desc("Prefer in-loop vector reductions, " 326 "overriding the targets preference.")); 327 328 static cl::opt<bool> PreferPredicatedReductionSelect( 329 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 330 cl::desc( 331 "Prefer predicating a reduction operation over an after loop select.")); 332 333 cl::opt<bool> EnableVPlanNativePath( 334 "enable-vplan-native-path", cl::init(false), cl::Hidden, 335 cl::desc("Enable VPlan-native vectorization path with " 336 "support for outer loop vectorization.")); 337 338 // FIXME: Remove this switch once we have divergence analysis. Currently we 339 // assume divergent non-backedge branches when this switch is true. 340 cl::opt<bool> EnableVPlanPredication( 341 "enable-vplan-predication", cl::init(false), cl::Hidden, 342 cl::desc("Enable VPlan-native vectorization path predicator with " 343 "support for outer loop vectorization.")); 344 345 // This flag enables the stress testing of the VPlan H-CFG construction in the 346 // VPlan-native vectorization path. 
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPValue *StartV, VPValue *Def,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a scalar instance of \p Instr for the part and lane given
  /// by \p Instance. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In
  /// the latter case \p EntryVal is a TruncInst and we must not record anything
  /// for that IV, but it's error-prone to expect callers of this routine to
  /// care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have
  /// been added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
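  // ("Three loops" here refers to the main vectorized loop, the vectorized
  // epilogue loop, and the remaining scalar epilogue loop.)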
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
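/// For fixed VFs the result is the constant Step * VF.getKnownMinValue(); for
/// scalable VFs that constant is additionally multiplied by VScale.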
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
                             I)
            << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Hints for the loop vectorization cost model about how the scalar epilogue
// loop should be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Set up cost-based decisions for the user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
1347 CM_Interleave,
1348 CM_GatherScatter,
1349 CM_Scalarize
1350 };
1351
1352 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1353 /// instruction \p I and vector width \p VF.
1354 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1355 InstructionCost Cost) {
1356 assert(VF.isVector() && "Expected VF >=2");
1357 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1358 }
1359
1360 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1361 /// interleaving group \p Grp and vector width \p VF.
1362 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1363 ElementCount VF, InstWidening W,
1364 InstructionCost Cost) {
1365 assert(VF.isVector() && "Expected VF >=2");
1366 /// Broadcast this decision to all instructions inside the group.
1367 /// But the cost will be assigned to one instruction only.
1368 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1369 if (auto *I = Grp->getMember(i)) {
1370 if (Grp->getInsertPos() == I)
1371 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1372 else
1373 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1374 }
1375 }
1376 }
1377
1378 /// Return the cost model decision for the given instruction \p I and vector
1379 /// width \p VF. Return CM_Unknown if this instruction did not pass
1380 /// through the cost modeling.
1381 InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1382 assert(VF.isVector() && "Expected VF to be a vector VF");
1383 // Cost model is not run in the VPlan-native path - return conservative
1384 // result until this changes.
1385 if (EnableVPlanNativePath)
1386 return CM_GatherScatter;
1387
1388 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1389 auto Itr = WideningDecisions.find(InstOnVF);
1390 if (Itr == WideningDecisions.end())
1391 return CM_Unknown;
1392 return Itr->second.first;
1393 }
1394
1395 /// Return the vectorization cost for the given instruction \p I and vector
1396 /// width \p VF.
1397 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1398 assert(VF.isVector() && "Expected VF >=2");
1399 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1400 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1401 "The cost is not calculated");
1402 return WideningDecisions[InstOnVF].second;
1403 }
1404
1405 /// Return True if instruction \p I is an optimizable truncate whose operand
1406 /// is an induction variable. Such a truncate will be removed by adding a new
1407 /// induction variable with the destination type.
1408 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1409 // If the instruction is not a truncate, return false.
1410 auto *Trunc = dyn_cast<TruncInst>(I);
1411 if (!Trunc)
1412 return false;
1413
1414 // Get the source and destination types of the truncate.
1415 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1416 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1417
1418 // If the truncate is free for the given types, return false. Replacing a
1419 // free truncate with an induction variable would add an induction variable
1420 // update instruction to each iteration of the loop. We exclude from this
1421 // check the primary induction variable since it will need an update
1422 // instruction regardless.
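// Illustrative sketch (not part of the original source): with a 64-bit
// induction phi %iv, a use such as
//   %t = trunc i64 %iv to i32
// is optimizable here, so long as the exclusion above does not apply: the
// vectorizer can introduce a new i32 induction variable with the destination
// type and remove the truncate entirely.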
1423 Value *Op = Trunc->getOperand(0);
1424 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1425 return false;
1426
1427 // If the truncated value is not an induction variable, return false.
1428 return Legal->isInductionPhi(Op);
1429 }
1430
1431 /// Collects the instructions to scalarize for each predicated instruction in
1432 /// the loop.
1433 void collectInstsToScalarize(ElementCount VF);
1434
1435 /// Collect Uniform and Scalar values for the given \p VF.
1436 /// The sets depend on CM decision for Load/Store instructions
1437 /// that may be vectorized as interleave, gather-scatter or scalarized.
1438 void collectUniformsAndScalars(ElementCount VF) {
1439 // Do the analysis once.
1440 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1441 return;
1442 setCostBasedWideningDecision(VF);
1443 collectLoopUniforms(VF);
1444 collectLoopScalars(VF);
1445 }
1446
1447 /// Returns true if the target machine supports masked store operation
1448 /// for the given \p DataType and kind of access to \p Ptr.
1449 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1450 return Legal->isConsecutivePtr(Ptr) &&
1451 TTI.isLegalMaskedStore(DataType, Alignment);
1452 }
1453
1454 /// Returns true if the target machine supports masked load operation
1455 /// for the given \p DataType and kind of access to \p Ptr.
1456 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1457 return Legal->isConsecutivePtr(Ptr) &&
1458 TTI.isLegalMaskedLoad(DataType, Alignment);
1459 }
1460
1461 /// Returns true if the target machine supports masked scatter operation
1462 /// for the given \p DataType.
1463 bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1464 return TTI.isLegalMaskedScatter(DataType, Alignment);
1465 }
1466
1467 /// Returns true if the target machine supports masked gather operation
1468 /// for the given \p DataType.
1469 bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1470 return TTI.isLegalMaskedGather(DataType, Alignment);
1471 }
1472
1473 /// Returns true if the target machine can represent \p V as a masked gather
1474 /// or scatter operation.
1475 bool isLegalGatherOrScatter(Value *V) {
1476 bool LI = isa<LoadInst>(V);
1477 bool SI = isa<StoreInst>(V);
1478 if (!LI && !SI)
1479 return false;
1480 auto *Ty = getMemInstValueType(V);
1481 Align Align = getLoadStoreAlignment(V);
1482 return (LI && isLegalMaskedGather(Ty, Align)) ||
1483 (SI && isLegalMaskedScatter(Ty, Align));
1484 }
1485
1486 /// Returns true if the target machine supports all of the reduction
1487 /// variables found for the given VF.
1488 bool canVectorizeReductions(ElementCount VF) {
1489 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1490 RecurrenceDescriptor RdxDesc = Reduction.second;
1491 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1492 }));
1493 }
1494
1495 /// Returns true if \p I is an instruction that will be scalarized with
1496 /// predication. Such instructions include conditional stores and
1497 /// instructions that may divide by zero.
1498 /// If a non-zero VF has been calculated, we check if I will be scalarized
1499 /// with predication for that VF.
1500 bool isScalarWithPredication(Instruction *I,
1501 ElementCount VF = ElementCount::getFixed(1));
1502
1503 // Returns true if \p I is an instruction that will be predicated either
1504 // through scalar predication or masked load/store or masked gather/scatter.
1505 // Superset of instructions that return true for isScalarWithPredication.
1506 bool isPredicatedInst(Instruction *I) { 1507 if (!blockNeedsPredication(I->getParent())) 1508 return false; 1509 // Loads and stores that need some form of masked operation are predicated 1510 // instructions. 1511 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1512 return Legal->isMaskRequired(I); 1513 return isScalarWithPredication(I); 1514 } 1515 1516 /// Returns true if \p I is a memory instruction with consecutive memory 1517 /// access that can be widened. 1518 bool 1519 memoryInstructionCanBeWidened(Instruction *I, 1520 ElementCount VF = ElementCount::getFixed(1)); 1521 1522 /// Returns true if \p I is a memory instruction in an interleaved-group 1523 /// of memory accesses that can be vectorized with wide vector loads/stores 1524 /// and shuffles. 1525 bool 1526 interleavedAccessCanBeWidened(Instruction *I, 1527 ElementCount VF = ElementCount::getFixed(1)); 1528 1529 /// Check if \p Instr belongs to any interleaved access group. 1530 bool isAccessInterleaved(Instruction *Instr) { 1531 return InterleaveInfo.isInterleaved(Instr); 1532 } 1533 1534 /// Get the interleaved access group that \p Instr belongs to. 1535 const InterleaveGroup<Instruction> * 1536 getInterleavedAccessGroup(Instruction *Instr) { 1537 return InterleaveInfo.getInterleaveGroup(Instr); 1538 } 1539 1540 /// Returns true if we're required to use a scalar epilogue for at least 1541 /// the final iteration of the original loop. 1542 bool requiresScalarEpilogue() const { 1543 if (!isScalarEpilogueAllowed()) 1544 return false; 1545 // If we might exit from anywhere but the latch, must run the exiting 1546 // iteration in scalar form. 1547 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1548 return true; 1549 return InterleaveInfo.requiresScalarEpilogue(); 1550 } 1551 1552 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1553 /// loop hint annotation. 1554 bool isScalarEpilogueAllowed() const { 1555 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1556 } 1557 1558 /// Returns true if all loop blocks should be masked to fold tail loop. 1559 bool foldTailByMasking() const { return FoldTailByMasking; } 1560 1561 bool blockNeedsPredication(BasicBlock *BB) { 1562 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1563 } 1564 1565 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1566 /// nodes to the chain of instructions representing the reductions. Uses a 1567 /// MapVector to ensure deterministic iteration order. 1568 using ReductionChainMap = 1569 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1570 1571 /// Return the chain of instructions representing an inloop reduction. 1572 const ReductionChainMap &getInLoopReductionChains() const { 1573 return InLoopReductionChains; 1574 } 1575 1576 /// Returns true if the Phi is part of an inloop reduction. 1577 bool isInLoopReduction(PHINode *Phi) const { 1578 return InLoopReductionChains.count(Phi); 1579 } 1580 1581 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1582 /// with factor VF. Return the cost of the instruction, including 1583 /// scalarization overhead if it's needed. 1584 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF); 1585 1586 /// Estimate cost of a call instruction CI if it were vectorized with factor 1587 /// VF. Return the cost of the instruction, including scalarization overhead 1588 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1589 /// scalarized - 1590 /// i.e. 
either vector version isn't available, or is too expensive. 1591 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1592 bool &NeedToScalarize); 1593 1594 /// Invalidates decisions already taken by the cost model. 1595 void invalidateCostModelingDecisions() { 1596 WideningDecisions.clear(); 1597 Uniforms.clear(); 1598 Scalars.clear(); 1599 } 1600 1601 private: 1602 unsigned NumPredStores = 0; 1603 1604 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1605 /// than zero. One is returned if vectorization should best be avoided due 1606 /// to cost. 1607 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1608 ElementCount UserVF); 1609 1610 /// The vectorization cost is a combination of the cost itself and a boolean 1611 /// indicating whether any of the contributing operations will actually 1612 /// operate on 1613 /// vector values after type legalization in the backend. If this latter value 1614 /// is 1615 /// false, then all operations will be scalarized (i.e. no vectorization has 1616 /// actually taken place). 1617 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1618 1619 /// Returns the expected execution cost. The unit of the cost does 1620 /// not matter because we use the 'cost' units to compare different 1621 /// vector widths. The cost that is returned is *not* normalized by 1622 /// the factor width. 1623 VectorizationCostTy expectedCost(ElementCount VF); 1624 1625 /// Returns the execution time cost of an instruction for a given vector 1626 /// width. Vector width of one means scalar. 1627 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1628 1629 /// The cost-computation logic from getInstructionCost which provides 1630 /// the vector type as an output parameter. 1631 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1632 Type *&VectorTy); 1633 1634 /// Return the cost of instructions in an inloop reduction pattern, if I is 1635 /// part of that pattern. 1636 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, 1637 Type *VectorTy, 1638 TTI::TargetCostKind CostKind); 1639 1640 /// Calculate vectorization cost of memory instruction \p I. 1641 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1642 1643 /// The cost computation for scalarized memory instruction. 1644 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1645 1646 /// The cost computation for interleaving group of memory instructions. 1647 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1648 1649 /// The cost computation for Gather/Scatter instruction. 1650 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1651 1652 /// The cost computation for widening instruction \p I with consecutive 1653 /// memory access. 1654 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1655 1656 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1657 /// Load: scalar load + broadcast. 1658 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1659 /// element) 1660 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1661 1662 /// Estimate the overhead of scalarizing an instruction. This is a 1663 /// convenience wrapper for the type-based getScalarizationOverhead API. 
1664 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
1665
1666 /// Returns whether the instruction is a load or store and will be emitted
1667 /// as a vector operation.
1668 bool isConsecutiveLoadOrStore(Instruction *I);
1669
1670 /// Returns true if an artificially high cost for emulated masked memrefs
1671 /// should be used.
1672 bool useEmulatedMaskMemRefHack(Instruction *I);
1673
1674 /// Map of scalar integer values to the smallest bitwidth they can be legally
1675 /// represented as. The vector equivalents of these values should be truncated
1676 /// to this type.
1677 MapVector<Instruction *, uint64_t> MinBWs;
1678
1679 /// A type representing the costs for instructions if they were to be
1680 /// scalarized rather than vectorized. The entries are Instruction-Cost
1681 /// pairs.
1682 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1683
1684 /// A set containing all BasicBlocks that are known to be present after
1685 /// vectorization as a predicated block.
1686 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1687
1688 /// Records whether it is allowed to have the original scalar loop execute at
1689 /// least once. This may be needed as a fallback loop in case runtime
1690 /// aliasing/dependence checks fail, or to handle the tail/remainder
1691 /// iterations when the trip count is unknown or doesn't divide by the VF,
1692 /// or as a peel-loop to handle gaps in interleave-groups.
1693 /// Under optsize and when the trip count is very small we don't allow any
1694 /// iterations to execute in the scalar loop.
1695 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1696
1697 /// All blocks of the loop are to be masked to fold the tail of the scalar
1698 /// iterations.
1698 bool FoldTailByMasking = false;
1699
1700 /// A map holding scalar costs for different vectorization factors. The
1701 /// presence of a cost for an instruction in the mapping indicates that the
1702 /// instruction will be scalarized when vectorizing with the associated
1703 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1704 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1705
1706 /// Holds the instructions known to be uniform after vectorization.
1707 /// The data is collected per VF.
1708 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1709
1710 /// Holds the instructions known to be scalar after vectorization.
1711 /// The data is collected per VF.
1712 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1713
1714 /// Holds the instructions (address computations) that are forced to be
1715 /// scalarized.
1716 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1717
1718 /// PHINodes of the reductions that should be expanded in-loop along with
1719 /// their associated chains of reduction operations, in program order from top
1720 /// (PHI) to bottom.
1721 ReductionChainMap InLoopReductionChains;
1722
1723 /// A Map of inloop reduction operations and their immediate chain operand.
1724 /// FIXME: This can be removed once reductions can be costed correctly in
1725 /// vplan. This was added to allow quick lookup to the inloop operations,
1726 /// without having to loop through InLoopReductionChains.
1727 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1728
1729 /// Returns the expected difference in cost from scalarizing the expression
1730 /// feeding a predicated instruction \p PredInst.
The instructions to 1731 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1732 /// non-negative return value implies the expression will be scalarized. 1733 /// Currently, only single-use chains are considered for scalarization. 1734 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1735 ElementCount VF); 1736 1737 /// Collect the instructions that are uniform after vectorization. An 1738 /// instruction is uniform if we represent it with a single scalar value in 1739 /// the vectorized loop corresponding to each vector iteration. Examples of 1740 /// uniform instructions include pointer operands of consecutive or 1741 /// interleaved memory accesses. Note that although uniformity implies an 1742 /// instruction will be scalar, the reverse is not true. In general, a 1743 /// scalarized instruction will be represented by VF scalar values in the 1744 /// vectorized loop, each corresponding to an iteration of the original 1745 /// scalar loop. 1746 void collectLoopUniforms(ElementCount VF); 1747 1748 /// Collect the instructions that are scalar after vectorization. An 1749 /// instruction is scalar if it is known to be uniform or will be scalarized 1750 /// during vectorization. Non-uniform scalarized instructions will be 1751 /// represented by VF values in the vectorized loop, each corresponding to an 1752 /// iteration of the original scalar loop. 1753 void collectLoopScalars(ElementCount VF); 1754 1755 /// Keeps cost model vectorization decision and cost for instructions. 1756 /// Right now it is used for memory instructions only. 1757 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1758 std::pair<InstWidening, InstructionCost>>; 1759 1760 DecisionList WideningDecisions; 1761 1762 /// Returns true if \p V is expected to be vectorized and it needs to be 1763 /// extracted. 1764 bool needsExtract(Value *V, ElementCount VF) const { 1765 Instruction *I = dyn_cast<Instruction>(V); 1766 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1767 TheLoop->isLoopInvariant(I)) 1768 return false; 1769 1770 // Assume we can vectorize V (and hence we need extraction) if the 1771 // scalars are not computed yet. This can happen, because it is called 1772 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1773 // the scalars are collected. That should be a safe assumption in most 1774 // cases, because we check if the operands have vectorizable types 1775 // beforehand in LoopVectorizationLegality. 1776 return Scalars.find(VF) == Scalars.end() || 1777 !isScalarAfterVectorization(I, VF); 1778 }; 1779 1780 /// Returns a range containing only operands needing to be extracted. 1781 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1782 ElementCount VF) { 1783 return SmallVector<Value *, 4>(make_filter_range( 1784 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1785 } 1786 1787 /// Determines if we have the infrastructure to vectorize loop \p L and its 1788 /// epilogue, assuming the main loop is vectorized by \p VF. 1789 bool isCandidateForEpilogueVectorization(const Loop &L, 1790 const ElementCount VF) const; 1791 1792 /// Returns true if epilogue vectorization is considered profitable, and 1793 /// false otherwise. 1794 /// \p VF is the vectorization factor chosen for the original loop. 1795 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1796 1797 public: 1798 /// The loop that we evaluate. 
1799 Loop *TheLoop;
1800
1801 /// Predicated scalar evolution analysis.
1802 PredicatedScalarEvolution &PSE;
1803
1804 /// Loop Info analysis.
1805 LoopInfo *LI;
1806
1807 /// Vectorization legality.
1808 LoopVectorizationLegality *Legal;
1809
1810 /// Vector target information.
1811 const TargetTransformInfo &TTI;
1812
1813 /// Target Library Info.
1814 const TargetLibraryInfo *TLI;
1815
1816 /// Demanded bits analysis.
1817 DemandedBits *DB;
1818
1819 /// Assumption cache.
1820 AssumptionCache *AC;
1821
1822 /// Interface to emit optimization remarks.
1823 OptimizationRemarkEmitter *ORE;
1824
1825 const Function *TheFunction;
1826
1827 /// Loop Vectorize Hint.
1828 const LoopVectorizeHints *Hints;
1829
1830 /// The interleaved access information contains groups of interleaved accesses
1831 /// with the same stride and close to each other.
1832 InterleavedAccessInfo &InterleaveInfo;
1833
1834 /// Values to ignore in the cost model.
1835 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1836
1837 /// Values to ignore in the cost model when VF > 1.
1838 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1839
1840 /// Profitable vector factors.
1841 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1842 };
1843 } // end namespace llvm
1844
1845 /// Helper struct to manage generating runtime checks for vectorization.
1846 ///
1847 /// The runtime checks are created up-front in temporary blocks to allow better
1848 /// cost estimation, and are un-linked from the existing IR. After deciding to
1849 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1850 /// temporary blocks are completely removed.
1851 class GeneratedRTChecks {
1852 /// Basic block which contains the generated SCEV checks, if any.
1853 BasicBlock *SCEVCheckBlock = nullptr;
1854
1855 /// The value representing the result of the generated SCEV checks. If it is
1856 /// nullptr, either no SCEV checks have been generated or they have been used.
1857 Value *SCEVCheckCond = nullptr;
1858
1859 /// Basic block which contains the generated memory runtime checks, if any.
1860 BasicBlock *MemCheckBlock = nullptr;
1861
1862 /// The value representing the result of the generated memory runtime checks.
1863 /// If it is nullptr, either no memory runtime checks have been generated or
1864 /// they have been used.
1865 Instruction *MemRuntimeCheckCond = nullptr;
1866
1867 DominatorTree *DT;
1868 LoopInfo *LI;
1869
1870 SCEVExpander SCEVExp;
1871 SCEVExpander MemCheckExp;
1872
1873 public:
1874 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1875 const DataLayout &DL)
1876 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1877 MemCheckExp(SE, DL, "scev.check") {}
1878
1879 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1880 /// accurately estimate the cost of the runtime checks. The blocks are
1881 /// un-linked from the IR and are added back during vector code generation. If
1882 /// there is no vector code generation, the check blocks are removed
1883 /// completely.
1884 void Create(Loop *L, const LoopAccessInfo &LAI,
1885 const SCEVUnionPredicate &UnionPred) {
1886
1887 BasicBlock *LoopHeader = L->getHeader();
1888 BasicBlock *Preheader = L->getLoopPreheader();
1889
1890 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1891 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1892 // may be used by SCEVExpander. The blocks will be un-linked from their
1893 // predecessors and removed from LI & DT at the end of the function.
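// Illustrative layout (an assumption for exposition, not taken verbatim from
// the code below): when both kinds of checks are needed, the two splits leave
// the CFG temporarily looking like
//   preheader -> vector.scevcheck -> vector.memcheck -> original loop header
// until the check blocks are unhooked again at the end of this function.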
1894 if (!UnionPred.isAlwaysTrue()) { 1895 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1896 nullptr, "vector.scevcheck"); 1897 1898 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1899 &UnionPred, SCEVCheckBlock->getTerminator()); 1900 } 1901 1902 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1903 if (RtPtrChecking.Need) { 1904 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1905 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1906 "vector.memcheck"); 1907 1908 std::tie(std::ignore, MemRuntimeCheckCond) = 1909 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1910 RtPtrChecking.getChecks(), MemCheckExp); 1911 assert(MemRuntimeCheckCond && 1912 "no RT checks generated although RtPtrChecking " 1913 "claimed checks are required"); 1914 } 1915 1916 if (!MemCheckBlock && !SCEVCheckBlock) 1917 return; 1918 1919 // Unhook the temporary block with the checks, update various places 1920 // accordingly. 1921 if (SCEVCheckBlock) 1922 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1923 if (MemCheckBlock) 1924 MemCheckBlock->replaceAllUsesWith(Preheader); 1925 1926 if (SCEVCheckBlock) { 1927 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1928 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1929 Preheader->getTerminator()->eraseFromParent(); 1930 } 1931 if (MemCheckBlock) { 1932 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1933 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1934 Preheader->getTerminator()->eraseFromParent(); 1935 } 1936 1937 DT->changeImmediateDominator(LoopHeader, Preheader); 1938 if (MemCheckBlock) { 1939 DT->eraseNode(MemCheckBlock); 1940 LI->removeBlock(MemCheckBlock); 1941 } 1942 if (SCEVCheckBlock) { 1943 DT->eraseNode(SCEVCheckBlock); 1944 LI->removeBlock(SCEVCheckBlock); 1945 } 1946 } 1947 1948 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1949 /// unused. 1950 ~GeneratedRTChecks() { 1951 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1952 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1953 if (!SCEVCheckCond) 1954 SCEVCleaner.markResultUsed(); 1955 1956 if (!MemRuntimeCheckCond) 1957 MemCheckCleaner.markResultUsed(); 1958 1959 if (MemRuntimeCheckCond) { 1960 auto &SE = *MemCheckExp.getSE(); 1961 // Memory runtime check generation creates compares that use expanded 1962 // values. Remove them before running the SCEVExpanderCleaners. 1963 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1964 if (MemCheckExp.isInsertedInstruction(&I)) 1965 continue; 1966 SE.forgetValue(&I); 1967 SE.eraseValueFromMap(&I); 1968 I.eraseFromParent(); 1969 } 1970 } 1971 MemCheckCleaner.cleanup(); 1972 SCEVCleaner.cleanup(); 1973 1974 if (SCEVCheckCond) 1975 SCEVCheckBlock->eraseFromParent(); 1976 if (MemRuntimeCheckCond) 1977 MemCheckBlock->eraseFromParent(); 1978 } 1979 1980 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1981 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1982 /// depending on the generated condition. 
1983 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 1984 BasicBlock *LoopVectorPreHeader, 1985 BasicBlock *LoopExitBlock) { 1986 if (!SCEVCheckCond) 1987 return nullptr; 1988 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 1989 if (C->isZero()) 1990 return nullptr; 1991 1992 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 1993 1994 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 1995 // Create new preheader for vector loop. 1996 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 1997 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 1998 1999 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2000 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2001 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2002 SCEVCheckBlock); 2003 2004 DT->addNewBlock(SCEVCheckBlock, Pred); 2005 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2006 2007 ReplaceInstWithInst( 2008 SCEVCheckBlock->getTerminator(), 2009 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2010 // Mark the check as used, to prevent it from being removed during cleanup. 2011 SCEVCheckCond = nullptr; 2012 return SCEVCheckBlock; 2013 } 2014 2015 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2016 /// the branches to branch to the vector preheader or \p Bypass, depending on 2017 /// the generated condition. 2018 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2019 BasicBlock *LoopVectorPreHeader) { 2020 // Check if we generated code that checks in runtime if arrays overlap. 2021 if (!MemRuntimeCheckCond) 2022 return nullptr; 2023 2024 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2025 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2026 MemCheckBlock); 2027 2028 DT->addNewBlock(MemCheckBlock, Pred); 2029 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2030 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2031 2032 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2033 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2034 2035 ReplaceInstWithInst( 2036 MemCheckBlock->getTerminator(), 2037 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2038 MemCheckBlock->getTerminator()->setDebugLoc( 2039 Pred->getTerminator()->getDebugLoc()); 2040 2041 // Mark the check as used, to prevent it from being removed during cleanup. 2042 MemRuntimeCheckCond = nullptr; 2043 return MemCheckBlock; 2044 } 2045 }; 2046 2047 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2048 // vectorization. The loop needs to be annotated with #pragma omp simd 2049 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2050 // vector length information is not provided, vectorization is not considered 2051 // explicit. Interleave hints are not allowed either. These limitations will be 2052 // relaxed in the future. 2053 // Please, note that we are currently forced to abuse the pragma 'clang 2054 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2055 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2056 // provides *explicit vectorization hints* (LV can bypass legal checks and 2057 // assume that vectorization is legal). However, both hints are implemented 2058 // using the same metadata (llvm.loop.vectorize, processed by 2059 // LoopVectorizeHints). This will be fixed in the future when the native IR 2060 // representation for pragma 'omp simd' is introduced. 
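// Illustrative sketch (hypothetical source, not from this pass): an outer
// loop that qualifies under the rules above could look like
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// The same nest without an explicit vector length, or with an interleave
// hint, is not treated as explicitly vectorized.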
2061 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2062 OptimizationRemarkEmitter *ORE) { 2063 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2064 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2065 2066 // Only outer loops with an explicit vectorization hint are supported. 2067 // Unannotated outer loops are ignored. 2068 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2069 return false; 2070 2071 Function *Fn = OuterLp->getHeader()->getParent(); 2072 if (!Hints.allowVectorization(Fn, OuterLp, 2073 true /*VectorizeOnlyWhenForced*/)) { 2074 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2075 return false; 2076 } 2077 2078 if (Hints.getInterleave() > 1) { 2079 // TODO: Interleave support is future work. 2080 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2081 "outer loops.\n"); 2082 Hints.emitRemarkWithHints(); 2083 return false; 2084 } 2085 2086 return true; 2087 } 2088 2089 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2090 OptimizationRemarkEmitter *ORE, 2091 SmallVectorImpl<Loop *> &V) { 2092 // Collect inner loops and outer loops without irreducible control flow. For 2093 // now, only collect outer loops that have explicit vectorization hints. If we 2094 // are stress testing the VPlan H-CFG construction, we collect the outermost 2095 // loop of every loop nest. 2096 if (L.isInnermost() || VPlanBuildStressTest || 2097 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2098 LoopBlocksRPO RPOT(&L); 2099 RPOT.perform(LI); 2100 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2101 V.push_back(&L); 2102 // TODO: Collect inner loops inside marked outer loops in case 2103 // vectorization fails for the outer loop. Do not invoke 2104 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2105 // already known to be reducible. We can use an inherited attribute for 2106 // that. 2107 return; 2108 } 2109 } 2110 for (Loop *InnerL : L) 2111 collectSupportedLoops(*InnerL, LI, ORE, V); 2112 } 2113 2114 namespace { 2115 2116 /// The LoopVectorize Pass. 2117 struct LoopVectorize : public FunctionPass { 2118 /// Pass identification, replacement for typeid 2119 static char ID; 2120 2121 LoopVectorizePass Impl; 2122 2123 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2124 bool VectorizeOnlyWhenForced = false) 2125 : FunctionPass(ID), 2126 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2127 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2128 } 2129 2130 bool runOnFunction(Function &F) override { 2131 if (skipFunction(F)) 2132 return false; 2133 2134 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2135 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2136 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2137 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2138 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2139 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2140 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2141 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2142 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2143 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2144 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2145 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2146 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2147 2148 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2149 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2150 2151 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2152 GetLAA, *ORE, PSI).MadeAnyChange; 2153 } 2154 2155 void getAnalysisUsage(AnalysisUsage &AU) const override { 2156 AU.addRequired<AssumptionCacheTracker>(); 2157 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2158 AU.addRequired<DominatorTreeWrapperPass>(); 2159 AU.addRequired<LoopInfoWrapperPass>(); 2160 AU.addRequired<ScalarEvolutionWrapperPass>(); 2161 AU.addRequired<TargetTransformInfoWrapperPass>(); 2162 AU.addRequired<AAResultsWrapperPass>(); 2163 AU.addRequired<LoopAccessLegacyAnalysis>(); 2164 AU.addRequired<DemandedBitsWrapperPass>(); 2165 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2166 AU.addRequired<InjectTLIMappingsLegacy>(); 2167 2168 // We currently do not preserve loopinfo/dominator analyses with outer loop 2169 // vectorization. Until this is addressed, mark these analyses as preserved 2170 // only for non-VPlan-native path. 2171 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2172 if (!EnableVPlanNativePath) { 2173 AU.addPreserved<LoopInfoWrapperPass>(); 2174 AU.addPreserved<DominatorTreeWrapperPass>(); 2175 } 2176 2177 AU.addPreserved<BasicAAWrapperPass>(); 2178 AU.addPreserved<GlobalsAAWrapperPass>(); 2179 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2180 } 2181 }; 2182 2183 } // end anonymous namespace 2184 2185 //===----------------------------------------------------------------------===// 2186 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2187 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2188 //===----------------------------------------------------------------------===// 2189 2190 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2191 // We need to place the broadcast of invariant variables outside the loop, 2192 // but only if it's proven safe to do so. Else, broadcast will be inside 2193 // vector loop body. 2194 Instruction *Instr = dyn_cast<Instruction>(V); 2195 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2196 (!Instr || 2197 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2198 // Place the code for broadcasting invariant variables in the new preheader. 2199 IRBuilder<>::InsertPointGuard Guard(Builder); 2200 if (SafeToHoist) 2201 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2202 2203 // Broadcast the scalar into all locations in the vector. 
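// For example (illustrative, assuming a fixed VF of 4): splatting a scalar
// %v produces the vector <%v, %v, %v, %v>, named "broadcast" below.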
2204 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2205 2206 return Shuf; 2207 } 2208 2209 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2210 const InductionDescriptor &II, Value *Step, Value *Start, 2211 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2212 VPTransformState &State) { 2213 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2214 "Expected either an induction phi-node or a truncate of it!"); 2215 2216 // Construct the initial value of the vector IV in the vector loop preheader 2217 auto CurrIP = Builder.saveIP(); 2218 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2219 if (isa<TruncInst>(EntryVal)) { 2220 assert(Start->getType()->isIntegerTy() && 2221 "Truncation requires an integer type"); 2222 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2223 Step = Builder.CreateTrunc(Step, TruncType); 2224 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2225 } 2226 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2227 Value *SteppedStart = 2228 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2229 2230 // We create vector phi nodes for both integer and floating-point induction 2231 // variables. Here, we determine the kind of arithmetic we will perform. 2232 Instruction::BinaryOps AddOp; 2233 Instruction::BinaryOps MulOp; 2234 if (Step->getType()->isIntegerTy()) { 2235 AddOp = Instruction::Add; 2236 MulOp = Instruction::Mul; 2237 } else { 2238 AddOp = II.getInductionOpcode(); 2239 MulOp = Instruction::FMul; 2240 } 2241 2242 // Multiply the vectorization factor by the step using integer or 2243 // floating-point arithmetic as appropriate. 2244 Value *ConstVF = 2245 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2246 Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF); 2247 2248 // Create a vector splat to use in the induction update. 2249 // 2250 // FIXME: If the step is non-constant, we create the vector splat with 2251 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2252 // handle a constant vector splat. 2253 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2254 Value *SplatVF = isa<Constant>(Mul) 2255 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2256 : Builder.CreateVectorSplat(VF, Mul); 2257 Builder.restoreIP(CurrIP); 2258 2259 // We may need to add the step a number of times, depending on the unroll 2260 // factor. The last of those goes into the PHI. 2261 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2262 &*LoopVectorBody->getFirstInsertionPt()); 2263 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2264 Instruction *LastInduction = VecInd; 2265 for (unsigned Part = 0; Part < UF; ++Part) { 2266 State.set(Def, LastInduction, Part); 2267 2268 if (isa<TruncInst>(EntryVal)) 2269 addMetadata(LastInduction, EntryVal); 2270 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2271 State, Part); 2272 2273 LastInduction = cast<Instruction>( 2274 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2275 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2276 } 2277 2278 // Move the last step to the end of the latch block. This ensures consistent 2279 // placement of all induction updates. 
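// Illustrative result (assuming VF = 4, UF = 2 and an integer step S): the
// "vec.ind" phi starts at <Start, Start+S, Start+2*S, Start+3*S>, part 1 is
// the phi plus a splat of 4*S, and the value fed back on the backedge is the
// phi plus a splat of 8*S.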
2280 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2281 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2282 auto *ICmp = cast<Instruction>(Br->getCondition()); 2283 LastInduction->moveBefore(ICmp); 2284 LastInduction->setName("vec.ind.next"); 2285 2286 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2287 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2288 } 2289 2290 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2291 return Cost->isScalarAfterVectorization(I, VF) || 2292 Cost->isProfitableToScalarize(I, VF); 2293 } 2294 2295 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2296 if (shouldScalarizeInstruction(IV)) 2297 return true; 2298 auto isScalarInst = [&](User *U) -> bool { 2299 auto *I = cast<Instruction>(U); 2300 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2301 }; 2302 return llvm::any_of(IV->users(), isScalarInst); 2303 } 2304 2305 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2306 const InductionDescriptor &ID, const Instruction *EntryVal, 2307 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2308 unsigned Part, unsigned Lane) { 2309 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2310 "Expected either an induction phi-node or a truncate of it!"); 2311 2312 // This induction variable is not the phi from the original loop but the 2313 // newly-created IV based on the proof that casted Phi is equal to the 2314 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2315 // re-uses the same InductionDescriptor that original IV uses but we don't 2316 // have to do any recording in this case - that is done when original IV is 2317 // processed. 2318 if (isa<TruncInst>(EntryVal)) 2319 return; 2320 2321 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2322 if (Casts.empty()) 2323 return; 2324 // Only the first Cast instruction in the Casts vector is of interest. 2325 // The rest of the Casts (if exist) have no uses outside the 2326 // induction update chain itself. 2327 if (Lane < UINT_MAX) 2328 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2329 else 2330 State.set(CastDef, VectorLoopVal, Part); 2331 } 2332 2333 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2334 TruncInst *Trunc, VPValue *Def, 2335 VPValue *CastDef, 2336 VPTransformState &State) { 2337 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2338 "Primary induction variable must have an integer type"); 2339 2340 auto II = Legal->getInductionVars().find(IV); 2341 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2342 2343 auto ID = II->second; 2344 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2345 2346 // The value from the original loop to which we are mapping the new induction 2347 // variable. 2348 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2349 2350 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2351 2352 // Generate code for the induction step. 
Note that induction steps are 2353 // required to be loop-invariant 2354 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2355 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2356 "Induction step should be loop invariant"); 2357 if (PSE.getSE()->isSCEVable(IV->getType())) { 2358 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2359 return Exp.expandCodeFor(Step, Step->getType(), 2360 LoopVectorPreHeader->getTerminator()); 2361 } 2362 return cast<SCEVUnknown>(Step)->getValue(); 2363 }; 2364 2365 // The scalar value to broadcast. This is derived from the canonical 2366 // induction variable. If a truncation type is given, truncate the canonical 2367 // induction variable and step. Otherwise, derive these values from the 2368 // induction descriptor. 2369 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2370 Value *ScalarIV = Induction; 2371 if (IV != OldInduction) { 2372 ScalarIV = IV->getType()->isIntegerTy() 2373 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2374 : Builder.CreateCast(Instruction::SIToFP, Induction, 2375 IV->getType()); 2376 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2377 ScalarIV->setName("offset.idx"); 2378 } 2379 if (Trunc) { 2380 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2381 assert(Step->getType()->isIntegerTy() && 2382 "Truncation requires an integer step"); 2383 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2384 Step = Builder.CreateTrunc(Step, TruncType); 2385 } 2386 return ScalarIV; 2387 }; 2388 2389 // Create the vector values from the scalar IV, in the absence of creating a 2390 // vector IV. 2391 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2392 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2393 for (unsigned Part = 0; Part < UF; ++Part) { 2394 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2395 Value *EntryPart = 2396 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2397 ID.getInductionOpcode()); 2398 State.set(Def, EntryPart, Part); 2399 if (Trunc) 2400 addMetadata(EntryPart, Trunc); 2401 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2402 State, Part); 2403 } 2404 }; 2405 2406 // Fast-math-flags propagate from the original induction instruction. 2407 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2408 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2409 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2410 2411 // Now do the actual transformations, and start with creating the step value. 2412 Value *Step = CreateStepValue(ID.getStep()); 2413 if (VF.isZero() || VF.isScalar()) { 2414 Value *ScalarIV = CreateScalarIV(Step); 2415 CreateSplatIV(ScalarIV, Step); 2416 return; 2417 } 2418 2419 // Determine if we want a scalar version of the induction variable. This is 2420 // true if the induction variable itself is not widened, or if it has at 2421 // least one user in the loop that is not widened. 2422 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2423 if (!NeedsScalarIV) { 2424 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2425 State); 2426 return; 2427 } 2428 2429 // Try to create a new independent vector induction variable. If we can't 2430 // create the phi node, we will splat the scalar induction variable in each 2431 // loop iteration. 
2432 if (!shouldScalarizeInstruction(EntryVal)) { 2433 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2434 State); 2435 Value *ScalarIV = CreateScalarIV(Step); 2436 // Create scalar steps that can be used by instructions we will later 2437 // scalarize. Note that the addition of the scalar steps will not increase 2438 // the number of instructions in the loop in the common case prior to 2439 // InstCombine. We will be trading one vector extract for each scalar step. 2440 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2441 return; 2442 } 2443 2444 // All IV users are scalar instructions, so only emit a scalar IV, not a 2445 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2446 // predicate used by the masked loads/stores. 2447 Value *ScalarIV = CreateScalarIV(Step); 2448 if (!Cost->isScalarEpilogueAllowed()) 2449 CreateSplatIV(ScalarIV, Step); 2450 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2451 } 2452 2453 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2454 Instruction::BinaryOps BinOp) { 2455 // Create and check the types. 2456 assert(isa<FixedVectorType>(Val->getType()) && 2457 "Creation of scalable step vector not yet supported"); 2458 auto *ValVTy = cast<VectorType>(Val->getType()); 2459 ElementCount VLen = ValVTy->getElementCount(); 2460 2461 Type *STy = Val->getType()->getScalarType(); 2462 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2463 "Induction Step must be an integer or FP"); 2464 assert(Step->getType() == STy && "Step has wrong type"); 2465 2466 SmallVector<Constant *, 8> Indices; 2467 2468 // Create a vector of consecutive numbers from zero to VF. 2469 VectorType *InitVecValVTy = ValVTy; 2470 Type *InitVecValSTy = STy; 2471 if (STy->isFloatingPointTy()) { 2472 InitVecValSTy = 2473 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2474 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2475 } 2476 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2477 2478 // Add on StartIdx 2479 Value *StartIdxSplat = Builder.CreateVectorSplat( 2480 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2481 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2482 2483 if (STy->isIntegerTy()) { 2484 Step = Builder.CreateVectorSplat(VLen, Step); 2485 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2486 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2487 // which can be found from the original scalar operations. 2488 Step = Builder.CreateMul(InitVec, Step); 2489 return Builder.CreateAdd(Val, Step, "induction"); 2490 } 2491 2492 // Floating point induction. 2493 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2494 "Binary Opcode should be specified for FP induction"); 2495 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2496 Step = Builder.CreateVectorSplat(VLen, Step); 2497 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2498 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2499 } 2500 2501 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2502 Instruction *EntryVal, 2503 const InductionDescriptor &ID, 2504 VPValue *Def, VPValue *CastDef, 2505 VPTransformState &State) { 2506 // We shouldn't have to build scalar steps if we aren't vectorizing. 2507 assert(VF.isVector() && "VF should be greater than one"); 2508 // Get the value type and ensure it and the step have the same integer type. 
2509 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2510 assert(ScalarIVTy == Step->getType() && 2511 "Val and Step should have the same type"); 2512 2513 // We build scalar steps for both integer and floating-point induction 2514 // variables. Here, we determine the kind of arithmetic we will perform. 2515 Instruction::BinaryOps AddOp; 2516 Instruction::BinaryOps MulOp; 2517 if (ScalarIVTy->isIntegerTy()) { 2518 AddOp = Instruction::Add; 2519 MulOp = Instruction::Mul; 2520 } else { 2521 AddOp = ID.getInductionOpcode(); 2522 MulOp = Instruction::FMul; 2523 } 2524 2525 // Determine the number of scalars we need to generate for each unroll 2526 // iteration. If EntryVal is uniform, we only need to generate the first 2527 // lane. Otherwise, we generate all VF values. 2528 unsigned Lanes = 2529 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2530 ? 1 2531 : VF.getKnownMinValue(); 2532 assert((!VF.isScalable() || Lanes == 1) && 2533 "Should never scalarize a scalable vector"); 2534 // Compute the scalar steps and save the results in State. 2535 for (unsigned Part = 0; Part < UF; ++Part) { 2536 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2537 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2538 ScalarIVTy->getScalarSizeInBits()); 2539 Value *StartIdx = 2540 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2541 if (ScalarIVTy->isFloatingPointTy()) 2542 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2543 StartIdx = Builder.CreateBinOp( 2544 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2545 // The step returned by `createStepForVF` is a runtime-evaluated value 2546 // when VF is scalable. Otherwise, it should be folded into a Constant. 2547 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2548 "Expected StartIdx to be folded to a constant when VF is not " 2549 "scalable"); 2550 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2551 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2552 State.set(Def, Add, VPIteration(Part, Lane)); 2553 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2554 Part, Lane); 2555 } 2556 } 2557 } 2558 2559 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2560 const VPIteration &Instance, 2561 VPTransformState &State) { 2562 Value *ScalarInst = State.get(Def, Instance); 2563 Value *VectorValue = State.get(Def, Instance.Part); 2564 VectorValue = Builder.CreateInsertElement( 2565 VectorValue, ScalarInst, 2566 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2567 State.set(Def, VectorValue, Instance.Part); 2568 } 2569 2570 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2571 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2572 return Builder.CreateVectorReverse(Vec, "reverse"); 2573 } 2574 2575 // Return whether we allow using masked interleave-groups (for dealing with 2576 // strided loads/stores that reside in predicated blocks, or for dealing 2577 // with gaps). 2578 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2579 // If an override option has been passed in for interleaved accesses, use it. 2580 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2581 return EnableMaskedInterleavedMemAccesses; 2582 2583 return TTI.enableMaskedInterleavedAccessVectorization(); 2584 } 2585 2586 // Try to vectorize the interleave group that \p Instr belongs to. 2587 // 2588 // E.g. 
Translate following interleaved load group (factor = 3): 2589 // for (i = 0; i < N; i+=3) { 2590 // R = Pic[i]; // Member of index 0 2591 // G = Pic[i+1]; // Member of index 1 2592 // B = Pic[i+2]; // Member of index 2 2593 // ... // do something to R, G, B 2594 // } 2595 // To: 2596 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2597 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2598 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2599 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2600 // 2601 // Or translate following interleaved store group (factor = 3): 2602 // for (i = 0; i < N; i+=3) { 2603 // ... do something to R, G, B 2604 // Pic[i] = R; // Member of index 0 2605 // Pic[i+1] = G; // Member of index 1 2606 // Pic[i+2] = B; // Member of index 2 2607 // } 2608 // To: 2609 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2610 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2611 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2612 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2613 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2614 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2615 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2616 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2617 VPValue *BlockInMask) { 2618 Instruction *Instr = Group->getInsertPos(); 2619 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2620 2621 // Prepare for the vector type of the interleaved load/store. 2622 Type *ScalarTy = getMemInstValueType(Instr); 2623 unsigned InterleaveFactor = Group->getFactor(); 2624 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2625 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2626 2627 // Prepare for the new pointers. 2628 SmallVector<Value *, 2> AddrParts; 2629 unsigned Index = Group->getIndex(Instr); 2630 2631 // TODO: extend the masked interleaved-group support to reversed access. 2632 assert((!BlockInMask || !Group->isReverse()) && 2633 "Reversed masked interleave-group not supported."); 2634 2635 // If the group is reverse, adjust the index to refer to the last vector lane 2636 // instead of the first. We adjust the index from the first vector lane, 2637 // rather than directly getting the pointer for lane VF - 1, because the 2638 // pointer operand of the interleaved access is supposed to be uniform. For 2639 // uniform instructions, we're only required to generate a value for the 2640 // first vector lane in each unroll iteration. 2641 assert(!VF.isScalable() && 2642 "scalable vector reverse operation is not implemented"); 2643 if (Group->isReverse()) 2644 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2645 2646 for (unsigned Part = 0; Part < UF; Part++) { 2647 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2648 setDebugLocFromInst(Builder, AddrPart); 2649 2650 // Notice current instruction could be any index. Need to adjust the address 2651 // to the member of index 0. 2652 // 2653 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2654 // b = A[i]; // Member of index 0 2655 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2656 // 2657 // E.g. A[i+1] = a; // Member of index 1 2658 // A[i] = b; // Member of index 0 2659 // A[i+2] = c; // Member of index 2 (Current instruction) 2660 // Current pointer is pointed to A[i+2], adjust it to A[i]. 
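// Concretely (illustrative): with Index == 2, the GEP below applies an offset
// of -2 elements, turning a pointer to A[i+2] into a pointer to A[i].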
2661 2662 bool InBounds = false; 2663 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2664 InBounds = gep->isInBounds(); 2665 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2666 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2667 2668 // Cast to the vector pointer type. 2669 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2670 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2671 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2672 } 2673 2674 setDebugLocFromInst(Builder, Instr); 2675 Value *PoisonVec = PoisonValue::get(VecTy); 2676 2677 Value *MaskForGaps = nullptr; 2678 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2679 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2680 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2681 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2682 } 2683 2684 // Vectorize the interleaved load group. 2685 if (isa<LoadInst>(Instr)) { 2686 // For each unroll part, create a wide load for the group. 2687 SmallVector<Value *, 2> NewLoads; 2688 for (unsigned Part = 0; Part < UF; Part++) { 2689 Instruction *NewLoad; 2690 if (BlockInMask || MaskForGaps) { 2691 assert(useMaskedInterleavedAccesses(*TTI) && 2692 "masked interleaved groups are not allowed."); 2693 Value *GroupMask = MaskForGaps; 2694 if (BlockInMask) { 2695 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2696 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2697 Value *ShuffledMask = Builder.CreateShuffleVector( 2698 BlockInMaskPart, 2699 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2700 "interleaved.mask"); 2701 GroupMask = MaskForGaps 2702 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2703 MaskForGaps) 2704 : ShuffledMask; 2705 } 2706 NewLoad = 2707 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2708 GroupMask, PoisonVec, "wide.masked.vec"); 2709 } 2710 else 2711 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2712 Group->getAlign(), "wide.vec"); 2713 Group->addMetadata(NewLoad); 2714 NewLoads.push_back(NewLoad); 2715 } 2716 2717 // For each member in the group, shuffle out the appropriate data from the 2718 // wide loads. 2719 unsigned J = 0; 2720 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2721 Instruction *Member = Group->getMember(I); 2722 2723 // Skip the gaps in the group. 2724 if (!Member) 2725 continue; 2726 2727 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2728 auto StrideMask = 2729 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2730 for (unsigned Part = 0; Part < UF; Part++) { 2731 Value *StridedVec = Builder.CreateShuffleVector( 2732 NewLoads[Part], StrideMask, "strided.vec"); 2733 2734 // If this member has different type, cast the result type. 2735 if (Member->getType() != ScalarTy) { 2736 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2737 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2738 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2739 } 2740 2741 if (Group->isReverse()) 2742 StridedVec = reverseVector(StridedVec); 2743 2744 State.set(VPDefs[J], StridedVec, Part); 2745 } 2746 ++J; 2747 } 2748 return; 2749 } 2750 2751 // The sub vector type for current instruction. 
2752 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2753 auto *SubVT = VectorType::get(ScalarTy, VF); 2754 2755 // Vectorize the interleaved store group. 2756 for (unsigned Part = 0; Part < UF; Part++) { 2757 // Collect the stored vector from each member. 2758 SmallVector<Value *, 4> StoredVecs; 2759 for (unsigned i = 0; i < InterleaveFactor; i++) { 2760 // Interleaved store group doesn't allow a gap, so each index has a member 2761 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2762 2763 Value *StoredVec = State.get(StoredValues[i], Part); 2764 2765 if (Group->isReverse()) 2766 StoredVec = reverseVector(StoredVec); 2767 2768 // If this member has different type, cast it to a unified type. 2769 2770 if (StoredVec->getType() != SubVT) 2771 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2772 2773 StoredVecs.push_back(StoredVec); 2774 } 2775 2776 // Concatenate all vectors into a wide vector. 2777 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2778 2779 // Interleave the elements in the wide vector. 2780 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2781 Value *IVec = Builder.CreateShuffleVector( 2782 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2783 "interleaved.vec"); 2784 2785 Instruction *NewStoreInstr; 2786 if (BlockInMask) { 2787 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2788 Value *ShuffledMask = Builder.CreateShuffleVector( 2789 BlockInMaskPart, 2790 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2791 "interleaved.mask"); 2792 NewStoreInstr = Builder.CreateMaskedStore( 2793 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2794 } 2795 else 2796 NewStoreInstr = 2797 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2798 2799 Group->addMetadata(NewStoreInstr); 2800 } 2801 } 2802 2803 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2804 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2805 VPValue *StoredValue, VPValue *BlockInMask) { 2806 // Attempt to issue a wide load. 2807 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2808 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2809 2810 assert((LI || SI) && "Invalid Load/Store instruction"); 2811 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2812 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2813 2814 LoopVectorizationCostModel::InstWidening Decision = 2815 Cost->getWideningDecision(Instr, VF); 2816 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2817 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2818 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2819 "CM decision is not to widen the memory instruction"); 2820 2821 Type *ScalarDataTy = getMemInstValueType(Instr); 2822 2823 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2824 const Align Alignment = getLoadStoreAlignment(Instr); 2825 2826 // Determine if the pointer operand of the access is either consecutive or 2827 // reverse consecutive. 2828 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2829 bool ConsecutiveStride = 2830 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2831 bool CreateGatherScatter = 2832 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2833 2834 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2835 // gather/scatter. Otherwise Decision should have been to Scalarize. 
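  // For illustration only (shorthand, VF = 4 assumed): a unit-stride access
  // like 'a[i]' becomes a single 'load <4 x i32>' per part, a reverse access
  // like 'a[n - i]' is widened and then reversed with a shuffle, and a
  // non-consecutive access like 'a[b[i]]' becomes an 'llvm.masked.gather' over
  // a vector of pointers.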
2836 assert((ConsecutiveStride || CreateGatherScatter) && 2837 "The instruction should be scalarized"); 2838 (void)ConsecutiveStride; 2839 2840 VectorParts BlockInMaskParts(UF); 2841 bool isMaskRequired = BlockInMask; 2842 if (isMaskRequired) 2843 for (unsigned Part = 0; Part < UF; ++Part) 2844 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2845 2846 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2847 // Calculate the pointer for the specific unroll-part. 2848 GetElementPtrInst *PartPtr = nullptr; 2849 2850 bool InBounds = false; 2851 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2852 InBounds = gep->isInBounds(); 2853 if (Reverse) { 2854 // If the address is consecutive but reversed, then the 2855 // wide store needs to start at the last vector element. 2856 // RunTimeVF = VScale * VF.getKnownMinValue() 2857 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2858 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2859 // NumElt = -Part * RunTimeVF 2860 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2861 // LastLane = 1 - RunTimeVF 2862 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2863 PartPtr = 2864 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2865 PartPtr->setIsInBounds(InBounds); 2866 PartPtr = cast<GetElementPtrInst>( 2867 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2868 PartPtr->setIsInBounds(InBounds); 2869 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2870 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2871 } else { 2872 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2873 PartPtr = cast<GetElementPtrInst>( 2874 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2875 PartPtr->setIsInBounds(InBounds); 2876 } 2877 2878 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2879 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2880 }; 2881 2882 // Handle Stores: 2883 if (SI) { 2884 setDebugLocFromInst(Builder, SI); 2885 2886 for (unsigned Part = 0; Part < UF; ++Part) { 2887 Instruction *NewSI = nullptr; 2888 Value *StoredVal = State.get(StoredValue, Part); 2889 if (CreateGatherScatter) { 2890 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2891 Value *VectorGep = State.get(Addr, Part); 2892 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2893 MaskPart); 2894 } else { 2895 if (Reverse) { 2896 // If we store to reverse consecutive memory locations, then we need 2897 // to reverse the order of elements in the stored value. 2898 StoredVal = reverseVector(StoredVal); 2899 // We don't want to update the value in the map as it might be used in 2900 // another expression. So don't call resetVectorValue(StoredVal). 2901 } 2902 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2903 if (isMaskRequired) 2904 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2905 BlockInMaskParts[Part]); 2906 else 2907 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2908 } 2909 addMetadata(NewSI, SI); 2910 } 2911 return; 2912 } 2913 2914 // Handle loads. 2915 assert(LI && "Must have a load instruction"); 2916 setDebugLocFromInst(Builder, LI); 2917 for (unsigned Part = 0; Part < UF; ++Part) { 2918 Value *NewLI; 2919 if (CreateGatherScatter) { 2920 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2921 Value *VectorGep = State.get(Addr, Part); 2922 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2923 nullptr, "wide.masked.gather"); 2924 addMetadata(NewLI, LI); 2925 } else { 2926 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2927 if (isMaskRequired) 2928 NewLI = Builder.CreateMaskedLoad( 2929 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2930 "wide.masked.load"); 2931 else 2932 NewLI = 2933 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2934 2935 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2936 addMetadata(NewLI, LI); 2937 if (Reverse) 2938 NewLI = reverseVector(NewLI); 2939 } 2940 2941 State.set(Def, NewLI, Part); 2942 } 2943 } 2944 2945 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 2946 VPUser &User, 2947 const VPIteration &Instance, 2948 bool IfPredicateInstr, 2949 VPTransformState &State) { 2950 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2951 2952 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2953 // the first lane and part. 2954 if (isa<NoAliasScopeDeclInst>(Instr)) 2955 if (!Instance.isFirstIteration()) 2956 return; 2957 2958 setDebugLocFromInst(Builder, Instr); 2959 2960 // Does this instruction return a value ? 2961 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2962 2963 Instruction *Cloned = Instr->clone(); 2964 if (!IsVoidRetTy) 2965 Cloned->setName(Instr->getName() + ".cloned"); 2966 2967 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2968 Builder.GetInsertPoint()); 2969 // Replace the operands of the cloned instructions with their scalar 2970 // equivalents in the new loop. 2971 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2972 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2973 auto InputInstance = Instance; 2974 if (!Operand || !OrigLoop->contains(Operand) || 2975 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2976 InputInstance.Lane = VPLane::getFirstLane(); 2977 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2978 Cloned->setOperand(op, NewOp); 2979 } 2980 addNewMetadata(Cloned, Instr); 2981 2982 // Place the cloned scalar in the new loop. 2983 Builder.Insert(Cloned); 2984 2985 State.set(Def, Cloned, Instance); 2986 2987 // If we just cloned a new assumption, add it the assumption cache. 2988 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2989 if (II->getIntrinsicID() == Intrinsic::assume) 2990 AC->registerAssumption(II); 2991 2992 // End if-block. 2993 if (IfPredicateInstr) 2994 PredicatedInstructions.push_back(Cloned); 2995 } 2996 2997 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2998 Value *End, Value *Step, 2999 Instruction *DL) { 3000 BasicBlock *Header = L->getHeader(); 3001 BasicBlock *Latch = L->getLoopLatch(); 3002 // As we're just creating this loop, it's possible no latch exists 3003 // yet. If so, use the header as this will be a single block loop. 3004 if (!Latch) 3005 Latch = Header; 3006 3007 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3008 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3009 setDebugLocFromInst(Builder, OldInst); 3010 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3011 3012 Builder.SetInsertPoint(Latch->getTerminator()); 3013 setDebugLocFromInst(Builder, OldInst); 3014 3015 // Create i+1 and fill the PHINode. 
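  // Illustrative shorthand of the IR built here (integer width assumed):
  //   %index      = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
  //   %index.next = add i64 %index, %step
  //   %cmp        = icmp eq i64 %index.next, %end
  //   br i1 %cmp, label %exit, label %header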
3016 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3017 Induction->addIncoming(Start, L->getLoopPreheader()); 3018 Induction->addIncoming(Next, Latch); 3019 // Create the compare. 3020 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3021 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3022 3023 // Now we have two terminators. Remove the old one from the block. 3024 Latch->getTerminator()->eraseFromParent(); 3025 3026 return Induction; 3027 } 3028 3029 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3030 if (TripCount) 3031 return TripCount; 3032 3033 assert(L && "Create Trip Count for null loop."); 3034 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3035 // Find the loop boundaries. 3036 ScalarEvolution *SE = PSE.getSE(); 3037 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3038 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3039 "Invalid loop count"); 3040 3041 Type *IdxTy = Legal->getWidestInductionType(); 3042 assert(IdxTy && "No type for induction"); 3043 3044 // The exit count might have the type of i64 while the phi is i32. This can 3045 // happen if we have an induction variable that is sign extended before the 3046 // compare. The only way that we get a backedge taken count is that the 3047 // induction variable was signed and as such will not overflow. In such a case 3048 // truncation is legal. 3049 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3050 IdxTy->getPrimitiveSizeInBits()) 3051 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3052 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3053 3054 // Get the total trip count from the count by adding 1. 3055 const SCEV *ExitCount = SE->getAddExpr( 3056 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3057 3058 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3059 3060 // Expand the trip count and place the new instructions in the preheader. 3061 // Notice that the pre-header does not change, only the loop body. 3062 SCEVExpander Exp(*SE, DL, "induction"); 3063 3064 // Count holds the overall loop count (N). 3065 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3066 L->getLoopPreheader()->getTerminator()); 3067 3068 if (TripCount->getType()->isPointerTy()) 3069 TripCount = 3070 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3071 L->getLoopPreheader()->getTerminator()); 3072 3073 return TripCount; 3074 } 3075 3076 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3077 if (VectorTripCount) 3078 return VectorTripCount; 3079 3080 Value *TC = getOrCreateTripCount(L); 3081 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3082 3083 Type *Ty = TC->getType(); 3084 // This is where we can make the step a runtime constant. 3085 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3086 3087 // If the tail is to be folded by masking, round the number of iterations N 3088 // up to a multiple of Step instead of rounding down. This is done by first 3089 // adding Step-1 and then rounding down. Note that it's ok if this addition 3090 // overflows: the vector induction variable will eventually wrap to zero given 3091 // that it starts at zero and its Step is a power of two; the loop will then 3092 // exit, with the last early-exit vector comparison also producing all-true. 
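  // Worked example with assumed numbers: for VF * UF = 8 and N = 13, TC becomes
  // 13 + 7 = 20, n.mod.vf = 20 % 8 = 4 and n.vec = 16, so two masked vector
  // iterations cover all 13 scalar iterations.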
3093 if (Cost->foldTailByMasking()) { 3094 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3095 "VF*UF must be a power of 2 when folding tail by masking"); 3096 assert(!VF.isScalable() && 3097 "Tail folding not yet supported for scalable vectors"); 3098 TC = Builder.CreateAdd( 3099 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3100 } 3101 3102 // Now we need to generate the expression for the part of the loop that the 3103 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3104 // iterations are not required for correctness, or N - Step, otherwise. Step 3105 // is equal to the vectorization factor (number of SIMD elements) times the 3106 // unroll factor (number of SIMD instructions). 3107 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3108 3109 // There are two cases where we need to ensure (at least) the last iteration 3110 // runs in the scalar remainder loop. Thus, if the step evenly divides 3111 // the trip count, we set the remainder to be equal to the step. If the step 3112 // does not evenly divide the trip count, no adjustment is necessary since 3113 // there will already be scalar iterations. Note that the minimum iterations 3114 // check ensures that N >= Step. The cases are: 3115 // 1) If there is a non-reversed interleaved group that may speculatively 3116 // access memory out-of-bounds. 3117 // 2) If any instruction may follow a conditionally taken exit. That is, if 3118 // the loop contains multiple exiting blocks, or a single exiting block 3119 // which is not the latch. 3120 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3121 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3122 R = Builder.CreateSelect(IsZero, Step, R); 3123 } 3124 3125 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3126 3127 return VectorTripCount; 3128 } 3129 3130 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3131 const DataLayout &DL) { 3132 // Verify that V is a vector type with same number of elements as DstVTy. 3133 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3134 unsigned VF = DstFVTy->getNumElements(); 3135 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3136 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3137 Type *SrcElemTy = SrcVecTy->getElementType(); 3138 Type *DstElemTy = DstFVTy->getElementType(); 3139 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3140 "Vector elements must have same size"); 3141 3142 // Do a direct cast if element types are castable. 3143 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3144 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3145 } 3146 // V cannot be directly casted to desired vector type. 3147 // May happen when V is a floating point vector but DstVTy is a vector of 3148 // pointers or vice-versa. Handle this using a two-step bitcast using an 3149 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
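  // Shorthand illustration (element types assumed, 64-bit pointers): casting
  // <2 x double> to <2 x i8*> goes through <2 x i64>:
  //   %tmp = bitcast <2 x double> %v to <2 x i64>
  //   %res = inttoptr <2 x i64> %tmp to <2 x i8*>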
3150 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3151 "Only one type should be a pointer type"); 3152 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3153 "Only one type should be a floating point type"); 3154 Type *IntTy = 3155 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3156 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3157 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3158 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3159 } 3160 3161 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3162 BasicBlock *Bypass) { 3163 Value *Count = getOrCreateTripCount(L); 3164 // Reuse existing vector loop preheader for TC checks. 3165 // Note that new preheader block is generated for vector loop. 3166 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3167 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3168 3169 // Generate code to check if the loop's trip count is less than VF * UF, or 3170 // equal to it in case a scalar epilogue is required; this implies that the 3171 // vector trip count is zero. This check also covers the case where adding one 3172 // to the backedge-taken count overflowed leading to an incorrect trip count 3173 // of zero. In this case we will also jump to the scalar loop. 3174 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3175 : ICmpInst::ICMP_ULT; 3176 3177 // If tail is to be folded, vector loop takes care of all iterations. 3178 Value *CheckMinIters = Builder.getFalse(); 3179 if (!Cost->foldTailByMasking()) { 3180 Value *Step = 3181 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3182 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3183 } 3184 // Create new preheader for vector loop. 3185 LoopVectorPreHeader = 3186 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3187 "vector.ph"); 3188 3189 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3190 DT->getNode(Bypass)->getIDom()) && 3191 "TC check is expected to dominate Bypass"); 3192 3193 // Update dominator for Bypass & LoopExit. 3194 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3195 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3196 3197 ReplaceInstWithInst( 3198 TCCheckBlock->getTerminator(), 3199 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3200 LoopBypassBlocks.push_back(TCCheckBlock); 3201 } 3202 3203 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3204 3205 BasicBlock *const SCEVCheckBlock = 3206 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3207 if (!SCEVCheckBlock) 3208 return nullptr; 3209 3210 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3211 (OptForSizeBasedOnProfile && 3212 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3213 "Cannot SCEV check stride or overflow when optimizing for size"); 3214 3215 3216 // Update dominator only if this is first RT check. 3217 if (LoopBypassBlocks.empty()) { 3218 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3219 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3220 } 3221 3222 LoopBypassBlocks.push_back(SCEVCheckBlock); 3223 AddedSafetyChecks = true; 3224 return SCEVCheckBlock; 3225 } 3226 3227 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3228 BasicBlock *Bypass) { 3229 // VPlan-native path does not do any analysis for runtime checks currently. 
3230 if (EnableVPlanNativePath) 3231 return nullptr; 3232 3233 BasicBlock *const MemCheckBlock = 3234 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3235 3236 // Check if we generated code that checks in runtime if arrays overlap. We put 3237 // the checks into a separate block to make the more common case of few 3238 // elements faster. 3239 if (!MemCheckBlock) 3240 return nullptr; 3241 3242 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3243 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3244 "Cannot emit memory checks when optimizing for size, unless forced " 3245 "to vectorize."); 3246 ORE->emit([&]() { 3247 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3248 L->getStartLoc(), L->getHeader()) 3249 << "Code-size may be reduced by not forcing " 3250 "vectorization, or by source-code modifications " 3251 "eliminating the need for runtime checks " 3252 "(e.g., adding 'restrict')."; 3253 }); 3254 } 3255 3256 LoopBypassBlocks.push_back(MemCheckBlock); 3257 3258 AddedSafetyChecks = true; 3259 3260 // We currently don't use LoopVersioning for the actual loop cloning but we 3261 // still use it to add the noalias metadata. 3262 LVer = std::make_unique<LoopVersioning>( 3263 *Legal->getLAI(), 3264 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3265 DT, PSE.getSE()); 3266 LVer->prepareNoAliasMetadata(); 3267 return MemCheckBlock; 3268 } 3269 3270 Value *InnerLoopVectorizer::emitTransformedIndex( 3271 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3272 const InductionDescriptor &ID) const { 3273 3274 SCEVExpander Exp(*SE, DL, "induction"); 3275 auto Step = ID.getStep(); 3276 auto StartValue = ID.getStartValue(); 3277 assert(Index->getType() == Step->getType() && 3278 "Index type does not match StepValue type"); 3279 3280 // Note: the IR at this point is broken. We cannot use SE to create any new 3281 // SCEV and then expand it, hoping that SCEV's simplification will give us 3282 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3283 // lead to various SCEV crashes. So all we can do is to use builder and rely 3284 // on InstCombine for future simplifications. Here we handle some trivial 3285 // cases only. 3286 auto CreateAdd = [&B](Value *X, Value *Y) { 3287 assert(X->getType() == Y->getType() && "Types don't match!"); 3288 if (auto *CX = dyn_cast<ConstantInt>(X)) 3289 if (CX->isZero()) 3290 return Y; 3291 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3292 if (CY->isZero()) 3293 return X; 3294 return B.CreateAdd(X, Y); 3295 }; 3296 3297 auto CreateMul = [&B](Value *X, Value *Y) { 3298 assert(X->getType() == Y->getType() && "Types don't match!"); 3299 if (auto *CX = dyn_cast<ConstantInt>(X)) 3300 if (CX->isOne()) 3301 return Y; 3302 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3303 if (CY->isOne()) 3304 return X; 3305 return B.CreateMul(X, Y); 3306 }; 3307 3308 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3309 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3310 // the DomTree is not kept up-to-date for additional blocks generated in the 3311 // vector loop. By using the header as insertion point, we guarantee that the 3312 // expanded instructions dominate all their uses. 
3313 auto GetInsertPoint = [this, &B]() { 3314 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3315 if (InsertBB != LoopVectorBody && 3316 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3317 return LoopVectorBody->getTerminator(); 3318 return &*B.GetInsertPoint(); 3319 }; 3320 3321 switch (ID.getKind()) { 3322 case InductionDescriptor::IK_IntInduction: { 3323 assert(Index->getType() == StartValue->getType() && 3324 "Index type does not match StartValue type"); 3325 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3326 return B.CreateSub(StartValue, Index); 3327 auto *Offset = CreateMul( 3328 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3329 return CreateAdd(StartValue, Offset); 3330 } 3331 case InductionDescriptor::IK_PtrInduction: { 3332 assert(isa<SCEVConstant>(Step) && 3333 "Expected constant step for pointer induction"); 3334 return B.CreateGEP( 3335 StartValue->getType()->getPointerElementType(), StartValue, 3336 CreateMul(Index, 3337 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3338 } 3339 case InductionDescriptor::IK_FpInduction: { 3340 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3341 auto InductionBinOp = ID.getInductionBinOp(); 3342 assert(InductionBinOp && 3343 (InductionBinOp->getOpcode() == Instruction::FAdd || 3344 InductionBinOp->getOpcode() == Instruction::FSub) && 3345 "Original bin op should be defined for FP induction"); 3346 3347 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3348 Value *MulExp = B.CreateFMul(StepValue, Index); 3349 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3350 "induction"); 3351 } 3352 case InductionDescriptor::IK_NoInduction: 3353 return nullptr; 3354 } 3355 llvm_unreachable("invalid enum"); 3356 } 3357 3358 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3359 LoopScalarBody = OrigLoop->getHeader(); 3360 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3361 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3362 assert(LoopExitBlock && "Must have an exit block"); 3363 assert(LoopVectorPreHeader && "Invalid loop structure"); 3364 3365 LoopMiddleBlock = 3366 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3367 LI, nullptr, Twine(Prefix) + "middle.block"); 3368 LoopScalarPreHeader = 3369 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3370 nullptr, Twine(Prefix) + "scalar.ph"); 3371 3372 // Set up branch from middle block to the exit and scalar preheader blocks. 3373 // completeLoopSkeleton will update the condition to use an iteration check, 3374 // if required to decide whether to execute the remainder. 3375 BranchInst *BrInst = 3376 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3377 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3378 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3379 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3380 3381 // We intentionally don't let SplitBlock to update LoopInfo since 3382 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3383 // LoopVectorBody is explicitly added to the correct place few lines later. 3384 LoopVectorBody = 3385 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3386 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3387 3388 // Update dominator for loop exit. 
3389 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3390 3391 // Create and register the new vector loop. 3392 Loop *Lp = LI->AllocateLoop(); 3393 Loop *ParentLoop = OrigLoop->getParentLoop(); 3394 3395 // Insert the new loop into the loop nest and register the new basic blocks 3396 // before calling any utilities such as SCEV that require valid LoopInfo. 3397 if (ParentLoop) { 3398 ParentLoop->addChildLoop(Lp); 3399 } else { 3400 LI->addTopLevelLoop(Lp); 3401 } 3402 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3403 return Lp; 3404 } 3405 3406 void InnerLoopVectorizer::createInductionResumeValues( 3407 Loop *L, Value *VectorTripCount, 3408 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3409 assert(VectorTripCount && L && "Expected valid arguments"); 3410 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3411 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3412 "Inconsistent information about additional bypass."); 3413 // We are going to resume the execution of the scalar loop. 3414 // Go over all of the induction variables that we found and fix the 3415 // PHIs that are left in the scalar version of the loop. 3416 // The starting values of PHI nodes depend on the counter of the last 3417 // iteration in the vectorized loop. 3418 // If we come from a bypass edge then we need to start from the original 3419 // start value. 3420 for (auto &InductionEntry : Legal->getInductionVars()) { 3421 PHINode *OrigPhi = InductionEntry.first; 3422 InductionDescriptor II = InductionEntry.second; 3423 3424 // Create phi nodes to merge from the backedge-taken check block. 3425 PHINode *BCResumeVal = 3426 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3427 LoopScalarPreHeader->getTerminator()); 3428 // Copy original phi DL over to the new one. 3429 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3430 Value *&EndValue = IVEndValues[OrigPhi]; 3431 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3432 if (OrigPhi == OldInduction) { 3433 // We know what the end value is. 3434 EndValue = VectorTripCount; 3435 } else { 3436 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3437 3438 // Fast-math-flags propagate from the original induction instruction. 3439 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3440 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3441 3442 Type *StepType = II.getStep()->getType(); 3443 Instruction::CastOps CastOp = 3444 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3445 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3446 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3447 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3448 EndValue->setName("ind.end"); 3449 3450 // Compute the end value for the additional bypass (if applicable). 3451 if (AdditionalBypass.first) { 3452 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3453 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3454 StepType, true); 3455 CRD = 3456 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3457 EndValueFromAdditionalBypass = 3458 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3459 EndValueFromAdditionalBypass->setName("ind.end"); 3460 } 3461 } 3462 // The new PHI merges the original incoming value, in case of a bypass, 3463 // or the value at the end of the vectorized loop. 
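    // For illustration only (shorthand, bypass block names assumed):
    //   %bc.resume.val = phi [ %ind.end, %middle.block ],
    //                        [ %start,   %vector.scevcheck ],
    //                        [ %start,   %vector.memcheck ]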
3464 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3465 3466 // Fix the scalar body counter (PHI node). 3467 // The old induction's phi node in the scalar body needs the truncated 3468 // value. 3469 for (BasicBlock *BB : LoopBypassBlocks) 3470 BCResumeVal->addIncoming(II.getStartValue(), BB); 3471 3472 if (AdditionalBypass.first) 3473 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3474 EndValueFromAdditionalBypass); 3475 3476 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3477 } 3478 } 3479 3480 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3481 MDNode *OrigLoopID) { 3482 assert(L && "Expected valid loop."); 3483 3484 // The trip counts should be cached by now. 3485 Value *Count = getOrCreateTripCount(L); 3486 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3487 3488 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3489 3490 // Add a check in the middle block to see if we have completed 3491 // all of the iterations in the first vector loop. 3492 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3493 // If tail is to be folded, we know we don't need to run the remainder. 3494 if (!Cost->foldTailByMasking()) { 3495 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3496 Count, VectorTripCount, "cmp.n", 3497 LoopMiddleBlock->getTerminator()); 3498 3499 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3500 // of the corresponding compare because they may have ended up with 3501 // different line numbers and we want to avoid awkward line stepping while 3502 // debugging. Eg. if the compare has got a line number inside the loop. 3503 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3504 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3505 } 3506 3507 // Get ready to start creating new instructions into the vectorized body. 3508 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3509 "Inconsistent vector loop preheader"); 3510 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3511 3512 Optional<MDNode *> VectorizedLoopID = 3513 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3514 LLVMLoopVectorizeFollowupVectorized}); 3515 if (VectorizedLoopID.hasValue()) { 3516 L->setLoopID(VectorizedLoopID.getValue()); 3517 3518 // Do not setAlreadyVectorized if loop attributes have been defined 3519 // explicitly. 3520 return LoopVectorPreHeader; 3521 } 3522 3523 // Keep all loop hints from the original loop on the vector loop (we'll 3524 // replace the vectorizer-specific hints below). 3525 if (MDNode *LID = OrigLoop->getLoopID()) 3526 L->setLoopID(LID); 3527 3528 LoopVectorizeHints Hints(L, true, *ORE); 3529 Hints.setAlreadyVectorized(); 3530 3531 #ifdef EXPENSIVE_CHECKS 3532 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3533 LI->verify(*DT); 3534 #endif 3535 3536 return LoopVectorPreHeader; 3537 } 3538 3539 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3540 /* 3541 In this function we generate a new loop. The new loop will contain 3542 the vectorized instructions while the old loop will continue to run the 3543 scalar remainder. 3544 3545 [ ] <-- loop iteration number check. 3546 / | 3547 / v 3548 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3549 | / | 3550 | / v 3551 || [ ] <-- vector pre header. 3552 |/ | 3553 | v 3554 | [ ] \ 3555 | [ ]_| <-- vector loop. 3556 | | 3557 | v 3558 | -[ ] <--- middle-block. 
3559 | / | 3560 | / v 3561 -|- >[ ] <--- new preheader. 3562 | | 3563 | v 3564 | [ ] \ 3565 | [ ]_| <-- old scalar loop to handle remainder. 3566 \ | 3567 \ v 3568 >[ ] <-- exit block. 3569 ... 3570 */ 3571 3572 // Get the metadata of the original loop before it gets modified. 3573 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3574 3575 // Create an empty vector loop, and prepare basic blocks for the runtime 3576 // checks. 3577 Loop *Lp = createVectorLoopSkeleton(""); 3578 3579 // Now, compare the new count to zero. If it is zero skip the vector loop and 3580 // jump to the scalar loop. This check also covers the case where the 3581 // backedge-taken count is uint##_max: adding one to it will overflow leading 3582 // to an incorrect trip count of zero. In this (rare) case we will also jump 3583 // to the scalar loop. 3584 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3585 3586 // Generate the code to check any assumptions that we've made for SCEV 3587 // expressions. 3588 emitSCEVChecks(Lp, LoopScalarPreHeader); 3589 3590 // Generate the code that checks in runtime if arrays overlap. We put the 3591 // checks into a separate block to make the more common case of few elements 3592 // faster. 3593 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3594 3595 // Some loops have a single integer induction variable, while other loops 3596 // don't. One example is c++ iterators that often have multiple pointer 3597 // induction variables. In the code below we also support a case where we 3598 // don't have a single induction variable. 3599 // 3600 // We try to obtain an induction variable from the original loop as hard 3601 // as possible. However if we don't find one that: 3602 // - is an integer 3603 // - counts from zero, stepping by one 3604 // - is the size of the widest induction variable type 3605 // then we create a new one. 3606 OldInduction = Legal->getPrimaryInduction(); 3607 Type *IdxTy = Legal->getWidestInductionType(); 3608 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3609 // The loop step is equal to the vectorization factor (num of SIMD elements) 3610 // times the unroll factor (num of SIMD instructions). 3611 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3612 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3613 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3614 Induction = 3615 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3616 getDebugLocFromInstOrOperands(OldInduction)); 3617 3618 // Emit phis for the new starting index of the scalar loop. 3619 createInductionResumeValues(Lp, CountRoundDown); 3620 3621 return completeLoopSkeleton(Lp, OrigLoopID); 3622 } 3623 3624 // Fix up external users of the induction variable. At this point, we are 3625 // in LCSSA form, with all external PHIs that use the IV having one input value, 3626 // coming from the remainder loop. We need those PHIs to also have a correct 3627 // value for the IV when arriving directly from the middle block. 3628 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3629 const InductionDescriptor &II, 3630 Value *CountRoundDown, Value *EndValue, 3631 BasicBlock *MiddleBlock) { 3632 // There are two kinds of external IV usages - those that use the value 3633 // computed in the last iteration (the PHI) and those that use the penultimate 3634 // value (the value that feeds into the phi from the loop latch). 3635 // We allow both, but they, obviously, have different values. 
3636
3637   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3638
3639   DenseMap<Value *, Value *> MissingVals;
3640
3641   // An external user of the last iteration's value should see the value that
3642   // the remainder loop uses to initialize its own IV.
3643   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3644   for (User *U : PostInc->users()) {
3645     Instruction *UI = cast<Instruction>(U);
3646     if (!OrigLoop->contains(UI)) {
3647       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3648       MissingVals[UI] = EndValue;
3649     }
3650   }
3651
3652   // An external user of the penultimate value needs to see EndValue - Step.
3653   // The simplest way to get this is to recompute it from the constituent SCEVs,
3654   // that is Start + (Step * (CRD - 1)).
3655   for (User *U : OrigPhi->users()) {
3656     auto *UI = cast<Instruction>(U);
3657     if (!OrigLoop->contains(UI)) {
3658       const DataLayout &DL =
3659           OrigLoop->getHeader()->getModule()->getDataLayout();
3660       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3661
3662       IRBuilder<> B(MiddleBlock->getTerminator());
3663
3664       // Fast-math-flags propagate from the original induction instruction.
3665       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3666         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3667
3668       Value *CountMinusOne = B.CreateSub(
3669           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3670       Value *CMO =
3671           !II.getStep()->getType()->isIntegerTy()
3672               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3673                              II.getStep()->getType())
3674               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3675       CMO->setName("cast.cmo");
3676       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3677       Escape->setName("ind.escape");
3678       MissingVals[UI] = Escape;
3679     }
3680   }
3681
3682   for (auto &I : MissingVals) {
3683     PHINode *PHI = cast<PHINode>(I.first);
3684     // One corner case we have to handle is two IVs "chasing" each other,
3685     // that is %IV2 = phi [...], [ %IV1, %latch ]
3686     // In this case, if IV1 has an external use, we need to avoid adding both
3687     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3688     // don't already have an incoming value for the middle block.
3689     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3690       PHI->addIncoming(I.second, MiddleBlock);
3691   }
3692 }
3693
3694 namespace {
3695
3696 struct CSEDenseMapInfo {
3697   static bool canHandle(const Instruction *I) {
3698     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3699            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3700   }
3701
3702   static inline Instruction *getEmptyKey() {
3703     return DenseMapInfo<Instruction *>::getEmptyKey();
3704   }
3705
3706   static inline Instruction *getTombstoneKey() {
3707     return DenseMapInfo<Instruction *>::getTombstoneKey();
3708   }
3709
3710   static unsigned getHashValue(const Instruction *I) {
3711     assert(canHandle(I) && "Unknown instruction!");
3712     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3713                                                            I->value_op_end()));
3714   }
3715
3716   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3717     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3718         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3719       return LHS == RHS;
3720     return LHS->isIdenticalTo(RHS);
3721   }
3722 };
3723
3724 } // end anonymous namespace
3725
3726 /// Perform CSE of induction variable instructions.
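/// (Illustrative: two identical getelementptr or shufflevector instructions in
/// the same block collapse to one, with uses of the later copy redirected to
/// the earlier one.)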
3727 static void cse(BasicBlock *BB) { 3728 // Perform simple cse. 3729 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3730 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3731 Instruction *In = &*I++; 3732 3733 if (!CSEDenseMapInfo::canHandle(In)) 3734 continue; 3735 3736 // Check if we can replace this instruction with any of the 3737 // visited instructions. 3738 if (Instruction *V = CSEMap.lookup(In)) { 3739 In->replaceAllUsesWith(V); 3740 In->eraseFromParent(); 3741 continue; 3742 } 3743 3744 CSEMap[In] = In; 3745 } 3746 } 3747 3748 InstructionCost 3749 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3750 bool &NeedToScalarize) { 3751 Function *F = CI->getCalledFunction(); 3752 Type *ScalarRetTy = CI->getType(); 3753 SmallVector<Type *, 4> Tys, ScalarTys; 3754 for (auto &ArgOp : CI->arg_operands()) 3755 ScalarTys.push_back(ArgOp->getType()); 3756 3757 // Estimate cost of scalarized vector call. The source operands are assumed 3758 // to be vectors, so we need to extract individual elements from there, 3759 // execute VF scalar calls, and then gather the result into the vector return 3760 // value. 3761 InstructionCost ScalarCallCost = 3762 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3763 if (VF.isScalar()) 3764 return ScalarCallCost; 3765 3766 // Compute corresponding vector type for return value and arguments. 3767 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3768 for (Type *ScalarTy : ScalarTys) 3769 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3770 3771 // Compute costs of unpacking argument values for the scalar calls and 3772 // packing the return values to a vector. 3773 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3774 3775 InstructionCost Cost = 3776 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3777 3778 // If we can't emit a vector call for this function, then the currently found 3779 // cost is the cost we need to return. 3780 NeedToScalarize = true; 3781 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3782 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3783 3784 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3785 return Cost; 3786 3787 // If the corresponding vector cost is cheaper, return its cost. 
3788 InstructionCost VectorCallCost = 3789 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3790 if (VectorCallCost < Cost) { 3791 NeedToScalarize = false; 3792 Cost = VectorCallCost; 3793 } 3794 return Cost; 3795 } 3796 3797 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3798 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3799 return Elt; 3800 return VectorType::get(Elt, VF); 3801 } 3802 3803 InstructionCost 3804 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3805 ElementCount VF) { 3806 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3807 assert(ID && "Expected intrinsic call!"); 3808 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3809 FastMathFlags FMF; 3810 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3811 FMF = FPMO->getFastMathFlags(); 3812 3813 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3814 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3815 SmallVector<Type *> ParamTys; 3816 std::transform(FTy->param_begin(), FTy->param_end(), 3817 std::back_inserter(ParamTys), 3818 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3819 3820 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3821 dyn_cast<IntrinsicInst>(CI)); 3822 return TTI.getIntrinsicInstrCost(CostAttrs, 3823 TargetTransformInfo::TCK_RecipThroughput); 3824 } 3825 3826 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3827 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3828 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3829 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3830 } 3831 3832 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3833 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3834 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3835 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3836 } 3837 3838 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3839 // For every instruction `I` in MinBWs, truncate the operands, create a 3840 // truncated version of `I` and reextend its result. InstCombine runs 3841 // later and will remove any ext/trunc pairs. 3842 SmallPtrSet<Value *, 4> Erased; 3843 for (const auto &KV : Cost->getMinimalBitwidths()) { 3844 // If the value wasn't vectorized, we must maintain the original scalar 3845 // type. The absence of the value from State indicates that it 3846 // wasn't vectorized. 3847 VPValue *Def = State.Plan->getVPValue(KV.first); 3848 if (!State.hasAnyVectorValue(Def)) 3849 continue; 3850 for (unsigned Part = 0; Part < UF; ++Part) { 3851 Value *I = State.get(Def, Part); 3852 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3853 continue; 3854 Type *OriginalTy = I->getType(); 3855 Type *ScalarTruncatedTy = 3856 IntegerType::get(OriginalTy->getContext(), KV.second); 3857 auto *TruncatedTy = FixedVectorType::get( 3858 ScalarTruncatedTy, 3859 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3860 if (TruncatedTy == OriginalTy) 3861 continue; 3862 3863 IRBuilder<> B(cast<Instruction>(I)); 3864 auto ShrinkOperand = [&](Value *V) -> Value * { 3865 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3866 if (ZI->getSrcTy() == TruncatedTy) 3867 return ZI->getOperand(0); 3868 return B.CreateZExtOrTrunc(V, TruncatedTy); 3869 }; 3870 3871 // The actual instruction modification depends on the instruction type, 3872 // unfortunately. 
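      // Illustrative shorthand for a binary op whose result is known to need
      // only 8 bits (types assumed):
      //   %a.tr = trunc <4 x i32> %a to <4 x i8>
      //   %b.tr = trunc <4 x i32> %b to <4 x i8>
      //   %narrow = add <4 x i8> %a.tr, %b.tr
      //   %res = zext <4 x i8> %narrow to <4 x i32> ; replaces the original add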
3873 Value *NewI = nullptr; 3874 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3875 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3876 ShrinkOperand(BO->getOperand(1))); 3877 3878 // Any wrapping introduced by shrinking this operation shouldn't be 3879 // considered undefined behavior. So, we can't unconditionally copy 3880 // arithmetic wrapping flags to NewI. 3881 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3882 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3883 NewI = 3884 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3885 ShrinkOperand(CI->getOperand(1))); 3886 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3887 NewI = B.CreateSelect(SI->getCondition(), 3888 ShrinkOperand(SI->getTrueValue()), 3889 ShrinkOperand(SI->getFalseValue())); 3890 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3891 switch (CI->getOpcode()) { 3892 default: 3893 llvm_unreachable("Unhandled cast!"); 3894 case Instruction::Trunc: 3895 NewI = ShrinkOperand(CI->getOperand(0)); 3896 break; 3897 case Instruction::SExt: 3898 NewI = B.CreateSExtOrTrunc( 3899 CI->getOperand(0), 3900 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3901 break; 3902 case Instruction::ZExt: 3903 NewI = B.CreateZExtOrTrunc( 3904 CI->getOperand(0), 3905 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3906 break; 3907 } 3908 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3909 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3910 ->getNumElements(); 3911 auto *O0 = B.CreateZExtOrTrunc( 3912 SI->getOperand(0), 3913 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3914 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3915 ->getNumElements(); 3916 auto *O1 = B.CreateZExtOrTrunc( 3917 SI->getOperand(1), 3918 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3919 3920 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3921 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3922 // Don't do anything with the operands, just extend the result. 3923 continue; 3924 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3925 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3926 ->getNumElements(); 3927 auto *O0 = B.CreateZExtOrTrunc( 3928 IE->getOperand(0), 3929 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3930 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3931 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3932 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3933 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3934 ->getNumElements(); 3935 auto *O0 = B.CreateZExtOrTrunc( 3936 EE->getOperand(0), 3937 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3938 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3939 } else { 3940 // If we don't know what to do, be conservative and don't do anything. 3941 continue; 3942 } 3943 3944 // Lastly, extend the result. 3945 NewI->takeName(cast<Instruction>(I)); 3946 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3947 I->replaceAllUsesWith(Res); 3948 cast<Instruction>(I)->eraseFromParent(); 3949 Erased.insert(I); 3950 State.reset(Def, Res, Part); 3951 } 3952 } 3953 3954 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3955 for (const auto &KV : Cost->getMinimalBitwidths()) { 3956 // If the value wasn't vectorized, we must maintain the original scalar 3957 // type. The absence of the value from State indicates that it 3958 // wasn't vectorized. 
3959 VPValue *Def = State.Plan->getVPValue(KV.first); 3960 if (!State.hasAnyVectorValue(Def)) 3961 continue; 3962 for (unsigned Part = 0; Part < UF; ++Part) { 3963 Value *I = State.get(Def, Part); 3964 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3965 if (Inst && Inst->use_empty()) { 3966 Value *NewI = Inst->getOperand(0); 3967 Inst->eraseFromParent(); 3968 State.reset(Def, NewI, Part); 3969 } 3970 } 3971 } 3972 } 3973 3974 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3975 // Insert truncates and extends for any truncated instructions as hints to 3976 // InstCombine. 3977 if (VF.isVector()) 3978 truncateToMinimalBitwidths(State); 3979 3980 // Fix widened non-induction PHIs by setting up the PHI operands. 3981 if (OrigPHIsToFix.size()) { 3982 assert(EnableVPlanNativePath && 3983 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3984 fixNonInductionPHIs(State); 3985 } 3986 3987 // At this point every instruction in the original loop is widened to a 3988 // vector form. Now we need to fix the recurrences in the loop. These PHI 3989 // nodes are currently empty because we did not want to introduce cycles. 3990 // This is the second stage of vectorizing recurrences. 3991 fixCrossIterationPHIs(State); 3992 3993 // Forget the original basic block. 3994 PSE.getSE()->forgetLoop(OrigLoop); 3995 3996 // Fix-up external users of the induction variables. 3997 for (auto &Entry : Legal->getInductionVars()) 3998 fixupIVUsers(Entry.first, Entry.second, 3999 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4000 IVEndValues[Entry.first], LoopMiddleBlock); 4001 4002 fixLCSSAPHIs(State); 4003 for (Instruction *PI : PredicatedInstructions) 4004 sinkScalarOperands(&*PI); 4005 4006 // Remove redundant induction instructions. 4007 cse(LoopVectorBody); 4008 4009 // Set/update profile weights for the vector and remainder loops as original 4010 // loop iterations are now distributed among them. Note that original loop 4011 // represented by LoopScalarBody becomes remainder loop after vectorization. 4012 // 4013 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 4014 // end up getting slightly roughened result but that should be OK since 4015 // profile is not inherently precise anyway. Note also possible bypass of 4016 // vector code caused by legality checks is ignored, assigning all the weight 4017 // to the vector loop, optimistically. 4018 // 4019 // For scalable vectorization we can't know at compile time how many iterations 4020 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4021 // vscale of '1'. 4022 setProfileInfoAfterUnrolling( 4023 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4024 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4025 } 4026 4027 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4028 // In order to support recurrences we need to be able to vectorize Phi nodes. 4029 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4030 // stage #2: We now need to fix the recurrences by adding incoming edges to 4031 // the currently empty PHI nodes. At this point every instruction in the 4032 // original loop is widened to a vector form so we can use them to construct 4033 // the incoming edges. 4034 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 4035 // Handle first-order recurrences and reductions that need to be fixed. 
4036 if (Legal->isFirstOrderRecurrence(&Phi)) 4037 fixFirstOrderRecurrence(&Phi, State); 4038 else if (Legal->isReductionVariable(&Phi)) 4039 fixReduction(&Phi, State); 4040 } 4041 } 4042 4043 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4044 VPTransformState &State) { 4045 // This is the second phase of vectorizing first-order recurrences. An 4046 // overview of the transformation is described below. Suppose we have the 4047 // following loop. 4048 // 4049 // for (int i = 0; i < n; ++i) 4050 // b[i] = a[i] - a[i - 1]; 4051 // 4052 // There is a first-order recurrence on "a". For this loop, the shorthand 4053 // scalar IR looks like: 4054 // 4055 // scalar.ph: 4056 // s_init = a[-1] 4057 // br scalar.body 4058 // 4059 // scalar.body: 4060 // i = phi [0, scalar.ph], [i+1, scalar.body] 4061 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4062 // s2 = a[i] 4063 // b[i] = s2 - s1 4064 // br cond, scalar.body, ... 4065 // 4066 // In this example, s1 is a recurrence because it's value depends on the 4067 // previous iteration. In the first phase of vectorization, we created a 4068 // temporary value for s1. We now complete the vectorization and produce the 4069 // shorthand vector IR shown below (for VF = 4, UF = 1). 4070 // 4071 // vector.ph: 4072 // v_init = vector(..., ..., ..., a[-1]) 4073 // br vector.body 4074 // 4075 // vector.body 4076 // i = phi [0, vector.ph], [i+4, vector.body] 4077 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4078 // v2 = a[i, i+1, i+2, i+3]; 4079 // v3 = vector(v1(3), v2(0, 1, 2)) 4080 // b[i, i+1, i+2, i+3] = v2 - v3 4081 // br cond, vector.body, middle.block 4082 // 4083 // middle.block: 4084 // x = v2(3) 4085 // br scalar.ph 4086 // 4087 // scalar.ph: 4088 // s_init = phi [x, middle.block], [a[-1], otherwise] 4089 // br scalar.body 4090 // 4091 // After execution completes the vector loop, we extract the next value of 4092 // the recurrence (x) to use as the initial value in the scalar loop. 4093 4094 // Get the original loop preheader and single loop latch. 4095 auto *Preheader = OrigLoop->getLoopPreheader(); 4096 auto *Latch = OrigLoop->getLoopLatch(); 4097 4098 // Get the initial and previous values of the scalar recurrence. 4099 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4100 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4101 4102 // Create a vector from the initial value. 4103 auto *VectorInit = ScalarInit; 4104 if (VF.isVector()) { 4105 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4106 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4107 VectorInit = Builder.CreateInsertElement( 4108 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4109 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4110 } 4111 4112 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4113 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4114 // We constructed a temporary phi node in the first phase of vectorization. 4115 // This phi node will eventually be deleted. 4116 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4117 4118 // Create a phi node for the new recurrence. The current value will either be 4119 // the initial value inserted into a vector or loop-varying vector value. 4120 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4121 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4122 4123 // Get the vectorized previous value of the last part UF - 1. 
It appears last 4124 // among all unrolled iterations, due to the order of their construction. 4125 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4126 4127 // Find and set the insertion point after the previous value if it is an 4128 // instruction. 4129 BasicBlock::iterator InsertPt; 4130 // Note that the previous value may have been constant-folded so it is not 4131 // guaranteed to be an instruction in the vector loop. 4132 // FIXME: Loop invariant values do not form recurrences. We should deal with 4133 // them earlier. 4134 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4135 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4136 else { 4137 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4138 if (isa<PHINode>(PreviousLastPart)) 4139 // If the previous value is a phi node, we should insert after all the phi 4140 // nodes in the block containing the PHI to avoid breaking basic block 4141 // verification. Note that the basic block may be different to 4142 // LoopVectorBody, in case we predicate the loop. 4143 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4144 else 4145 InsertPt = ++PreviousInst->getIterator(); 4146 } 4147 Builder.SetInsertPoint(&*InsertPt); 4148 4149 // We will construct a vector for the recurrence by combining the values for 4150 // the current and previous iterations. This is the required shuffle mask. 4151 assert(!VF.isScalable()); 4152 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4153 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4154 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4155 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4156 4157 // The vector from which to take the initial value for the current iteration 4158 // (actual or unrolled). Initially, this is the vector phi node. 4159 Value *Incoming = VecPhi; 4160 4161 // Shuffle the current and previous vector and update the vector parts. 4162 for (unsigned Part = 0; Part < UF; ++Part) { 4163 Value *PreviousPart = State.get(PreviousDef, Part); 4164 Value *PhiPart = State.get(PhiDef, Part); 4165 auto *Shuffle = 4166 VF.isVector() 4167 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4168 : Incoming; 4169 PhiPart->replaceAllUsesWith(Shuffle); 4170 cast<Instruction>(PhiPart)->eraseFromParent(); 4171 State.reset(PhiDef, Shuffle, Part); 4172 Incoming = PreviousPart; 4173 } 4174 4175 // Fix the latch value of the new recurrence in the vector loop. 4176 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4177 4178 // Extract the last vector element in the middle block. This will be the 4179 // initial value for the recurrence when jumping to the scalar loop. 4180 auto *ExtractForScalar = Incoming; 4181 if (VF.isVector()) { 4182 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4183 ExtractForScalar = Builder.CreateExtractElement( 4184 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4185 "vector.recur.extract"); 4186 } 4187 // Extract the second last element in the middle block if the 4188 // Phi is used outside the loop. We need to extract the phi itself 4189 // and not the last element (the phi update in the current iteration). This 4190 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4191 // when the scalar loop is not run at all. 
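// For illustration (reusing the VF = 4 sketch from the function comment): if
// the last vector of previous values is v2 = <a[i], a[i+1], a[i+2], a[i+3]>,
// the scalar preheader receives v2(3), the value the recurrence takes when the
// scalar remainder resumes, whereas an LCSSA user of the phi itself needs
// v2(2), the value the phi held during the final iteration covered by the
// vector loop.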
4192 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4193 if (VF.isVector()) 4194 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4195 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4196 "vector.recur.extract.for.phi"); 4197 // When the loop is unrolled without vectorizing, initialize 4198 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to 4199 // `Incoming`. This is analogous to the vectorized case above: extracting the 4200 // second last element when VF > 1. 4201 else if (UF > 1) 4202 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4203 4204 // Fix the initial value of the original recurrence in the scalar loop. 4205 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4206 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4207 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4208 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4209 Start->addIncoming(Incoming, BB); 4210 } 4211 4212 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4213 Phi->setName("scalar.recur"); 4214 4215 // Finally, fix users of the recurrence outside the loop. The users will need 4216 // either the last value of the scalar recurrence or the last value of the 4217 // vector recurrence we extracted in the middle block. Since the loop is in 4218 // LCSSA form, we just need to find all the phi nodes for the original scalar 4219 // recurrence in the exit block, and then add an edge for the middle block. 4220 // Note that LCSSA does not imply single entry when the original scalar loop 4221 // had multiple exiting edges (as we always run the last iteration in the 4222 // scalar epilogue); in that case, the exiting path through middle will be 4223 // dynamically dead and the value picked for the phi doesn't matter. 4224 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4225 if (any_of(LCSSAPhi.incoming_values(), 4226 [Phi](Value *V) { return V == Phi; })) 4227 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4228 } 4229 4230 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) { 4231 // Get its reduction variable descriptor. 4232 assert(Legal->isReductionVariable(Phi) && 4233 "Unable to find the reduction variable"); 4234 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4235 4236 RecurKind RK = RdxDesc.getRecurrenceKind(); 4237 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4238 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4239 setDebugLocFromInst(Builder, ReductionStartValue); 4240 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4241 4242 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); 4243 // This is the vector-clone of the value that leaves the loop. 4244 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4245 4246 // Wrap flags are in general invalid after vectorization, clear them. 4247 clearReductionWrapFlags(RdxDesc, State); 4248 4249 // Fix the vector-loop phi. 4250 4251 // Reductions do not have to start at zero. They can start with 4252 // any loop invariant values.
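// For illustration (a hedged sketch for an integer add reduction with VF = 4
// and UF = 1): once the loop below adds the backedge value, the widened header
// phi looks roughly like
//   %vec.phi = phi <4 x i32> [ <%start, 0, 0, 0>, %vector.ph ],
//                            [ %vec.add, %vector.latch ]
// and the four lanes are only combined into a single scalar by the horizontal
// reduction emitted in the middle block later in this function.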
4253 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4254 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4255 4256 for (unsigned Part = 0; Part < UF; ++Part) { 4257 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part); 4258 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part); 4259 cast<PHINode>(VecRdxPhi) 4260 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4261 } 4262 4263 // Before each round, move the insertion point right between 4264 // the PHIs and the values we are going to write. 4265 // This allows us to write both PHINodes and the extractelement 4266 // instructions. 4267 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4268 4269 setDebugLocFromInst(Builder, LoopExitInst); 4270 4271 // If tail is folded by masking, the vector value to leave the loop should be 4272 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4273 // instead of the former. For an inloop reduction the reduction will already 4274 // be predicated, and does not need to be handled here. 4275 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4276 for (unsigned Part = 0; Part < UF; ++Part) { 4277 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4278 Value *Sel = nullptr; 4279 for (User *U : VecLoopExitInst->users()) { 4280 if (isa<SelectInst>(U)) { 4281 assert(!Sel && "Reduction exit feeding two selects"); 4282 Sel = U; 4283 } else 4284 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4285 } 4286 assert(Sel && "Reduction exit feeds no select"); 4287 State.reset(LoopExitInstDef, Sel, Part); 4288 4289 // If the target can create a predicated operator for the reduction at no 4290 // extra cost in the loop (for example a predicated vadd), it can be 4291 // cheaper for the select to remain in the loop than be sunk out of it, 4292 // and so use the select value for the phi instead of the old 4293 // LoopExitValue. 4294 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4295 if (PreferPredicatedReductionSelect || 4296 TTI->preferPredicatedReductionSelect( 4297 RdxDesc.getOpcode(), Phi->getType(), 4298 TargetTransformInfo::ReductionFlags())) { 4299 auto *VecRdxPhi = 4300 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part)); 4301 VecRdxPhi->setIncomingValueForBlock( 4302 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4303 } 4304 } 4305 } 4306 4307 // If the vector reduction can be performed in a smaller type, we truncate 4308 // then extend the loop exit value to enable InstCombine to evaluate the 4309 // entire expression in the smaller type. 4310 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4311 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4312 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4313 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4314 Builder.SetInsertPoint( 4315 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4316 VectorParts RdxParts(UF); 4317 for (unsigned Part = 0; Part < UF; ++Part) { 4318 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4319 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4320 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 4321 : Builder.CreateZExt(Trunc, VecTy); 4322 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4323 UI != RdxParts[Part]->user_end();) 4324 if (*UI != Trunc) { 4325 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4326 RdxParts[Part] = Extnd; 4327 } else { 4328 ++UI; 4329 } 4330 } 4331 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4332 for (unsigned Part = 0; Part < UF; ++Part) { 4333 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4334 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4335 } 4336 } 4337 4338 // Reduce all of the unrolled parts into a single vector. 4339 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4340 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4341 4342 // The middle block terminator has already been assigned a DebugLoc here (the 4343 // OrigLoop's single latch terminator). We want the whole middle block to 4344 // appear to execute on this line because: (a) it is all compiler generated, 4345 // (b) these instructions are always executed after evaluating the latch 4346 // conditional branch, and (c) other passes may add new predecessors which 4347 // terminate on this line. This is the easiest way to ensure we don't 4348 // accidentally cause an extra step back into the loop while debugging. 4349 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4350 { 4351 // Floating-point operations should have some FMF to enable the reduction. 4352 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4353 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4354 for (unsigned Part = 1; Part < UF; ++Part) { 4355 Value *RdxPart = State.get(LoopExitInstDef, Part); 4356 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4357 ReducedPartRdx = Builder.CreateBinOp( 4358 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4359 } else { 4360 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4361 } 4362 } 4363 } 4364 4365 // Create the reduction after the loop. Note that inloop reductions create the 4366 // target reduction in the loop using a Reduction recipe. 4367 if (VF.isVector() && !IsInLoopReductionPhi) { 4368 ReducedPartRdx = 4369 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4370 // If the reduction can be performed in a smaller type, we need to extend 4371 // the reduction to the wider type before we branch to the original loop. 4372 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4373 ReducedPartRdx = 4374 RdxDesc.isSigned() 4375 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4376 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4377 } 4378 4379 // Create a phi node that merges control-flow from the backedge-taken check 4380 // block and the middle block. 4381 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4382 LoopScalarPreHeader->getTerminator()); 4383 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4384 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4385 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4386 4387 // Now, we need to fix the users of the reduction variable 4388 // inside and outside of the scalar remainder loop. 4389 4390 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4391 // in the exit blocks. See comment on analogous loop in 4392 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
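// For illustration (assumed names and shapes): an LCSSA phi in the exit block
// such as
//   %sum.lcssa = phi i32 [ %sum.next, %loop ]
// is extended by the loop below to
//   %sum.lcssa = phi i32 [ %sum.next, %loop ], [ %rdx, %middle.block ]
// where %rdx stands for the reduced value (ReducedPartRdx) computed above.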
4393 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4394 if (any_of(LCSSAPhi.incoming_values(), 4395 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4396 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4397 4398 // Fix the scalar loop reduction variable with the incoming reduction sum 4399 // from the vector body and from the backedge value. 4400 int IncomingEdgeBlockIdx = 4401 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4402 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4403 // Pick the other block. 4404 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4405 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4406 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4407 } 4408 4409 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4410 VPTransformState &State) { 4411 RecurKind RK = RdxDesc.getRecurrenceKind(); 4412 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4413 return; 4414 4415 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4416 assert(LoopExitInstr && "null loop exit instruction"); 4417 SmallVector<Instruction *, 8> Worklist; 4418 SmallPtrSet<Instruction *, 8> Visited; 4419 Worklist.push_back(LoopExitInstr); 4420 Visited.insert(LoopExitInstr); 4421 4422 while (!Worklist.empty()) { 4423 Instruction *Cur = Worklist.pop_back_val(); 4424 if (isa<OverflowingBinaryOperator>(Cur)) 4425 for (unsigned Part = 0; Part < UF; ++Part) { 4426 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4427 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4428 } 4429 4430 for (User *U : Cur->users()) { 4431 Instruction *UI = cast<Instruction>(U); 4432 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4433 Visited.insert(UI).second) 4434 Worklist.push_back(UI); 4435 } 4436 } 4437 } 4438 4439 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4440 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4441 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4442 // Some phis were already hand updated by the reduction and recurrence 4443 // code above, leave them alone. 4444 continue; 4445 4446 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4447 // Non-instruction incoming values will have only one value. 4448 4449 VPLane Lane = VPLane::getFirstLane(); 4450 if (isa<Instruction>(IncomingValue) && 4451 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4452 VF)) 4453 Lane = VPLane::getLastLaneForVF(VF); 4454 4455 // Can be a loop invariant incoming value or the last scalar value to be 4456 // extracted from the vectorized loop. 4457 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4458 Value *lastIncomingValue = 4459 OrigLoop->isLoopInvariant(IncomingValue) 4460 ? IncomingValue 4461 : State.get(State.Plan->getVPValue(IncomingValue), 4462 VPIteration(UF - 1, Lane)); 4463 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4464 } 4465 } 4466 4467 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4468 // The basic block and loop containing the predicated instruction. 4469 auto *PredBB = PredInst->getParent(); 4470 auto *VectorLoop = LI->getLoopFor(PredBB); 4471 4472 // Initialize a worklist with the operands of the predicated instruction. 4473 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4474 4475 // Holds instructions that we need to analyze again. An instruction may be 4476 // reanalyzed if we don't yet know if we can sink it or not. 
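// For illustration (a sketch with made-up values): given
//   %a = add i64 %base, 4
//   %b = mul i64 %a, %scale
// where only %b feeds the predicated instruction, %b can be sunk as soon as
// all of its uses are in the predicated block, after which %a becomes sinkable
// too; if %a happens to be visited before %b has moved, it simply takes a trip
// through InstsToReanalyze and is retried on the next pass.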
4477 SmallVector<Instruction *, 8> InstsToReanalyze; 4478 4479 // Returns true if a given use occurs in the predicated block. Phi nodes use 4480 // their operands in their corresponding predecessor blocks. 4481 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4482 auto *I = cast<Instruction>(U.getUser()); 4483 BasicBlock *BB = I->getParent(); 4484 if (auto *Phi = dyn_cast<PHINode>(I)) 4485 BB = Phi->getIncomingBlock( 4486 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4487 return BB == PredBB; 4488 }; 4489 4490 // Iteratively sink the scalarized operands of the predicated instruction 4491 // into the block we created for it. When an instruction is sunk, it's 4492 // operands are then added to the worklist. The algorithm ends after one pass 4493 // through the worklist doesn't sink a single instruction. 4494 bool Changed; 4495 do { 4496 // Add the instructions that need to be reanalyzed to the worklist, and 4497 // reset the changed indicator. 4498 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4499 InstsToReanalyze.clear(); 4500 Changed = false; 4501 4502 while (!Worklist.empty()) { 4503 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4504 4505 // We can't sink an instruction if it is a phi node, is already in the 4506 // predicated block, is not in the loop, or may have side effects. 4507 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4508 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4509 continue; 4510 4511 // It's legal to sink the instruction if all its uses occur in the 4512 // predicated block. Otherwise, there's nothing to do yet, and we may 4513 // need to reanalyze the instruction. 4514 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4515 InstsToReanalyze.push_back(I); 4516 continue; 4517 } 4518 4519 // Move the instruction to the beginning of the predicated block, and add 4520 // it's operands to the worklist. 4521 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4522 Worklist.insert(I->op_begin(), I->op_end()); 4523 4524 // The sinking may have enabled other instructions to be sunk, so we will 4525 // need to iterate. 4526 Changed = true; 4527 } 4528 } while (Changed); 4529 } 4530 4531 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4532 for (PHINode *OrigPhi : OrigPHIsToFix) { 4533 VPWidenPHIRecipe *VPPhi = 4534 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4535 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4536 // Make sure the builder has a valid insert point. 4537 Builder.SetInsertPoint(NewPhi); 4538 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4539 VPValue *Inc = VPPhi->getIncomingValue(i); 4540 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4541 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4542 } 4543 } 4544 } 4545 4546 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4547 VPUser &Operands, unsigned UF, 4548 ElementCount VF, bool IsPtrLoopInvariant, 4549 SmallBitVector &IsIndexLoopInvariant, 4550 VPTransformState &State) { 4551 // Construct a vector GEP by widening the operands of the scalar GEP as 4552 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4553 // results in a vector of pointers when at least one operand of the GEP 4554 // is vector-typed. Thus, to keep the representation compact, we only use 4555 // vector-typed operands for loop-varying values. 
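// For illustration (a sketch assuming a loop-invariant base pointer and a
// widened induction as the index): a scalar GEP such as
//   %gep = getelementptr inbounds i32, i32* %base, i64 %i
// is widened for VF = 4 into roughly
//   %gep.vec = getelementptr inbounds i32, i32* %base, <4 x i64> %vec.ind
// producing a vector of pointers while keeping the loop-invariant base as a
// scalar operand.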
4556 4557 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4558 // If we are vectorizing, but the GEP has only loop-invariant operands, 4559 // the GEP we build (by only using vector-typed operands for 4560 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4561 // produce a vector of pointers, we need to either arbitrarily pick an 4562 // operand to broadcast, or broadcast a clone of the original GEP. 4563 // Here, we broadcast a clone of the original. 4564 // 4565 // TODO: If at some point we decide to scalarize instructions having 4566 // loop-invariant operands, this special case will no longer be 4567 // required. We would add the scalarization decision to 4568 // collectLoopScalars() and teach getVectorValue() to broadcast 4569 // the lane-zero scalar value. 4570 auto *Clone = Builder.Insert(GEP->clone()); 4571 for (unsigned Part = 0; Part < UF; ++Part) { 4572 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4573 State.set(VPDef, EntryPart, Part); 4574 addMetadata(EntryPart, GEP); 4575 } 4576 } else { 4577 // If the GEP has at least one loop-varying operand, we are sure to 4578 // produce a vector of pointers. But if we are only unrolling, we want 4579 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4580 // produce with the code below will be scalar (if VF == 1) or vector 4581 // (otherwise). Note that for the unroll-only case, we still maintain 4582 // values in the vector mapping with initVector, as we do for other 4583 // instructions. 4584 for (unsigned Part = 0; Part < UF; ++Part) { 4585 // The pointer operand of the new GEP. If it's loop-invariant, we 4586 // won't broadcast it. 4587 auto *Ptr = IsPtrLoopInvariant 4588 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4589 : State.get(Operands.getOperand(0), Part); 4590 4591 // Collect all the indices for the new GEP. If any index is 4592 // loop-invariant, we won't broadcast it. 4593 SmallVector<Value *, 4> Indices; 4594 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4595 VPValue *Operand = Operands.getOperand(I); 4596 if (IsIndexLoopInvariant[I - 1]) 4597 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4598 else 4599 Indices.push_back(State.get(Operand, Part)); 4600 } 4601 4602 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4603 // but it should be a vector, otherwise. 4604 auto *NewGEP = 4605 GEP->isInBounds() 4606 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4607 Indices) 4608 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4609 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4610 "NewGEP is not a pointer vector"); 4611 State.set(VPDef, NewGEP, Part); 4612 addMetadata(NewGEP, GEP); 4613 } 4614 } 4615 } 4616 4617 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4618 RecurrenceDescriptor *RdxDesc, 4619 VPValue *StartVPV, VPValue *Def, 4620 VPTransformState &State) { 4621 PHINode *P = cast<PHINode>(PN); 4622 if (EnableVPlanNativePath) { 4623 // Currently we enter here in the VPlan-native path for non-induction 4624 // PHIs where all control flow is uniform. We simply widen these PHIs. 4625 // Create a vector phi with no operands - the vector phi operands will be 4626 // set at the end of vector code generation. 4627 Type *VecTy = (State.VF.isScalar()) 4628 ? 
PN->getType() 4629 : VectorType::get(PN->getType(), State.VF); 4630 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4631 State.set(Def, VecPhi, 0); 4632 OrigPHIsToFix.push_back(P); 4633 4634 return; 4635 } 4636 4637 assert(PN->getParent() == OrigLoop->getHeader() && 4638 "Non-header phis should have been handled elsewhere"); 4639 4640 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr; 4641 // In order to support recurrences we need to be able to vectorize Phi nodes. 4642 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4643 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4644 // this value when we vectorize all of the instructions that use the PHI. 4645 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { 4646 Value *Iden = nullptr; 4647 bool ScalarPHI = 4648 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4649 Type *VecTy = 4650 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); 4651 4652 if (RdxDesc) { 4653 assert(Legal->isReductionVariable(P) && StartV && 4654 "RdxDesc should only be set for reduction variables; in that case " 4655 "a StartV is also required"); 4656 RecurKind RK = RdxDesc->getRecurrenceKind(); 4657 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 4658 // MinMax reduction have the start value as their identify. 4659 if (ScalarPHI) { 4660 Iden = StartV; 4661 } else { 4662 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4663 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4664 StartV = Iden = 4665 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); 4666 } 4667 } else { 4668 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4669 RK, VecTy->getScalarType()); 4670 Iden = IdenC; 4671 4672 if (!ScalarPHI) { 4673 Iden = ConstantVector::getSplat(State.VF, IdenC); 4674 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4675 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4676 Constant *Zero = Builder.getInt32(0); 4677 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4678 } 4679 } 4680 } 4681 4682 for (unsigned Part = 0; Part < State.UF; ++Part) { 4683 // This is phase one of vectorizing PHIs. 4684 Value *EntryPart = PHINode::Create( 4685 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4686 State.set(Def, EntryPart, Part); 4687 if (StartV) { 4688 // Make sure to add the reduction start value only to the 4689 // first unroll part. 4690 Value *StartVal = (Part == 0) ? StartV : Iden; 4691 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4692 } 4693 } 4694 return; 4695 } 4696 4697 assert(!Legal->isReductionVariable(P) && 4698 "reductions should be handled above"); 4699 4700 setDebugLocFromInst(Builder, P); 4701 4702 // This PHINode must be an induction variable. 4703 // Make sure that we know about it. 4704 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4705 4706 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4707 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4708 4709 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4710 // which can be found from the original scalar operations. 
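// For illustration (a hedged sketch of the scalarized pointer-induction case
// handled below; VF = 4, UF = 1, unit stride, pseudo-IR): lane L receives its
// own scalar address, roughly
//   %next.gep = getelementptr i32, i32* %start, i64 (%normalized.iv + L)
// derived from the normalized induction, rather than one wide vector of
// pointers.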
4711 switch (II.getKind()) { 4712 case InductionDescriptor::IK_NoInduction: 4713 llvm_unreachable("Unknown induction"); 4714 case InductionDescriptor::IK_IntInduction: 4715 case InductionDescriptor::IK_FpInduction: 4716 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4717 case InductionDescriptor::IK_PtrInduction: { 4718 // Handle the pointer induction variable case. 4719 assert(P->getType()->isPointerTy() && "Unexpected type."); 4720 4721 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4722 // This is the normalized GEP that starts counting at zero. 4723 Value *PtrInd = 4724 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4725 // Determine the number of scalars we need to generate for each unroll 4726 // iteration. If the instruction is uniform, we only need to generate the 4727 // first lane. Otherwise, we generate all VF values. 4728 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF) 4729 ? 1 4730 : State.VF.getKnownMinValue(); 4731 for (unsigned Part = 0; Part < UF; ++Part) { 4732 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4733 Constant *Idx = ConstantInt::get( 4734 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue()); 4735 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4736 Value *SclrGep = 4737 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4738 SclrGep->setName("next.gep"); 4739 State.set(Def, SclrGep, VPIteration(Part, Lane)); 4740 } 4741 } 4742 return; 4743 } 4744 assert(isa<SCEVConstant>(II.getStep()) && 4745 "Induction step not a SCEV constant!"); 4746 Type *PhiType = II.getStep()->getType(); 4747 4748 // Build a pointer phi 4749 Value *ScalarStartValue = II.getStartValue(); 4750 Type *ScStValueType = ScalarStartValue->getType(); 4751 PHINode *NewPointerPhi = 4752 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4753 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4754 4755 // A pointer induction, performed by using a gep 4756 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4757 Instruction *InductionLoc = LoopLatch->getTerminator(); 4758 const SCEV *ScalarStep = II.getStep(); 4759 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4760 Value *ScalarStepValue = 4761 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4762 Value *InductionGEP = GetElementPtrInst::Create( 4763 ScStValueType->getPointerElementType(), NewPointerPhi, 4764 Builder.CreateMul( 4765 ScalarStepValue, 4766 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)), 4767 "ptr.ind", InductionLoc); 4768 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4769 4770 // Create UF many actual address geps that use the pointer 4771 // phi as base and a vectorized version of the step value 4772 // (<step*0, ..., step*N>) as offset. 4773 for (unsigned Part = 0; Part < State.UF; ++Part) { 4774 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4775 Value *StartOffset = 4776 ConstantInt::get(VecPhiType, Part * State.VF.getKnownMinValue()); 4777 // Create a vector of consecutive numbers from zero to VF. 
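// For illustration (fixed VF = 4, second unrolled part, i.e. Part == 1):
// StartOffset becomes <4, 4, 4, 4> + <0, 1, 2, 3> = <4, 5, 6, 7>, and the GEP
// below offsets the pointer phi by <4, 5, 6, 7> multiplied by the splatted
// scalar step.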
4778 StartOffset = 4779 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4780 4781 Value *GEP = Builder.CreateGEP( 4782 ScStValueType->getPointerElementType(), NewPointerPhi, 4783 Builder.CreateMul(StartOffset, 4784 Builder.CreateVectorSplat( 4785 State.VF.getKnownMinValue(), ScalarStepValue), 4786 "vector.gep")); 4787 State.set(Def, GEP, Part); 4788 } 4789 } 4790 } 4791 } 4792 4793 /// A helper function for checking whether an integer division-related 4794 /// instruction may divide by zero (in which case it must be predicated if 4795 /// executed conditionally in the scalar code). 4796 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4797 /// Non-zero divisors that are non compile-time constants will not be 4798 /// converted into multiplication, so we will still end up scalarizing 4799 /// the division, but can do so w/o predication. 4800 static bool mayDivideByZero(Instruction &I) { 4801 assert((I.getOpcode() == Instruction::UDiv || 4802 I.getOpcode() == Instruction::SDiv || 4803 I.getOpcode() == Instruction::URem || 4804 I.getOpcode() == Instruction::SRem) && 4805 "Unexpected instruction"); 4806 Value *Divisor = I.getOperand(1); 4807 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4808 return !CInt || CInt->isZero(); 4809 } 4810 4811 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4812 VPUser &User, 4813 VPTransformState &State) { 4814 switch (I.getOpcode()) { 4815 case Instruction::Call: 4816 case Instruction::Br: 4817 case Instruction::PHI: 4818 case Instruction::GetElementPtr: 4819 case Instruction::Select: 4820 llvm_unreachable("This instruction is handled by a different recipe."); 4821 case Instruction::UDiv: 4822 case Instruction::SDiv: 4823 case Instruction::SRem: 4824 case Instruction::URem: 4825 case Instruction::Add: 4826 case Instruction::FAdd: 4827 case Instruction::Sub: 4828 case Instruction::FSub: 4829 case Instruction::FNeg: 4830 case Instruction::Mul: 4831 case Instruction::FMul: 4832 case Instruction::FDiv: 4833 case Instruction::FRem: 4834 case Instruction::Shl: 4835 case Instruction::LShr: 4836 case Instruction::AShr: 4837 case Instruction::And: 4838 case Instruction::Or: 4839 case Instruction::Xor: { 4840 // Just widen unops and binops. 4841 setDebugLocFromInst(Builder, &I); 4842 4843 for (unsigned Part = 0; Part < UF; ++Part) { 4844 SmallVector<Value *, 2> Ops; 4845 for (VPValue *VPOp : User.operands()) 4846 Ops.push_back(State.get(VPOp, Part)); 4847 4848 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4849 4850 if (auto *VecOp = dyn_cast<Instruction>(V)) 4851 VecOp->copyIRFlags(&I); 4852 4853 // Use this vector value for all users of the original instruction. 4854 State.set(Def, V, Part); 4855 addMetadata(V, &I); 4856 } 4857 4858 break; 4859 } 4860 case Instruction::ICmp: 4861 case Instruction::FCmp: { 4862 // Widen compares. Generate vector compares. 4863 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4864 auto *Cmp = cast<CmpInst>(&I); 4865 setDebugLocFromInst(Builder, Cmp); 4866 for (unsigned Part = 0; Part < UF; ++Part) { 4867 Value *A = State.get(User.getOperand(0), Part); 4868 Value *B = State.get(User.getOperand(1), Part); 4869 Value *C = nullptr; 4870 if (FCmp) { 4871 // Propagate fast math flags. 
4872 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4873 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4874 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4875 } else { 4876 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4877 } 4878 State.set(Def, C, Part); 4879 addMetadata(C, &I); 4880 } 4881 4882 break; 4883 } 4884 4885 case Instruction::ZExt: 4886 case Instruction::SExt: 4887 case Instruction::FPToUI: 4888 case Instruction::FPToSI: 4889 case Instruction::FPExt: 4890 case Instruction::PtrToInt: 4891 case Instruction::IntToPtr: 4892 case Instruction::SIToFP: 4893 case Instruction::UIToFP: 4894 case Instruction::Trunc: 4895 case Instruction::FPTrunc: 4896 case Instruction::BitCast: { 4897 auto *CI = cast<CastInst>(&I); 4898 setDebugLocFromInst(Builder, CI); 4899 4900 /// Vectorize casts. 4901 Type *DestTy = 4902 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4903 4904 for (unsigned Part = 0; Part < UF; ++Part) { 4905 Value *A = State.get(User.getOperand(0), Part); 4906 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4907 State.set(Def, Cast, Part); 4908 addMetadata(Cast, &I); 4909 } 4910 break; 4911 } 4912 default: 4913 // This instruction is not vectorized by simple widening. 4914 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4915 llvm_unreachable("Unhandled instruction!"); 4916 } // end of switch. 4917 } 4918 4919 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4920 VPUser &ArgOperands, 4921 VPTransformState &State) { 4922 assert(!isa<DbgInfoIntrinsic>(I) && 4923 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4924 setDebugLocFromInst(Builder, &I); 4925 4926 Module *M = I.getParent()->getParent()->getParent(); 4927 auto *CI = cast<CallInst>(&I); 4928 4929 SmallVector<Type *, 4> Tys; 4930 for (Value *ArgOperand : CI->arg_operands()) 4931 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4932 4933 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4934 4935 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4936 // version of the instruction. 4937 // Is it beneficial to perform intrinsic call compared to lib call? 4938 bool NeedToScalarize = false; 4939 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4940 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4941 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4942 assert((UseVectorIntrinsic || !NeedToScalarize) && 4943 "Instruction should be scalarized elsewhere."); 4944 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4945 "Cannot have invalid costs while widening"); 4946 4947 for (unsigned Part = 0; Part < UF; ++Part) { 4948 SmallVector<Value *, 4> Args; 4949 for (auto &I : enumerate(ArgOperands.operands())) { 4950 // Some intrinsics have a scalar argument - don't replace it with a 4951 // vector. 4952 Value *Arg; 4953 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4954 Arg = State.get(I.value(), Part); 4955 else 4956 Arg = State.get(I.value(), VPIteration(0, 0)); 4957 Args.push_back(Arg); 4958 } 4959 4960 Function *VectorF; 4961 if (UseVectorIntrinsic) { 4962 // Use vector version of the intrinsic. 
4963 Type *TysForDecl[] = {CI->getType()}; 4964 if (VF.isVector()) 4965 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4966 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4967 assert(VectorF && "Can't retrieve vector intrinsic."); 4968 } else { 4969 // Use vector version of the function call. 4970 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4971 #ifndef NDEBUG 4972 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4973 "Can't create vector function."); 4974 #endif 4975 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4976 } 4977 SmallVector<OperandBundleDef, 1> OpBundles; 4978 CI->getOperandBundlesAsDefs(OpBundles); 4979 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4980 4981 if (isa<FPMathOperator>(V)) 4982 V->copyFastMathFlags(CI); 4983 4984 State.set(Def, V, Part); 4985 addMetadata(V, &I); 4986 } 4987 } 4988 4989 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4990 VPUser &Operands, 4991 bool InvariantCond, 4992 VPTransformState &State) { 4993 setDebugLocFromInst(Builder, &I); 4994 4995 // The condition can be loop invariant but still defined inside the 4996 // loop. This means that we can't just use the original 'cond' value. 4997 // We have to take the 'vectorized' value and pick the first lane. 4998 // Instcombine will make this a no-op. 4999 auto *InvarCond = InvariantCond 5000 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5001 : nullptr; 5002 5003 for (unsigned Part = 0; Part < UF; ++Part) { 5004 Value *Cond = 5005 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5006 Value *Op0 = State.get(Operands.getOperand(1), Part); 5007 Value *Op1 = State.get(Operands.getOperand(2), Part); 5008 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5009 State.set(VPDef, Sel, Part); 5010 addMetadata(Sel, &I); 5011 } 5012 } 5013 5014 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5015 // We should not collect Scalars more than once per VF. Right now, this 5016 // function is called from collectUniformsAndScalars(), which already does 5017 // this check. Collecting Scalars for VF=1 does not make any sense. 5018 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5019 "This function should not be visited twice for the same VF"); 5020 5021 SmallSetVector<Instruction *, 8> Worklist; 5022 5023 // These sets are used to seed the analysis with pointers used by memory 5024 // accesses that will remain scalar. 5025 SmallSetVector<Instruction *, 8> ScalarPtrs; 5026 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5027 auto *Latch = TheLoop->getLoopLatch(); 5028 5029 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5030 // The pointer operands of loads and stores will be scalar as long as the 5031 // memory access is not a gather or scatter operation. The value operand of a 5032 // store will remain scalar if the store is scalarized. 
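// For illustration (assumed widening decisions): for
//   store i32 %v, i32* %p
// the use of %p is a scalar use unless the store was chosen to become a
// scatter, while the use of %v counts as a scalar use only if the store itself
// will be scalarized.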
5033 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5034 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5035 assert(WideningDecision != CM_Unknown && 5036 "Widening decision should be ready at this moment"); 5037 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5038 if (Ptr == Store->getValueOperand()) 5039 return WideningDecision == CM_Scalarize; 5040 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5041 "Ptr is neither a value or pointer operand"); 5042 return WideningDecision != CM_GatherScatter; 5043 }; 5044 5045 // A helper that returns true if the given value is a bitcast or 5046 // getelementptr instruction contained in the loop. 5047 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5048 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5049 isa<GetElementPtrInst>(V)) && 5050 !TheLoop->isLoopInvariant(V); 5051 }; 5052 5053 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5054 if (!isa<PHINode>(Ptr) || 5055 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5056 return false; 5057 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5058 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5059 return false; 5060 return isScalarUse(MemAccess, Ptr); 5061 }; 5062 5063 // A helper that evaluates a memory access's use of a pointer. If the 5064 // pointer is actually the pointer induction of a loop, it is being 5065 // inserted into Worklist. If the use will be a scalar use, and the 5066 // pointer is only used by memory accesses, we place the pointer in 5067 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5068 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5069 if (isScalarPtrInduction(MemAccess, Ptr)) { 5070 Worklist.insert(cast<Instruction>(Ptr)); 5071 Instruction *Update = cast<Instruction>( 5072 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5073 Worklist.insert(Update); 5074 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5075 << "\n"); 5076 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5077 << "\n"); 5078 return; 5079 } 5080 // We only care about bitcast and getelementptr instructions contained in 5081 // the loop. 5082 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5083 return; 5084 5085 // If the pointer has already been identified as scalar (e.g., if it was 5086 // also identified as uniform), there's nothing to do. 5087 auto *I = cast<Instruction>(Ptr); 5088 if (Worklist.count(I)) 5089 return; 5090 5091 // If the use of the pointer will be a scalar use, and all users of the 5092 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5093 // place the pointer in PossibleNonScalarPtrs. 5094 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5095 return isa<LoadInst>(U) || isa<StoreInst>(U); 5096 })) 5097 ScalarPtrs.insert(I); 5098 else 5099 PossibleNonScalarPtrs.insert(I); 5100 }; 5101 5102 // We seed the scalars analysis with three classes of instructions: (1) 5103 // instructions marked uniform-after-vectorization and (2) bitcast, 5104 // getelementptr and (pointer) phi instructions used by memory accesses 5105 // requiring a scalar use. 5106 // 5107 // (1) Add to the worklist all instructions that have been identified as 5108 // uniform-after-vectorization. 
5109 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5110 5111 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5112 // memory accesses requiring a scalar use. The pointer operands of loads and 5113 // stores will be scalar as long as the memory accesses is not a gather or 5114 // scatter operation. The value operand of a store will remain scalar if the 5115 // store is scalarized. 5116 for (auto *BB : TheLoop->blocks()) 5117 for (auto &I : *BB) { 5118 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5119 evaluatePtrUse(Load, Load->getPointerOperand()); 5120 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5121 evaluatePtrUse(Store, Store->getPointerOperand()); 5122 evaluatePtrUse(Store, Store->getValueOperand()); 5123 } 5124 } 5125 for (auto *I : ScalarPtrs) 5126 if (!PossibleNonScalarPtrs.count(I)) { 5127 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5128 Worklist.insert(I); 5129 } 5130 5131 // Insert the forced scalars. 5132 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5133 // induction variable when the PHI user is scalarized. 5134 auto ForcedScalar = ForcedScalars.find(VF); 5135 if (ForcedScalar != ForcedScalars.end()) 5136 for (auto *I : ForcedScalar->second) 5137 Worklist.insert(I); 5138 5139 // Expand the worklist by looking through any bitcasts and getelementptr 5140 // instructions we've already identified as scalar. This is similar to the 5141 // expansion step in collectLoopUniforms(); however, here we're only 5142 // expanding to include additional bitcasts and getelementptr instructions. 5143 unsigned Idx = 0; 5144 while (Idx != Worklist.size()) { 5145 Instruction *Dst = Worklist[Idx++]; 5146 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5147 continue; 5148 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5149 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5150 auto *J = cast<Instruction>(U); 5151 return !TheLoop->contains(J) || Worklist.count(J) || 5152 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5153 isScalarUse(J, Src)); 5154 })) { 5155 Worklist.insert(Src); 5156 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5157 } 5158 } 5159 5160 // An induction variable will remain scalar if all users of the induction 5161 // variable and induction variable update remain scalar. 5162 for (auto &Induction : Legal->getInductionVars()) { 5163 auto *Ind = Induction.first; 5164 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5165 5166 // If tail-folding is applied, the primary induction variable will be used 5167 // to feed a vector compare. 5168 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5169 continue; 5170 5171 // Determine if all users of the induction variable are scalar after 5172 // vectorization. 5173 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5174 auto *I = cast<Instruction>(U); 5175 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5176 }); 5177 if (!ScalarInd) 5178 continue; 5179 5180 // Determine if all users of the induction variable update instruction are 5181 // scalar after vectorization. 5182 auto ScalarIndUpdate = 5183 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5184 auto *I = cast<Instruction>(U); 5185 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5186 }); 5187 if (!ScalarIndUpdate) 5188 continue; 5189 5190 // The induction variable and its update instruction will remain scalar. 
5191 Worklist.insert(Ind); 5192 Worklist.insert(IndUpdate); 5193 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5194 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5195 << "\n"); 5196 } 5197 5198 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5199 } 5200 5201 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5202 ElementCount VF) { 5203 if (!blockNeedsPredication(I->getParent())) 5204 return false; 5205 switch(I->getOpcode()) { 5206 default: 5207 break; 5208 case Instruction::Load: 5209 case Instruction::Store: { 5210 if (!Legal->isMaskRequired(I)) 5211 return false; 5212 auto *Ptr = getLoadStorePointerOperand(I); 5213 auto *Ty = getMemInstValueType(I); 5214 // We have already decided how to vectorize this instruction, get that 5215 // result. 5216 if (VF.isVector()) { 5217 InstWidening WideningDecision = getWideningDecision(I, VF); 5218 assert(WideningDecision != CM_Unknown && 5219 "Widening decision should be ready at this moment"); 5220 return WideningDecision == CM_Scalarize; 5221 } 5222 const Align Alignment = getLoadStoreAlignment(I); 5223 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5224 isLegalMaskedGather(Ty, Alignment)) 5225 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5226 isLegalMaskedScatter(Ty, Alignment)); 5227 } 5228 case Instruction::UDiv: 5229 case Instruction::SDiv: 5230 case Instruction::SRem: 5231 case Instruction::URem: 5232 return mayDivideByZero(*I); 5233 } 5234 return false; 5235 } 5236 5237 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5238 Instruction *I, ElementCount VF) { 5239 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5240 assert(getWideningDecision(I, VF) == CM_Unknown && 5241 "Decision should not be set yet."); 5242 auto *Group = getInterleavedAccessGroup(I); 5243 assert(Group && "Must have a group."); 5244 5245 // If the instruction's allocated size doesn't equal it's type size, it 5246 // requires padding and will be scalarized. 5247 auto &DL = I->getModule()->getDataLayout(); 5248 auto *ScalarTy = getMemInstValueType(I); 5249 if (hasIrregularType(ScalarTy, DL)) 5250 return false; 5251 5252 // Check if masking is required. 5253 // A Group may need masking for one of two reasons: it resides in a block that 5254 // needs predication, or it was decided to use masking to deal with gaps. 5255 bool PredicatedAccessRequiresMasking = 5256 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5257 bool AccessWithGapsRequiresMasking = 5258 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5259 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5260 return true; 5261 5262 // If masked interleaving is required, we expect that the user/target had 5263 // enabled it, because otherwise it either wouldn't have been created or 5264 // it should have been invalidated by the CostModel. 5265 assert(useMaskedInterleavedAccesses(TTI) && 5266 "Masked interleave-groups for predicated accesses are not enabled."); 5267 5268 auto *Ty = getMemInstValueType(I); 5269 const Align Alignment = getLoadStoreAlignment(I); 5270 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5271 : TTI.isLegalMaskedStore(Ty, Alignment); 5272 } 5273 5274 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5275 Instruction *I, ElementCount VF) { 5276 // Get and ensure we have a valid memory instruction. 
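// For illustration (a sketch): "widened" here means a single wide access such
// as
//   %wide.load = load <4 x i32>, <4 x i32>* %cast.ptr, align 4
// rather than a gather/scatter or VF scalarized accesses; the checks below
// require a consecutive pointer, no predication-forced scalarization, and an
// element type without padding.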
5277 LoadInst *LI = dyn_cast<LoadInst>(I); 5278 StoreInst *SI = dyn_cast<StoreInst>(I); 5279 assert((LI || SI) && "Invalid memory instruction"); 5280 5281 auto *Ptr = getLoadStorePointerOperand(I); 5282 5283 // In order to be widened, the pointer should be consecutive, first of all. 5284 if (!Legal->isConsecutivePtr(Ptr)) 5285 return false; 5286 5287 // If the instruction is a store located in a predicated block, it will be 5288 // scalarized. 5289 if (isScalarWithPredication(I)) 5290 return false; 5291 5292 // If the instruction's allocated size doesn't equal it's type size, it 5293 // requires padding and will be scalarized. 5294 auto &DL = I->getModule()->getDataLayout(); 5295 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5296 if (hasIrregularType(ScalarTy, DL)) 5297 return false; 5298 5299 return true; 5300 } 5301 5302 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5303 // We should not collect Uniforms more than once per VF. Right now, 5304 // this function is called from collectUniformsAndScalars(), which 5305 // already does this check. Collecting Uniforms for VF=1 does not make any 5306 // sense. 5307 5308 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5309 "This function should not be visited twice for the same VF"); 5310 5311 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5312 // not analyze again. Uniforms.count(VF) will return 1. 5313 Uniforms[VF].clear(); 5314 5315 // We now know that the loop is vectorizable! 5316 // Collect instructions inside the loop that will remain uniform after 5317 // vectorization. 5318 5319 // Global values, params and instructions outside of current loop are out of 5320 // scope. 5321 auto isOutOfScope = [&](Value *V) -> bool { 5322 Instruction *I = dyn_cast<Instruction>(V); 5323 return (!I || !TheLoop->contains(I)); 5324 }; 5325 5326 SetVector<Instruction *> Worklist; 5327 BasicBlock *Latch = TheLoop->getLoopLatch(); 5328 5329 // Instructions that are scalar with predication must not be considered 5330 // uniform after vectorization, because that would create an erroneous 5331 // replicating region where only a single instance out of VF should be formed. 5332 // TODO: optimize such seldom cases if found important, see PR40816. 5333 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5334 if (isOutOfScope(I)) { 5335 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5336 << *I << "\n"); 5337 return; 5338 } 5339 if (isScalarWithPredication(I, VF)) { 5340 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5341 << *I << "\n"); 5342 return; 5343 } 5344 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5345 Worklist.insert(I); 5346 }; 5347 5348 // Start with the conditional branch. If the branch condition is an 5349 // instruction contained in the loop that is only used by the branch, it is 5350 // uniform. 5351 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5352 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5353 addToWorklistIfAllowed(Cmp); 5354 5355 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5356 InstWidening WideningDecision = getWideningDecision(I, VF); 5357 assert(WideningDecision != CM_Unknown && 5358 "Widening decision should be ready at this moment"); 5359 5360 // A uniform memory op is itself uniform. We exclude uniform stores 5361 // here as they demand the last lane, not the first one. 
5362 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5363 assert(WideningDecision == CM_Scalarize); 5364 return true; 5365 } 5366 5367 return (WideningDecision == CM_Widen || 5368 WideningDecision == CM_Widen_Reverse || 5369 WideningDecision == CM_Interleave); 5370 }; 5371 5372 5373 // Returns true if Ptr is the pointer operand of a memory access instruction 5374 // I, and I is known to not require scalarization. 5375 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5376 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5377 }; 5378 5379 // Holds a list of values which are known to have at least one uniform use. 5380 // Note that there may be other uses which aren't uniform. A "uniform use" 5381 // here is something which only demands lane 0 of the unrolled iterations; 5382 // it does not imply that all lanes produce the same value (e.g. this is not 5383 // the usual meaning of uniform) 5384 SmallPtrSet<Value *, 8> HasUniformUse; 5385 5386 // Scan the loop for instructions which are either a) known to have only 5387 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5388 for (auto *BB : TheLoop->blocks()) 5389 for (auto &I : *BB) { 5390 // If there's no pointer operand, there's nothing to do. 5391 auto *Ptr = getLoadStorePointerOperand(&I); 5392 if (!Ptr) 5393 continue; 5394 5395 // A uniform memory op is itself uniform. We exclude uniform stores 5396 // here as they demand the last lane, not the first one. 5397 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5398 addToWorklistIfAllowed(&I); 5399 5400 if (isUniformDecision(&I, VF)) { 5401 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5402 HasUniformUse.insert(Ptr); 5403 } 5404 } 5405 5406 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5407 // demanding) users. Since loops are assumed to be in LCSSA form, this 5408 // disallows uses outside the loop as well. 5409 for (auto *V : HasUniformUse) { 5410 if (isOutOfScope(V)) 5411 continue; 5412 auto *I = cast<Instruction>(V); 5413 auto UsersAreMemAccesses = 5414 llvm::all_of(I->users(), [&](User *U) -> bool { 5415 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5416 }); 5417 if (UsersAreMemAccesses) 5418 addToWorklistIfAllowed(I); 5419 } 5420 5421 // Expand Worklist in topological order: whenever a new instruction 5422 // is added , its users should be already inside Worklist. It ensures 5423 // a uniform instruction will only be used by uniform instructions. 5424 unsigned idx = 0; 5425 while (idx != Worklist.size()) { 5426 Instruction *I = Worklist[idx++]; 5427 5428 for (auto OV : I->operand_values()) { 5429 // isOutOfScope operands cannot be uniform instructions. 5430 if (isOutOfScope(OV)) 5431 continue; 5432 // First order recurrence Phi's should typically be considered 5433 // non-uniform. 5434 auto *OP = dyn_cast<PHINode>(OV); 5435 if (OP && Legal->isFirstOrderRecurrence(OP)) 5436 continue; 5437 // If all the users of the operand are uniform, then add the 5438 // operand into the uniform worklist. 5439 auto *OI = cast<Instruction>(OV); 5440 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5441 auto *J = cast<Instruction>(U); 5442 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5443 })) 5444 addToWorklistIfAllowed(OI); 5445 } 5446 } 5447 5448 // For an instruction to be added into Worklist above, all its users inside 5449 // the loop should also be in Worklist. 
However, this condition cannot be
5450 // true for phi nodes that form a cyclic dependence. We must process phi
5451 // nodes separately. An induction variable will remain uniform if all users
5452 // of the induction variable and induction variable update remain uniform.
5453 // The code below handles both pointer and non-pointer induction variables.
5454 for (auto &Induction : Legal->getInductionVars()) {
5455 auto *Ind = Induction.first;
5456 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5457
5458 // Determine if all users of the induction variable are uniform after
5459 // vectorization.
5460 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5461 auto *I = cast<Instruction>(U);
5462 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5463 isVectorizedMemAccessUse(I, Ind);
5464 });
5465 if (!UniformInd)
5466 continue;
5467
5468 // Determine if all users of the induction variable update instruction are
5469 // uniform after vectorization.
5470 auto UniformIndUpdate =
5471 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5472 auto *I = cast<Instruction>(U);
5473 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5474 isVectorizedMemAccessUse(I, IndUpdate);
5475 });
5476 if (!UniformIndUpdate)
5477 continue;
5478
5479 // The induction variable and its update instruction will remain uniform.
5480 addToWorklistIfAllowed(Ind);
5481 addToWorklistIfAllowed(IndUpdate);
5482 }
5483
5484 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5485 }
5486
5487 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5488 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5489
5490 if (Legal->getRuntimePointerChecking()->Need) {
5491 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5492 "runtime pointer checks needed. Enable vectorization of this "
5493 "loop with '#pragma clang loop vectorize(enable)' when "
5494 "compiling with -Os/-Oz",
5495 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5496 return true;
5497 }
5498
5499 if (!PSE.getUnionPredicate().getPredicates().empty()) {
5500 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5501 "runtime SCEV checks needed. Enable vectorization of this "
5502 "loop with '#pragma clang loop vectorize(enable)' when "
5503 "compiling with -Os/-Oz",
5504 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5505 return true;
5506 }
5507
5508 // FIXME: Avoid specializing for stride==1 instead of bailing out.
5509 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5510 reportVectorizationFailure("Runtime stride check for small trip count",
5511 "runtime stride == 1 checks needed. Enable vectorization of "
5512 "this loop without such check by compiling with -Os/-Oz",
5513 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5514 return true;
5515 }
5516
5517 return false;
5518 }
5519
5520 Optional<ElementCount>
5521 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5522 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5523 // TODO: It may be useful to do this, since it's still likely to be dynamically
5524 // uniform if the target can skip.
5525 reportVectorizationFailure(
5526 "Not inserting runtime ptr check for divergent target",
5527 "runtime pointer checks needed. Not enabled for divergent target",
5528 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5529 return None;
5530 }
5531
5532 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5533 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5534 if (TC == 1) {
5535 reportVectorizationFailure("Single iteration (non) loop",
5536 "loop trip count is one, irrelevant for vectorization",
5537 "SingleIterationLoop", ORE, TheLoop);
5538 return None;
5539 }
5540
5541 switch (ScalarEpilogueStatus) {
5542 case CM_ScalarEpilogueAllowed:
5543 return computeFeasibleMaxVF(TC, UserVF);
5544 case CM_ScalarEpilogueNotAllowedUsePredicate:
5545 LLVM_FALLTHROUGH;
5546 case CM_ScalarEpilogueNotNeededUsePredicate:
5547 LLVM_DEBUG(
5548 dbgs() << "LV: vector predicate hint/switch found.\n"
5549 << "LV: Not allowing scalar epilogue, creating predicated "
5550 << "vector loop.\n");
5551 break;
5552 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5553 // fallthrough as a special case of OptForSize
5554 case CM_ScalarEpilogueNotAllowedOptSize:
5555 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5556 LLVM_DEBUG(
5557 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5558 else
5559 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5560 << "count.\n");
5561
5562 // Bail if runtime checks are required, which are not good when optimising
5563 // for size.
5564 if (runtimeChecksRequired())
5565 return None;
5566
5567 break;
5568 }
5569
5570 // The only loops we can vectorize without a scalar epilogue are loops with
5571 // a bottom-test and a single exiting block. We'd have to handle the fact
5572 // that not every instruction executes on the last iteration. This will
5573 // require a lane mask which varies through the vector loop body. (TODO)
5574 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5575 // If there was a tail-folding hint/switch, but we can't fold the tail by
5576 // masking, fallback to a vectorization with a scalar epilogue.
5577 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5578 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5579 "scalar epilogue instead.\n");
5580 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5581 return computeFeasibleMaxVF(TC, UserVF);
5582 }
5583 return None;
5584 }
5585
5586 // Now try the tail folding
5587
5588 // Invalidate interleave groups that require an epilogue if we can't mask
5589 // the interleave-group.
5590 if (!useMaskedInterleavedAccesses(TTI)) {
5591 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5592 "No decisions should have been taken at this point");
5593 // Note: There is no need to invalidate any cost modeling decisions here, as
5594 // none were taken so far.
5595 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5596 }
5597
5598 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5599 assert(!MaxVF.isScalable() &&
5600 "Scalable vectors do not yet support tail folding");
5601 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5602 "MaxVF must be a power of 2");
5603 unsigned MaxVFtimesIC =
5604 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5605 // Avoid tail folding if the trip count is known to be a multiple of any VF we
5606 // chose.
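  // (Worked example with assumed numbers: if the exit count is known to be 64
  // and MaxVF is 8 with a user-requested interleave count of 2, then
  // MaxVFtimesIC is 16 and 64 urem 16 == 0, so no tail remains and tail
  // folding is unnecessary.)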
5607 ScalarEvolution *SE = PSE.getSE(); 5608 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5609 const SCEV *ExitCount = SE->getAddExpr( 5610 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5611 const SCEV *Rem = SE->getURemExpr( 5612 SE->applyLoopGuards(ExitCount, TheLoop), 5613 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5614 if (Rem->isZero()) { 5615 // Accept MaxVF if we do not have a tail. 5616 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5617 return MaxVF; 5618 } 5619 5620 // If we don't know the precise trip count, or if the trip count that we 5621 // found modulo the vectorization factor is not zero, try to fold the tail 5622 // by masking. 5623 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5624 if (Legal->prepareToFoldTailByMasking()) { 5625 FoldTailByMasking = true; 5626 return MaxVF; 5627 } 5628 5629 // If there was a tail-folding hint/switch, but we can't fold the tail by 5630 // masking, fallback to a vectorization with a scalar epilogue. 5631 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5632 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5633 "scalar epilogue instead.\n"); 5634 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5635 return MaxVF; 5636 } 5637 5638 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5639 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5640 return None; 5641 } 5642 5643 if (TC == 0) { 5644 reportVectorizationFailure( 5645 "Unable to calculate the loop count due to complex control flow", 5646 "unable to calculate the loop count due to complex control flow", 5647 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5648 return None; 5649 } 5650 5651 reportVectorizationFailure( 5652 "Cannot optimize for size and vectorize at the same time.", 5653 "cannot optimize for size and vectorize at the same time. " 5654 "Enable vectorization of this loop with '#pragma clang loop " 5655 "vectorize(enable)' when compiling with -Os/-Oz", 5656 "NoTailLoopWithOptForSize", ORE, TheLoop); 5657 return None; 5658 } 5659 5660 ElementCount 5661 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5662 ElementCount UserVF) { 5663 bool IgnoreScalableUserVF = UserVF.isScalable() && 5664 !TTI.supportsScalableVectors() && 5665 !ForceTargetSupportsScalableVectors; 5666 if (IgnoreScalableUserVF) { 5667 LLVM_DEBUG( 5668 dbgs() << "LV: Ignoring VF=" << UserVF 5669 << " because target does not support scalable vectors.\n"); 5670 ORE->emit([&]() { 5671 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5672 TheLoop->getStartLoc(), 5673 TheLoop->getHeader()) 5674 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5675 << " because target does not support scalable vectors."; 5676 }); 5677 } 5678 5679 // Beyond this point two scenarios are handled. If UserVF isn't specified 5680 // then a suitable VF is chosen. If UserVF is specified and there are 5681 // dependencies, check if it's legal. However, if a UserVF is specified and 5682 // there are no dependencies, then there's nothing to do. 5683 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5684 if (!canVectorizeReductions(UserVF)) { 5685 reportVectorizationFailure( 5686 "LV: Scalable vectorization not supported for the reduction " 5687 "operations found in this loop. 
Using fixed-width " 5688 "vectorization instead.", 5689 "Scalable vectorization not supported for the reduction operations " 5690 "found in this loop. Using fixed-width vectorization instead.", 5691 "ScalableVFUnfeasible", ORE, TheLoop); 5692 return computeFeasibleMaxVF( 5693 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5694 } 5695 5696 if (Legal->isSafeForAnyVectorWidth()) 5697 return UserVF; 5698 } 5699 5700 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5701 unsigned SmallestType, WidestType; 5702 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5703 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5704 5705 // Get the maximum safe dependence distance in bits computed by LAA. 5706 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5707 // the memory accesses that is most restrictive (involved in the smallest 5708 // dependence distance). 5709 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5710 5711 // If the user vectorization factor is legally unsafe, clamp it to a safe 5712 // value. Otherwise, return as is. 5713 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5714 unsigned MaxSafeElements = 5715 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5716 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5717 5718 if (UserVF.isScalable()) { 5719 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5720 5721 // Scale VF by vscale before checking if it's safe. 5722 MaxSafeVF = ElementCount::getScalable( 5723 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5724 5725 if (MaxSafeVF.isZero()) { 5726 // The dependence distance is too small to use scalable vectors, 5727 // fallback on fixed. 5728 LLVM_DEBUG( 5729 dbgs() 5730 << "LV: Max legal vector width too small, scalable vectorization " 5731 "unfeasible. Using fixed-width vectorization instead.\n"); 5732 ORE->emit([&]() { 5733 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5734 TheLoop->getStartLoc(), 5735 TheLoop->getHeader()) 5736 << "Max legal vector width too small, scalable vectorization " 5737 << "unfeasible. Using fixed-width vectorization instead."; 5738 }); 5739 return computeFeasibleMaxVF( 5740 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5741 } 5742 } 5743 5744 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5745 5746 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5747 return UserVF; 5748 5749 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5750 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5751 << ".\n"); 5752 ORE->emit([&]() { 5753 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5754 TheLoop->getStartLoc(), 5755 TheLoop->getHeader()) 5756 << "User-specified vectorization factor " 5757 << ore::NV("UserVectorizationFactor", UserVF) 5758 << " is unsafe, clamping to maximum safe vectorization factor " 5759 << ore::NV("VectorizationFactor", MaxSafeVF); 5760 }); 5761 return MaxSafeVF; 5762 } 5763 5764 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5765 5766 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5767 // Note that both WidestRegister and WidestType may not be a powers of 2. 
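  // (Worked example with assumed numbers: a 256-bit widest register and a
  // widest element type of 24 bits give 256 / 24 = 10, and PowerOf2Floor(10)
  // yields a MaxVectorSize of 8 lanes.)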
5768 auto MaxVectorSize = 5769 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); 5770 5771 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5772 << " / " << WidestType << " bits.\n"); 5773 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5774 << WidestRegister << " bits.\n"); 5775 5776 assert(MaxVectorSize.getFixedValue() <= WidestRegister && 5777 "Did not expect to pack so many elements" 5778 " into one vector!"); 5779 if (MaxVectorSize.getFixedValue() == 0) { 5780 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5781 return ElementCount::getFixed(1); 5782 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && 5783 isPowerOf2_32(ConstTripCount)) { 5784 // We need to clamp the VF to be the ConstTripCount. There is no point in 5785 // choosing a higher viable VF as done in the loop below. 5786 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5787 << ConstTripCount << "\n"); 5788 return ElementCount::getFixed(ConstTripCount); 5789 } 5790 5791 ElementCount MaxVF = MaxVectorSize; 5792 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5793 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5794 // Collect all viable vectorization factors larger than the default MaxVF 5795 // (i.e. MaxVectorSize). 5796 SmallVector<ElementCount, 8> VFs; 5797 auto MaxVectorSizeMaxBW = 5798 ElementCount::getFixed(WidestRegister / SmallestType); 5799 for (ElementCount VS = MaxVectorSize * 2; 5800 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) 5801 VFs.push_back(VS); 5802 5803 // For each VF calculate its register usage. 5804 auto RUs = calculateRegisterUsage(VFs); 5805 5806 // Select the largest VF which doesn't require more registers than existing 5807 // ones. 5808 for (int i = RUs.size() - 1; i >= 0; --i) { 5809 bool Selected = true; 5810 for (auto &pair : RUs[i].MaxLocalUsers) { 5811 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5812 if (pair.second > TargetNumRegisters) 5813 Selected = false; 5814 } 5815 if (Selected) { 5816 MaxVF = VFs[i]; 5817 break; 5818 } 5819 } 5820 if (ElementCount MinVF = 5821 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { 5822 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5823 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5824 << ") with target's minimum: " << MinVF << '\n'); 5825 MaxVF = MinVF; 5826 } 5827 } 5828 } 5829 return MaxVF; 5830 } 5831 5832 VectorizationFactor 5833 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5834 // FIXME: This can be fixed for scalable vectors later, because at this stage 5835 // the LoopVectorizer will only consider vectorizing a loop with scalable 5836 // vectors when the loop has a hint to enable vectorization for a given VF. 5837 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5838 5839 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5840 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5841 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5842 5843 auto Width = ElementCount::getFixed(1); 5844 const float ScalarCost = *ExpectedCost.getValue(); 5845 float Cost = ScalarCost; 5846 5847 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5848 if (ForceVectorization && MaxVF.isVector()) { 5849 // Ignore scalar width, because the user explicitly wants vectorization. 
5850 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5851 // evaluation.
5852 Cost = std::numeric_limits<float>::max();
5853 }
5854
5855 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
5856 i *= 2) {
5857 // Notice that the vector loop needs to be executed fewer times, so
5858 // we need to divide the cost of the vector loops by the width of
5859 // the vector elements.
5860 VectorizationCostTy C = expectedCost(i);
5861 assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
5862 float VectorCost = *C.first.getValue() / (float)i.getFixedValue();
5863 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5864 << " costs: " << (int)VectorCost << ".\n");
5865 if (!C.second && !ForceVectorization) {
5866 LLVM_DEBUG(
5867 dbgs() << "LV: Not considering vector loop of width " << i
5868 << " because it will not generate any vector instructions.\n");
5869 continue;
5870 }
5871
5872 // If profitable, add it to the ProfitableVFs list.
5873 if (VectorCost < ScalarCost) {
5874 ProfitableVFs.push_back(VectorizationFactor(
5875 {i, (unsigned)VectorCost}));
5876 }
5877
5878 if (VectorCost < Cost) {
5879 Cost = VectorCost;
5880 Width = i;
5881 }
5882 }
5883
5884 if (!EnableCondStoresVectorization && NumPredStores) {
5885 reportVectorizationFailure("There are conditional stores.",
5886 "store that is conditionally executed prevents vectorization",
5887 "ConditionalStore", ORE, TheLoop);
5888 Width = ElementCount::getFixed(1);
5889 Cost = ScalarCost;
5890 }
5891
5892 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs()
5893 << "LV: Vectorization seems to be not beneficial, "
5894 << "but was forced by a user.\n");
5895 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5896 VectorizationFactor Factor = {Width,
5897 (unsigned)(Width.getKnownMinValue() * Cost)};
5898 return Factor;
5899 }
5900
5901 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5902 const Loop &L, ElementCount VF) const {
5903 // Cross iteration phis such as reductions need special handling and are
5904 // currently unsupported.
5905 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5906 return Legal->isFirstOrderRecurrence(&Phi) ||
5907 Legal->isReductionVariable(&Phi);
5908 }))
5909 return false;
5910
5911 // Phis with uses outside of the loop require special handling and are
5912 // currently unsupported.
5913 for (auto &Entry : Legal->getInductionVars()) {
5914 // Look for uses of the value of the induction at the last iteration.
5915 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5916 for (User *U : PostInc->users())
5917 if (!L.contains(cast<Instruction>(U)))
5918 return false;
5919 // Look for uses of the penultimate value of the induction.
5920 for (User *U : Entry.first->users())
5921 if (!L.contains(cast<Instruction>(U)))
5922 return false;
5923 }
5924
5925 // Induction variables that are widened require special handling that is
5926 // currently not supported.
5927 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5928 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5929 this->isProfitableToScalarize(Entry.first, VF)); 5930 })) 5931 return false; 5932 5933 return true; 5934 } 5935 5936 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5937 const ElementCount VF) const { 5938 // FIXME: We need a much better cost-model to take different parameters such 5939 // as register pressure, code size increase and cost of extra branches into 5940 // account. For now we apply a very crude heuristic and only consider loops 5941 // with vectorization factors larger than a certain value. 5942 // We also consider epilogue vectorization unprofitable for targets that don't 5943 // consider interleaving beneficial (eg. MVE). 5944 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5945 return false; 5946 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5947 return true; 5948 return false; 5949 } 5950 5951 VectorizationFactor 5952 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5953 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5954 VectorizationFactor Result = VectorizationFactor::Disabled(); 5955 if (!EnableEpilogueVectorization) { 5956 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5957 return Result; 5958 } 5959 5960 if (!isScalarEpilogueAllowed()) { 5961 LLVM_DEBUG( 5962 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5963 "allowed.\n";); 5964 return Result; 5965 } 5966 5967 // FIXME: This can be fixed for scalable vectors later, because at this stage 5968 // the LoopVectorizer will only consider vectorizing a loop with scalable 5969 // vectors when the loop has a hint to enable vectorization for a given VF. 5970 if (MainLoopVF.isScalable()) { 5971 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5972 "yet supported.\n"); 5973 return Result; 5974 } 5975 5976 // Not really a cost consideration, but check for unsupported cases here to 5977 // simplify the logic. 
5978 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5979 LLVM_DEBUG( 5980 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5981 "not a supported candidate.\n";); 5982 return Result; 5983 } 5984 5985 if (EpilogueVectorizationForceVF > 1) { 5986 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5987 if (LVP.hasPlanWithVFs( 5988 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5989 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5990 else { 5991 LLVM_DEBUG( 5992 dbgs() 5993 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5994 return Result; 5995 } 5996 } 5997 5998 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5999 TheLoop->getHeader()->getParent()->hasMinSize()) { 6000 LLVM_DEBUG( 6001 dbgs() 6002 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6003 return Result; 6004 } 6005 6006 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6007 return Result; 6008 6009 for (auto &NextVF : ProfitableVFs) 6010 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6011 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 6012 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6013 Result = NextVF; 6014 6015 if (Result != VectorizationFactor::Disabled()) 6016 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6017 << Result.Width.getFixedValue() << "\n";); 6018 return Result; 6019 } 6020 6021 std::pair<unsigned, unsigned> 6022 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6023 unsigned MinWidth = -1U; 6024 unsigned MaxWidth = 8; 6025 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6026 6027 // For each block. 6028 for (BasicBlock *BB : TheLoop->blocks()) { 6029 // For each instruction in the loop. 6030 for (Instruction &I : BB->instructionsWithoutDebug()) { 6031 Type *T = I.getType(); 6032 6033 // Skip ignored values. 6034 if (ValuesToIgnore.count(&I)) 6035 continue; 6036 6037 // Only examine Loads, Stores and PHINodes. 6038 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6039 continue; 6040 6041 // Examine PHI nodes that are reduction variables. Update the type to 6042 // account for the recurrence type. 6043 if (auto *PN = dyn_cast<PHINode>(&I)) { 6044 if (!Legal->isReductionVariable(PN)) 6045 continue; 6046 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6047 if (PreferInLoopReductions || 6048 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6049 RdxDesc.getRecurrenceType(), 6050 TargetTransformInfo::ReductionFlags())) 6051 continue; 6052 T = RdxDesc.getRecurrenceType(); 6053 } 6054 6055 // Examine the stored values. 6056 if (auto *ST = dyn_cast<StoreInst>(&I)) 6057 T = ST->getValueOperand()->getType(); 6058 6059 // Ignore loaded pointer types and stored pointer types that are not 6060 // vectorizable. 6061 // 6062 // FIXME: The check here attempts to predict whether a load or store will 6063 // be vectorized. We only know this for certain after a VF has 6064 // been selected. Here, we assume that if an access can be 6065 // vectorized, it will be. We should also look at extending this 6066 // optimization to non-pointer types. 
6067 //
6068 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6069 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6070 continue;
6071
6072 MinWidth = std::min(MinWidth,
6073 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6074 MaxWidth = std::max(MaxWidth,
6075 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6076 }
6077 }
6078
6079 return {MinWidth, MaxWidth};
6080 }
6081
6082 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6083 unsigned LoopCost) {
6084 // -- The interleave heuristics --
6085 // We interleave the loop in order to expose ILP and reduce the loop overhead.
6086 // There are many micro-architectural considerations that we can't predict
6087 // at this level. For example, frontend pressure (on decode or fetch) due to
6088 // code size, or the number and capabilities of the execution ports.
6089 //
6090 // We use the following heuristics to select the interleave count:
6091 // 1. If the code has reductions, then we interleave to break the cross
6092 // iteration dependency.
6093 // 2. If the loop is really small, then we interleave to reduce the loop
6094 // overhead.
6095 // 3. We don't interleave if we think that we will spill registers to memory
6096 // due to the increased register pressure.
6097
6098 if (!isScalarEpilogueAllowed())
6099 return 1;
6100
6101 // Do not interleave if there is a finite maximum safe dependence distance; it already limits how many iterations can safely be combined.
6102 if (Legal->getMaxSafeDepDistBytes() != -1U)
6103 return 1;
6104
6105 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6106 const bool HasReductions = !Legal->getReductionVars().empty();
6107 // Do not interleave loops with a relatively small known or estimated trip
6108 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
6109 // enabled, and the code has scalar reductions (HasReductions && VF == 1),
6110 // because with the above conditions interleaving can expose ILP and break
6111 // cross iteration dependences for reductions.
6112 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6113 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6114 return 1;
6115
6116 RegisterUsage R = calculateRegisterUsage({VF})[0];
6117 // We divide by these constants so we assume that we have at least one
6118 // instruction that uses at least one register.
6119 for (auto& pair : R.MaxLocalUsers) {
6120 pair.second = std::max(pair.second, 1U);
6121 }
6122
6123 // We calculate the interleave count using the following formula.
6124 // Subtract the number of loop invariants from the number of available
6125 // registers. These registers are used by all of the interleaved instances.
6126 // Next, divide the remaining registers by the number of registers that is
6127 // required by the loop, in order to estimate how many parallel instances
6128 // fit without causing spills. All of this is rounded down if necessary to be
6129 // a power of two. We want a power of two interleave count to simplify any
6130 // addressing operations or alignment considerations.
6131 // We also want power of two interleave counts to ensure that the induction
6132 // variable of the vector loop wraps to zero, when tail is folded by masking;
6133 // this currently happens when OptForSize, in which case IC is set to 1 above.
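  // (Worked example with assumed numbers: with 32 available registers, 2 of
  // them consumed by loop-invariant values and at most 6 registers live at
  // once inside the loop, the estimate below is
  //   PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4
  // interleaved instances before spilling is expected.)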
6134 unsigned IC = UINT_MAX; 6135 6136 for (auto& pair : R.MaxLocalUsers) { 6137 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6138 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6139 << " registers of " 6140 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6141 if (VF.isScalar()) { 6142 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6143 TargetNumRegisters = ForceTargetNumScalarRegs; 6144 } else { 6145 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6146 TargetNumRegisters = ForceTargetNumVectorRegs; 6147 } 6148 unsigned MaxLocalUsers = pair.second; 6149 unsigned LoopInvariantRegs = 0; 6150 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6151 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6152 6153 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6154 // Don't count the induction variable as interleaved. 6155 if (EnableIndVarRegisterHeur) { 6156 TmpIC = 6157 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6158 std::max(1U, (MaxLocalUsers - 1))); 6159 } 6160 6161 IC = std::min(IC, TmpIC); 6162 } 6163 6164 // Clamp the interleave ranges to reasonable counts. 6165 unsigned MaxInterleaveCount = 6166 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6167 6168 // Check if the user has overridden the max. 6169 if (VF.isScalar()) { 6170 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6171 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6172 } else { 6173 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6174 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6175 } 6176 6177 // If trip count is known or estimated compile time constant, limit the 6178 // interleave count to be less than the trip count divided by VF, provided it 6179 // is at least 1. 6180 // 6181 // For scalable vectors we can't know if interleaving is beneficial. It may 6182 // not be beneficial for small loops if none of the lanes in the second vector 6183 // iterations is enabled. However, for larger loops, there is likely to be a 6184 // similar benefit as for fixed-width vectors. For now, we choose to leave 6185 // the InterleaveCount as if vscale is '1', although if some information about 6186 // the vector is known (e.g. min vector size), we can make a better decision. 6187 if (BestKnownTC) { 6188 MaxInterleaveCount = 6189 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6190 // Make sure MaxInterleaveCount is greater than 0. 6191 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6192 } 6193 6194 assert(MaxInterleaveCount > 0 && 6195 "Maximum interleave count must be greater than 0"); 6196 6197 // Clamp the calculated IC to be between the 1 and the max interleave count 6198 // that the target and trip count allows. 6199 if (IC > MaxInterleaveCount) 6200 IC = MaxInterleaveCount; 6201 else 6202 // Make sure IC is greater than 0. 6203 IC = std::max(1u, IC); 6204 6205 assert(IC > 0 && "Interleave count must be greater than 0."); 6206 6207 // If we did not calculate the cost for VF (because the user selected the VF) 6208 // then we calculate the cost of VF here. 6209 if (LoopCost == 0) { 6210 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6211 LoopCost = *expectedCost(VF).first.getValue(); 6212 } 6213 6214 assert(LoopCost && "Non-zero loop cost expected"); 6215 6216 // Interleave if we vectorized this loop and there is a reduction that could 6217 // benefit from interleaving. 
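  // (For example, interleaving a summation by 2 keeps two independent partial
  // sums in flight, shortening the cross-iteration dependence chain; the
  // partial sums are combined into a single result after the loop.)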
6218 if (VF.isVector() && HasReductions) { 6219 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6220 return IC; 6221 } 6222 6223 // Note that if we've already vectorized the loop we will have done the 6224 // runtime check and so interleaving won't require further checks. 6225 bool InterleavingRequiresRuntimePointerCheck = 6226 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6227 6228 // We want to interleave small loops in order to reduce the loop overhead and 6229 // potentially expose ILP opportunities. 6230 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6231 << "LV: IC is " << IC << '\n' 6232 << "LV: VF is " << VF << '\n'); 6233 const bool AggressivelyInterleaveReductions = 6234 TTI.enableAggressiveInterleaving(HasReductions); 6235 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6236 // We assume that the cost overhead is 1 and we use the cost model 6237 // to estimate the cost of the loop and interleave until the cost of the 6238 // loop overhead is about 5% of the cost of the loop. 6239 unsigned SmallIC = 6240 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6241 6242 // Interleave until store/load ports (estimated by max interleave count) are 6243 // saturated. 6244 unsigned NumStores = Legal->getNumStores(); 6245 unsigned NumLoads = Legal->getNumLoads(); 6246 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6247 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6248 6249 // If we have a scalar reduction (vector reductions are already dealt with 6250 // by this point), we can increase the critical path length if the loop 6251 // we're interleaving is inside another loop. Limit, by default to 2, so the 6252 // critical path only gets increased by one reduction operation. 6253 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6254 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6255 SmallIC = std::min(SmallIC, F); 6256 StoresIC = std::min(StoresIC, F); 6257 LoadsIC = std::min(LoadsIC, F); 6258 } 6259 6260 if (EnableLoadStoreRuntimeInterleave && 6261 std::max(StoresIC, LoadsIC) > SmallIC) { 6262 LLVM_DEBUG( 6263 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6264 return std::max(StoresIC, LoadsIC); 6265 } 6266 6267 // If there are scalar reductions and TTI has enabled aggressive 6268 // interleaving for reductions, we will interleave to expose ILP. 6269 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6270 AggressivelyInterleaveReductions) { 6271 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6272 // Interleave no less than SmallIC but not as aggressive as the normal IC 6273 // to satisfy the rare situation when resources are too limited. 6274 return std::max(IC / 2, SmallIC); 6275 } else { 6276 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6277 return SmallIC; 6278 } 6279 } 6280 6281 // Interleave if this is a large loop (small loops are already dealt with by 6282 // this point) that could benefit from interleaving. 6283 if (AggressivelyInterleaveReductions) { 6284 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6285 return IC; 6286 } 6287 6288 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6289 return 1; 6290 } 6291 6292 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6293 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6294 // This function calculates the register usage by measuring the highest number 6295 // of values that are alive at a single location. 
Obviously, this is a very
6296 // rough estimation. We scan the loop in topological order and
6297 // assign a number to each instruction. We use RPO to ensure that defs are
6298 // met before their users. We assume that each instruction that has in-loop
6299 // users starts an interval. We record every time that an in-loop value is
6300 // used, so we have a list of the first and last occurrences of each
6301 // instruction. Next, we transpose this data structure into a multi map that
6302 // holds the list of intervals that *end* at a specific location. This multi
6303 // map allows us to perform a linear search. We scan the instructions linearly
6304 // and record each time that a new interval starts, by placing it in a set.
6305 // If we find this value in the multi-map then we remove it from the set.
6306 // The max register usage is the maximum size of the set.
6307 // We also search for instructions that are defined outside the loop, but are
6308 // used inside the loop. We need this number separately from the max-interval
6309 // usage number because when we unroll, loop-invariant values do not take
6310 // more registers.
6311 LoopBlocksDFS DFS(TheLoop);
6312 DFS.perform(LI);
6313
6314 RegisterUsage RU;
6315
6316 // Each 'key' in the map opens a new interval. The values
6317 // of the map are the index of the 'last seen' usage of the
6318 // instruction that is the key.
6319 using IntervalMap = DenseMap<Instruction *, unsigned>;
6320
6321 // Maps an index to its instruction.
6322 SmallVector<Instruction *, 64> IdxToInstr;
6323 // Marks the end of each interval.
6324 IntervalMap EndPoint;
6325 // Saves the set of instructions that are used inside the loop.
6326 SmallPtrSet<Instruction *, 8> Ends;
6327 // Saves the list of values that are used in the loop but are
6328 // defined outside the loop, such as arguments and constants.
6329 SmallPtrSet<Value *, 8> LoopInvariants;
6330
6331 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6332 for (Instruction &I : BB->instructionsWithoutDebug()) {
6333 IdxToInstr.push_back(&I);
6334
6335 // Save the end location of each USE.
6336 for (Value *U : I.operands()) {
6337 auto *Instr = dyn_cast<Instruction>(U);
6338
6339 // Ignore non-instruction values such as arguments, constants, etc.
6340 if (!Instr)
6341 continue;
6342
6343 // If this instruction is outside the loop then record it and continue.
6344 if (!TheLoop->contains(Instr)) {
6345 LoopInvariants.insert(Instr);
6346 continue;
6347 }
6348
6349 // Overwrite previous end points.
6350 EndPoint[Instr] = IdxToInstr.size();
6351 Ends.insert(Instr);
6352 }
6353 }
6354 }
6355
6356 // Saves the list of intervals that end at the index in 'key'.
6357 using InstrList = SmallVector<Instruction *, 2>;
6358 DenseMap<unsigned, InstrList> TransposeEnds;
6359
6360 // Transpose the EndPoints to a list of values that end at each index.
6361 for (auto &Interval : EndPoint)
6362 TransposeEnds[Interval.second].push_back(Interval.first);
6363
6364 SmallPtrSet<Instruction *, 8> OpenIntervals;
6365 SmallVector<RegisterUsage, 8> RUs(VFs.size());
6366 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6367
6368 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6369
6370 // A lambda that gets the register usage for the given type and VF.
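  // (E.g. an i32 value at VF = 4 is queried as <4 x i32>; the target reports
  // how many registers such a vector occupies, which need not be 1 when the
  // vector is wider than the target's registers.)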
6371 const auto &TTICapture = TTI; 6372 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6373 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6374 return 0U; 6375 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6376 }; 6377 6378 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6379 Instruction *I = IdxToInstr[i]; 6380 6381 // Remove all of the instructions that end at this location. 6382 InstrList &List = TransposeEnds[i]; 6383 for (Instruction *ToRemove : List) 6384 OpenIntervals.erase(ToRemove); 6385 6386 // Ignore instructions that are never used within the loop. 6387 if (!Ends.count(I)) 6388 continue; 6389 6390 // Skip ignored values. 6391 if (ValuesToIgnore.count(I)) 6392 continue; 6393 6394 // For each VF find the maximum usage of registers. 6395 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6396 // Count the number of live intervals. 6397 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6398 6399 if (VFs[j].isScalar()) { 6400 for (auto Inst : OpenIntervals) { 6401 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6402 if (RegUsage.find(ClassID) == RegUsage.end()) 6403 RegUsage[ClassID] = 1; 6404 else 6405 RegUsage[ClassID] += 1; 6406 } 6407 } else { 6408 collectUniformsAndScalars(VFs[j]); 6409 for (auto Inst : OpenIntervals) { 6410 // Skip ignored values for VF > 1. 6411 if (VecValuesToIgnore.count(Inst)) 6412 continue; 6413 if (isScalarAfterVectorization(Inst, VFs[j])) { 6414 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6415 if (RegUsage.find(ClassID) == RegUsage.end()) 6416 RegUsage[ClassID] = 1; 6417 else 6418 RegUsage[ClassID] += 1; 6419 } else { 6420 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6421 if (RegUsage.find(ClassID) == RegUsage.end()) 6422 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6423 else 6424 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6425 } 6426 } 6427 } 6428 6429 for (auto& pair : RegUsage) { 6430 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6431 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6432 else 6433 MaxUsages[j][pair.first] = pair.second; 6434 } 6435 } 6436 6437 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6438 << OpenIntervals.size() << '\n'); 6439 6440 // Add the current instruction to the list of open intervals. 6441 OpenIntervals.insert(I); 6442 } 6443 6444 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6445 SmallMapVector<unsigned, unsigned, 4> Invariant; 6446 6447 for (auto Inst : LoopInvariants) { 6448 unsigned Usage = 6449 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]);
6450 unsigned ClassID =
6451 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6452 if (Invariant.find(ClassID) == Invariant.end())
6453 Invariant[ClassID] = Usage;
6454 else
6455 Invariant[ClassID] += Usage;
6456 }
6457
6458 LLVM_DEBUG({
6459 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6460 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6461 << " item\n";
6462 for (const auto &pair : MaxUsages[i]) {
6463 dbgs() << "LV(REG): RegisterClass: "
6464 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6465 << " registers\n";
6466 }
6467 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6468 << " item\n";
6469 for (const auto &pair : Invariant) {
6470 dbgs() << "LV(REG): RegisterClass: "
6471 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6472 << " registers\n";
6473 }
6474 });
6475
6476 RU.LoopInvariantRegs = Invariant;
6477 RU.MaxLocalUsers = MaxUsages[i];
6478 RUs[i] = RU;
6479 }
6480
6481 return RUs;
6482 }
6483
6484 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6485 // TODO: Cost model for emulated masked load/store is completely
6486 // broken. This hack guides the cost model to use an artificially
6487 // high enough value to practically disable vectorization with such
6488 // operations, except where the previously deployed legality hack allowed
6489 // using very low cost values. This is to avoid regressions coming simply
6490 // from moving the "masked load/store" check from legality to the cost model.
6491 // Masked Load/Gather emulation was previously never allowed.
6492 // Only a limited amount of Masked Store/Scatter emulation was allowed.
6493 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6494 return isa<LoadInst>(I) ||
6495 (isa<StoreInst>(I) &&
6496 NumPredStores > NumberOfStoresToPredicate);
6497 }
6498
6499 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6500 // If we aren't vectorizing the loop, or if we've already collected the
6501 // instructions to scalarize, there's nothing to do. Collection may already
6502 // have occurred if we have a user-selected VF and are now computing the
6503 // expected cost for interleaving.
6504 if (VF.isScalar() || VF.isZero() ||
6505 InstsToScalarize.find(VF) != InstsToScalarize.end())
6506 return;
6507
6508 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6509 // not profitable to scalarize any instructions, the presence of VF in the
6510 // map will indicate that we've analyzed it already.
6511 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6512
6513 // Find all the instructions that are scalar with predication in the loop and
6514 // determine if it would be better to not if-convert the blocks they are in.
6515 // If so, we also record the instructions to scalarize.
6516 for (BasicBlock *BB : TheLoop->blocks()) {
6517 if (!blockNeedsPredication(BB))
6518 continue;
6519 for (Instruction &I : *BB)
6520 if (isScalarWithPredication(&I)) {
6521 ScalarCostsTy ScalarCosts;
6522 // Do not apply discount logic if hacked cost is needed
6523 // for emulated masked memrefs.
6524 if (!useEmulatedMaskMemRefHack(&I) &&
6525 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6526 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6527 // Remember that BB will remain after vectorization.
6528 PredicatedBBsAfterVectorization.insert(BB); 6529 } 6530 } 6531 } 6532 6533 int LoopVectorizationCostModel::computePredInstDiscount( 6534 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6535 assert(!isUniformAfterVectorization(PredInst, VF) && 6536 "Instruction marked uniform-after-vectorization will be predicated"); 6537 6538 // Initialize the discount to zero, meaning that the scalar version and the 6539 // vector version cost the same. 6540 InstructionCost Discount = 0; 6541 6542 // Holds instructions to analyze. The instructions we visit are mapped in 6543 // ScalarCosts. Those instructions are the ones that would be scalarized if 6544 // we find that the scalar version costs less. 6545 SmallVector<Instruction *, 8> Worklist; 6546 6547 // Returns true if the given instruction can be scalarized. 6548 auto canBeScalarized = [&](Instruction *I) -> bool { 6549 // We only attempt to scalarize instructions forming a single-use chain 6550 // from the original predicated block that would otherwise be vectorized. 6551 // Although not strictly necessary, we give up on instructions we know will 6552 // already be scalar to avoid traversing chains that are unlikely to be 6553 // beneficial. 6554 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6555 isScalarAfterVectorization(I, VF)) 6556 return false; 6557 6558 // If the instruction is scalar with predication, it will be analyzed 6559 // separately. We ignore it within the context of PredInst. 6560 if (isScalarWithPredication(I)) 6561 return false; 6562 6563 // If any of the instruction's operands are uniform after vectorization, 6564 // the instruction cannot be scalarized. This prevents, for example, a 6565 // masked load from being scalarized. 6566 // 6567 // We assume we will only emit a value for lane zero of an instruction 6568 // marked uniform after vectorization, rather than VF identical values. 6569 // Thus, if we scalarize an instruction that uses a uniform, we would 6570 // create uses of values corresponding to the lanes we aren't emitting code 6571 // for. This behavior can be changed by allowing getScalarValue to clone 6572 // the lane zero values for uniforms rather than asserting. 6573 for (Use &U : I->operands()) 6574 if (auto *J = dyn_cast<Instruction>(U.get())) 6575 if (isUniformAfterVectorization(J, VF)) 6576 return false; 6577 6578 // Otherwise, we can scalarize the instruction. 6579 return true; 6580 }; 6581 6582 // Compute the expected cost discount from scalarizing the entire expression 6583 // feeding the predicated instruction. We currently only consider expressions 6584 // that are single-use instruction chains. 6585 Worklist.push_back(PredInst); 6586 while (!Worklist.empty()) { 6587 Instruction *I = Worklist.pop_back_val(); 6588 6589 // If we've already analyzed the instruction, there's nothing to do. 6590 if (ScalarCosts.find(I) != ScalarCosts.end()) 6591 continue; 6592 6593 // Compute the cost of the vector instruction. Note that this cost already 6594 // includes the scalarization overhead of the predicated instruction. 6595 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6596 6597 // Compute the cost of the scalarized instruction. This cost is the cost of 6598 // the instruction as if it wasn't if-converted and instead remained in the 6599 // predicated block. We will scale this cost by block probability after 6600 // computing the scalarization overhead. 
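  // (E.g. at VF = 4 the scalarized form is approximated as 4 copies of the
  // scalar instruction, plus the insert/extract and phi overhead added below,
  // before being scaled by the block probability.)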
6601 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6602 InstructionCost ScalarCost = 6603 VF.getKnownMinValue() * 6604 getInstructionCost(I, ElementCount::getFixed(1)).first; 6605 6606 // Compute the scalarization overhead of needed insertelement instructions 6607 // and phi nodes. 6608 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6609 ScalarCost += TTI.getScalarizationOverhead( 6610 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6611 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6612 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6613 ScalarCost += 6614 VF.getKnownMinValue() * 6615 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6616 } 6617 6618 // Compute the scalarization overhead of needed extractelement 6619 // instructions. For each of the instruction's operands, if the operand can 6620 // be scalarized, add it to the worklist; otherwise, account for the 6621 // overhead. 6622 for (Use &U : I->operands()) 6623 if (auto *J = dyn_cast<Instruction>(U.get())) { 6624 assert(VectorType::isValidElementType(J->getType()) && 6625 "Instruction has non-scalar type"); 6626 if (canBeScalarized(J)) 6627 Worklist.push_back(J); 6628 else if (needsExtract(J, VF)) { 6629 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6630 ScalarCost += TTI.getScalarizationOverhead( 6631 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6632 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6633 } 6634 } 6635 6636 // Scale the total scalar cost by block probability. 6637 ScalarCost /= getReciprocalPredBlockProb(); 6638 6639 // Compute the discount. A non-negative discount means the vector version 6640 // of the instruction costs more, and scalarizing would be beneficial. 6641 Discount += VectorCost - ScalarCost; 6642 ScalarCosts[I] = ScalarCost; 6643 } 6644 6645 return *Discount.getValue(); 6646 } 6647 6648 LoopVectorizationCostModel::VectorizationCostTy 6649 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6650 VectorizationCostTy Cost; 6651 6652 // For each block. 6653 for (BasicBlock *BB : TheLoop->blocks()) { 6654 VectorizationCostTy BlockCost; 6655 6656 // For each instruction in the old loop. 6657 for (Instruction &I : BB->instructionsWithoutDebug()) { 6658 // Skip ignored values. 6659 if (ValuesToIgnore.count(&I) || 6660 (VF.isVector() && VecValuesToIgnore.count(&I))) 6661 continue; 6662 6663 VectorizationCostTy C = getInstructionCost(&I, VF); 6664 6665 // Check if we should override the cost. 6666 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6667 C.first = InstructionCost(ForceTargetInstructionCost); 6668 6669 BlockCost.first += C.first; 6670 BlockCost.second |= C.second; 6671 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6672 << " for VF " << VF << " For instruction: " << I 6673 << '\n'); 6674 } 6675 6676 // If we are vectorizing a predicated block, it will have been 6677 // if-converted. This means that the block's instructions (aside from 6678 // stores and instructions that may divide by zero) will now be 6679 // unconditionally executed. For the scalar case, we may not always execute 6680 // the predicated block, if it is an if-else block. Thus, scale the block's 6681 // cost by the probability of executing it. blockNeedsPredication from 6682 // Legal is used so as to not include all blocks in tail folded loops. 
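  // (E.g. with a reciprocal block probability of 2, a predicated block's
  // scalar cost is halved, reflecting the assumption that it executes on only
  // about half of the iterations.)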
6683 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6684 BlockCost.first /= getReciprocalPredBlockProb(); 6685 6686 Cost.first += BlockCost.first; 6687 Cost.second |= BlockCost.second; 6688 } 6689 6690 return Cost; 6691 } 6692 6693 /// Gets Address Access SCEV after verifying that the access pattern 6694 /// is loop invariant except the induction variable dependence. 6695 /// 6696 /// This SCEV can be sent to the Target in order to estimate the address 6697 /// calculation cost. 6698 static const SCEV *getAddressAccessSCEV( 6699 Value *Ptr, 6700 LoopVectorizationLegality *Legal, 6701 PredicatedScalarEvolution &PSE, 6702 const Loop *TheLoop) { 6703 6704 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6705 if (!Gep) 6706 return nullptr; 6707 6708 // We are looking for a gep with all loop invariant indices except for one 6709 // which should be an induction variable. 6710 auto SE = PSE.getSE(); 6711 unsigned NumOperands = Gep->getNumOperands(); 6712 for (unsigned i = 1; i < NumOperands; ++i) { 6713 Value *Opd = Gep->getOperand(i); 6714 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6715 !Legal->isInductionVariable(Opd)) 6716 return nullptr; 6717 } 6718 6719 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6720 return PSE.getSCEV(Ptr); 6721 } 6722 6723 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6724 return Legal->hasStride(I->getOperand(0)) || 6725 Legal->hasStride(I->getOperand(1)); 6726 } 6727 6728 InstructionCost 6729 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6730 ElementCount VF) { 6731 assert(VF.isVector() && 6732 "Scalarization cost of instruction implies vectorization."); 6733 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6734 Type *ValTy = getMemInstValueType(I); 6735 auto SE = PSE.getSE(); 6736 6737 unsigned AS = getLoadStoreAddressSpace(I); 6738 Value *Ptr = getLoadStorePointerOperand(I); 6739 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6740 6741 // Figure out whether the access is strided and get the stride value 6742 // if it's known in compile time 6743 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6744 6745 // Get the cost of the scalar memory instruction and address computation. 6746 InstructionCost Cost = 6747 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6748 6749 // Don't pass *I here, since it is scalar but will actually be part of a 6750 // vectorized loop where the user of it is a vectorized instruction. 6751 const Align Alignment = getLoadStoreAlignment(I); 6752 Cost += VF.getKnownMinValue() * 6753 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6754 AS, TTI::TCK_RecipThroughput); 6755 6756 // Get the overhead of the extractelement and insertelement instructions 6757 // we might create due to scalarization. 6758 Cost += getScalarizationOverhead(I, VF); 6759 6760 // If we have a predicated load/store, it will need extra i1 extracts and 6761 // conditional branches, but may not be executed for each vector lane. Scale 6762 // the cost by the probability of executing the predicated block. 
6763 if (isPredicatedInst(I)) { 6764 Cost /= getReciprocalPredBlockProb(); 6765 6766 // Add the cost of an i1 extract and a branch 6767 auto *Vec_i1Ty = 6768 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6769 Cost += TTI.getScalarizationOverhead( 6770 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6771 /*Insert=*/false, /*Extract=*/true); 6772 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6773 6774 if (useEmulatedMaskMemRefHack(I)) 6775 // Artificially setting to a high enough value to practically disable 6776 // vectorization with such operations. 6777 Cost = 3000000; 6778 } 6779 6780 return Cost; 6781 } 6782 6783 InstructionCost 6784 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6785 ElementCount VF) { 6786 Type *ValTy = getMemInstValueType(I); 6787 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6788 Value *Ptr = getLoadStorePointerOperand(I); 6789 unsigned AS = getLoadStoreAddressSpace(I); 6790 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6791 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6792 6793 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6794 "Stride should be 1 or -1 for consecutive memory access"); 6795 const Align Alignment = getLoadStoreAlignment(I); 6796 InstructionCost Cost = 0; 6797 if (Legal->isMaskRequired(I)) 6798 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6799 CostKind); 6800 else 6801 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6802 CostKind, I); 6803 6804 bool Reverse = ConsecutiveStride < 0; 6805 if (Reverse) 6806 Cost += 6807 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6808 return Cost; 6809 } 6810 6811 InstructionCost 6812 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6813 ElementCount VF) { 6814 assert(Legal->isUniformMemOp(*I)); 6815 6816 Type *ValTy = getMemInstValueType(I); 6817 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6818 const Align Alignment = getLoadStoreAlignment(I); 6819 unsigned AS = getLoadStoreAddressSpace(I); 6820 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6821 if (isa<LoadInst>(I)) { 6822 return TTI.getAddressComputationCost(ValTy) + 6823 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6824 CostKind) + 6825 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6826 } 6827 StoreInst *SI = cast<StoreInst>(I); 6828 6829 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6830 return TTI.getAddressComputationCost(ValTy) + 6831 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6832 CostKind) + 6833 (isLoopInvariantStoreValue 6834 ? 
0 6835 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6836 VF.getKnownMinValue() - 1)); 6837 } 6838 6839 InstructionCost 6840 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6841 ElementCount VF) { 6842 Type *ValTy = getMemInstValueType(I); 6843 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6844 const Align Alignment = getLoadStoreAlignment(I); 6845 const Value *Ptr = getLoadStorePointerOperand(I); 6846 6847 return TTI.getAddressComputationCost(VectorTy) + 6848 TTI.getGatherScatterOpCost( 6849 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6850 TargetTransformInfo::TCK_RecipThroughput, I); 6851 } 6852 6853 InstructionCost 6854 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6855 ElementCount VF) { 6856 // TODO: Once we have support for interleaving with scalable vectors 6857 // we can calculate the cost properly here. 6858 if (VF.isScalable()) 6859 return InstructionCost::getInvalid(); 6860 6861 Type *ValTy = getMemInstValueType(I); 6862 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6863 unsigned AS = getLoadStoreAddressSpace(I); 6864 6865 auto Group = getInterleavedAccessGroup(I); 6866 assert(Group && "Fail to get an interleaved access group."); 6867 6868 unsigned InterleaveFactor = Group->getFactor(); 6869 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6870 6871 // Holds the indices of existing members in an interleaved load group. 6872 // An interleaved store group doesn't need this as it doesn't allow gaps. 6873 SmallVector<unsigned, 4> Indices; 6874 if (isa<LoadInst>(I)) { 6875 for (unsigned i = 0; i < InterleaveFactor; i++) 6876 if (Group->getMember(i)) 6877 Indices.push_back(i); 6878 } 6879 6880 // Calculate the cost of the whole interleaved group. 6881 bool UseMaskForGaps = 6882 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 6883 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6884 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6885 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 6886 6887 if (Group->isReverse()) { 6888 // TODO: Add support for reversed masked interleaved access. 6889 assert(!Legal->isMaskRequired(I) && 6890 "Reverse masked interleaved access not supported."); 6891 Cost += 6892 Group->getNumMembers() * 6893 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6894 } 6895 return Cost; 6896 } 6897 6898 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 6899 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 6900 // Early exit for no inloop reductions 6901 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6902 return InstructionCost::getInvalid(); 6903 auto *VectorTy = cast<VectorType>(Ty); 6904 6905 // We are looking for a pattern of, and finding the minimal acceptable cost: 6906 // reduce(mul(ext(A), ext(B))) or 6907 // reduce(mul(A, B)) or 6908 // reduce(ext(A)) or 6909 // reduce(A). 6910 // The basic idea is that we walk down the tree to do that, finding the root 6911 // reduction instruction in InLoopReductionImmediateChains. From there we find 6912 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6913 // of the components. If the reduction cost is lower then we return it for the 6914 // reduction instruction and 0 for the other instructions in the pattern. If 6915 // it is not we return an invalid cost specifying the orignal cost method 6916 // should be used. 
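  // For exposition, a scalar-loop shape matching reduce(mul(ext(A), ext(B)))
  // might look like (value names invented here):
  //   %a.ext    = sext i16 %a to i32
  //   %b.ext    = sext i16 %b to i32
  //   %mul      = mul i32 %a.ext, %b.ext
  //   %sum.next = add i32 %sum, %mul
  // When costed at a vector VF, targets with a fused extend+multiply+add
  // reduction can often cover the whole pattern more cheaply than the sum of
  // its parts, which is what the calls to getExtendedAddReductionCost below
  // ask the target about.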
6917 Instruction *RetI = I; 6918 if ((RetI->getOpcode() == Instruction::SExt || 6919 RetI->getOpcode() == Instruction::ZExt)) { 6920 if (!RetI->hasOneUser()) 6921 return InstructionCost::getInvalid(); 6922 RetI = RetI->user_back(); 6923 } 6924 if (RetI->getOpcode() == Instruction::Mul && 6925 RetI->user_back()->getOpcode() == Instruction::Add) { 6926 if (!RetI->hasOneUser()) 6927 return InstructionCost::getInvalid(); 6928 RetI = RetI->user_back(); 6929 } 6930 6931 // Test if the found instruction is a reduction, and if not return an invalid 6932 // cost specifying the parent to use the original cost modelling. 6933 if (!InLoopReductionImmediateChains.count(RetI)) 6934 return InstructionCost::getInvalid(); 6935 6936 // Find the reduction this chain is a part of and calculate the basic cost of 6937 // the reduction on its own. 6938 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6939 Instruction *ReductionPhi = LastChain; 6940 while (!isa<PHINode>(ReductionPhi)) 6941 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6942 6943 RecurrenceDescriptor RdxDesc = 6944 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6945 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6946 VectorTy, false, CostKind); 6947 6948 // Get the operand that was not the reduction chain and match it to one of the 6949 // patterns, returning the better cost if it is found. 6950 Instruction *RedOp = RetI->getOperand(1) == LastChain 6951 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6952 : dyn_cast<Instruction>(RetI->getOperand(1)); 6953 6954 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6955 6956 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6957 !TheLoop->isLoopInvariant(RedOp)) { 6958 bool IsUnsigned = isa<ZExtInst>(RedOp); 6959 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6960 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6961 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6962 CostKind); 6963 6964 unsigned ExtCost = 6965 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6966 TTI::CastContextHint::None, CostKind, RedOp); 6967 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6968 return I == RetI ? *RedCost.getValue() : 0; 6969 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 6970 Instruction *Mul = RedOp; 6971 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 6972 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 6973 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 6974 Op0->getOpcode() == Op1->getOpcode() && 6975 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6976 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6977 bool IsUnsigned = isa<ZExtInst>(Op0); 6978 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6979 // reduce(mul(ext, ext)) 6980 unsigned ExtCost = 6981 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 6982 TTI::CastContextHint::None, CostKind, Op0); 6983 InstructionCost MulCost = 6984 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6985 6986 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6987 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6988 CostKind); 6989 6990 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 6991 return I == RetI ? 
*RedCost.getValue() : 0; 6992 } else { 6993 InstructionCost MulCost = 6994 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6995 6996 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6997 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6998 CostKind); 6999 7000 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7001 return I == RetI ? *RedCost.getValue() : 0; 7002 } 7003 } 7004 7005 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7006 } 7007 7008 InstructionCost 7009 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7010 ElementCount VF) { 7011 // Calculate scalar cost only. Vectorization cost should be ready at this 7012 // moment. 7013 if (VF.isScalar()) { 7014 Type *ValTy = getMemInstValueType(I); 7015 const Align Alignment = getLoadStoreAlignment(I); 7016 unsigned AS = getLoadStoreAddressSpace(I); 7017 7018 return TTI.getAddressComputationCost(ValTy) + 7019 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7020 TTI::TCK_RecipThroughput, I); 7021 } 7022 return getWideningCost(I, VF); 7023 } 7024 7025 LoopVectorizationCostModel::VectorizationCostTy 7026 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7027 ElementCount VF) { 7028 // If we know that this instruction will remain uniform, check the cost of 7029 // the scalar version. 7030 if (isUniformAfterVectorization(I, VF)) 7031 VF = ElementCount::getFixed(1); 7032 7033 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7034 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7035 7036 // Forced scalars do not have any scalarization overhead. 7037 auto ForcedScalar = ForcedScalars.find(VF); 7038 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7039 auto InstSet = ForcedScalar->second; 7040 if (InstSet.count(I)) 7041 return VectorizationCostTy( 7042 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7043 VF.getKnownMinValue()), 7044 false); 7045 } 7046 7047 Type *VectorTy; 7048 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7049 7050 bool TypeNotScalarized = 7051 VF.isVector() && VectorTy->isVectorTy() && 7052 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7053 return VectorizationCostTy(C, TypeNotScalarized); 7054 } 7055 7056 InstructionCost 7057 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7058 ElementCount VF) { 7059 7060 if (VF.isScalable()) 7061 return InstructionCost::getInvalid(); 7062 7063 if (VF.isScalar()) 7064 return 0; 7065 7066 InstructionCost Cost = 0; 7067 Type *RetTy = ToVectorTy(I->getType(), VF); 7068 if (!RetTy->isVoidTy() && 7069 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7070 Cost += TTI.getScalarizationOverhead( 7071 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7072 true, false); 7073 7074 // Some targets keep addresses scalar. 7075 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7076 return Cost; 7077 7078 // Some targets support efficient element stores. 7079 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7080 return Cost; 7081 7082 // Collect operands to consider. 7083 CallInst *CI = dyn_cast<CallInst>(I); 7084 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7085 7086 // Skip operands that do not require extraction/scalarization and do not incur 7087 // any overhead. 
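  // (Typically these are operands that remain uniform or scalar after
  // vectorization, e.g. loop-invariant values, which a scalarized lane can
  // read directly without an extractelement.)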
7088 SmallVector<Type *> Tys; 7089 for (auto *V : filterExtractingOperands(Ops, VF)) 7090 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7091 return Cost + TTI.getOperandsScalarizationOverhead( 7092 filterExtractingOperands(Ops, VF), Tys); 7093 } 7094 7095 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7096 if (VF.isScalar()) 7097 return; 7098 NumPredStores = 0; 7099 for (BasicBlock *BB : TheLoop->blocks()) { 7100 // For each instruction in the old loop. 7101 for (Instruction &I : *BB) { 7102 Value *Ptr = getLoadStorePointerOperand(&I); 7103 if (!Ptr) 7104 continue; 7105 7106 // TODO: We should generate better code and update the cost model for 7107 // predicated uniform stores. Today they are treated as any other 7108 // predicated store (see added test cases in 7109 // invariant-store-vectorization.ll). 7110 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7111 NumPredStores++; 7112 7113 if (Legal->isUniformMemOp(I)) { 7114 // TODO: Avoid replicating loads and stores instead of 7115 // relying on instcombine to remove them. 7116 // Load: Scalar load + broadcast 7117 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7118 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7119 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7120 continue; 7121 } 7122 7123 // We assume that widening is the best solution when possible. 7124 if (memoryInstructionCanBeWidened(&I, VF)) { 7125 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7126 int ConsecutiveStride = 7127 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7128 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7129 "Expected consecutive stride."); 7130 InstWidening Decision = 7131 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7132 setWideningDecision(&I, VF, Decision, Cost); 7133 continue; 7134 } 7135 7136 // Choose between Interleaving, Gather/Scatter or Scalarization. 7137 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7138 unsigned NumAccesses = 1; 7139 if (isAccessInterleaved(&I)) { 7140 auto Group = getInterleavedAccessGroup(&I); 7141 assert(Group && "Fail to get an interleaved access group."); 7142 7143 // Make one decision for the whole group. 7144 if (getWideningDecision(&I, VF) != CM_Unknown) 7145 continue; 7146 7147 NumAccesses = Group->getNumMembers(); 7148 if (interleavedAccessCanBeWidened(&I, VF)) 7149 InterleaveCost = getInterleaveGroupCost(&I, VF); 7150 } 7151 7152 InstructionCost GatherScatterCost = 7153 isLegalGatherOrScatter(&I) 7154 ? getGatherScatterCost(&I, VF) * NumAccesses 7155 : InstructionCost::getInvalid(); 7156 7157 InstructionCost ScalarizationCost = 7158 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses 7159 : InstructionCost::getInvalid(); 7160 7161 // Choose better solution for the current VF, 7162 // write down this decision and use it during vectorization. 7163 InstructionCost Cost; 7164 InstWidening Decision; 7165 if (InterleaveCost <= GatherScatterCost && 7166 InterleaveCost < ScalarizationCost) { 7167 Decision = CM_Interleave; 7168 Cost = InterleaveCost; 7169 } else if (GatherScatterCost < ScalarizationCost) { 7170 Decision = CM_GatherScatter; 7171 Cost = GatherScatterCost; 7172 } else { 7173 assert(!VF.isScalable() && 7174 "We cannot yet scalarise for scalable vectors"); 7175 Decision = CM_Scalarize; 7176 Cost = ScalarizationCost; 7177 } 7178 // If the instructions belongs to an interleave group, the whole group 7179 // receives the same decision. 
The whole group receives the cost, but 7180 // the cost will actually be assigned to one instruction. 7181 if (auto Group = getInterleavedAccessGroup(&I)) 7182 setWideningDecision(Group, VF, Decision, Cost); 7183 else 7184 setWideningDecision(&I, VF, Decision, Cost); 7185 } 7186 } 7187 7188 // Make sure that any load of address and any other address computation 7189 // remains scalar unless there is gather/scatter support. This avoids 7190 // inevitable extracts into address registers, and also has the benefit of 7191 // activating LSR more, since that pass can't optimize vectorized 7192 // addresses. 7193 if (TTI.prefersVectorizedAddressing()) 7194 return; 7195 7196 // Start with all scalar pointer uses. 7197 SmallPtrSet<Instruction *, 8> AddrDefs; 7198 for (BasicBlock *BB : TheLoop->blocks()) 7199 for (Instruction &I : *BB) { 7200 Instruction *PtrDef = 7201 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7202 if (PtrDef && TheLoop->contains(PtrDef) && 7203 getWideningDecision(&I, VF) != CM_GatherScatter) 7204 AddrDefs.insert(PtrDef); 7205 } 7206 7207 // Add all instructions used to generate the addresses. 7208 SmallVector<Instruction *, 4> Worklist; 7209 append_range(Worklist, AddrDefs); 7210 while (!Worklist.empty()) { 7211 Instruction *I = Worklist.pop_back_val(); 7212 for (auto &Op : I->operands()) 7213 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7214 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7215 AddrDefs.insert(InstOp).second) 7216 Worklist.push_back(InstOp); 7217 } 7218 7219 for (auto *I : AddrDefs) { 7220 if (isa<LoadInst>(I)) { 7221 // Setting the desired widening decision should ideally be handled in 7222 // by cost functions, but since this involves the task of finding out 7223 // if the loaded register is involved in an address computation, it is 7224 // instead changed here when we know this is the case. 7225 InstWidening Decision = getWideningDecision(I, VF); 7226 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7227 // Scalarize a widened load of address. 7228 setWideningDecision( 7229 I, VF, CM_Scalarize, 7230 (VF.getKnownMinValue() * 7231 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7232 else if (auto Group = getInterleavedAccessGroup(I)) { 7233 // Scalarize an interleave group of address loads. 7234 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7235 if (Instruction *Member = Group->getMember(I)) 7236 setWideningDecision( 7237 Member, VF, CM_Scalarize, 7238 (VF.getKnownMinValue() * 7239 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7240 } 7241 } 7242 } else 7243 // Make sure I gets scalarized and a cost estimate without 7244 // scalarization overhead. 7245 ForcedScalars[VF].insert(I); 7246 } 7247 } 7248 7249 InstructionCost 7250 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7251 Type *&VectorTy) { 7252 Type *RetTy = I->getType(); 7253 if (canTruncateToMinimalBitwidth(I, VF)) 7254 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7255 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7256 auto SE = PSE.getSE(); 7257 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7258 7259 // TODO: We need to estimate the cost of intrinsic calls. 7260 switch (I->getOpcode()) { 7261 case Instruction::GetElementPtr: 7262 // We mark this instruction as zero-cost because the cost of GEPs in 7263 // vectorized code depends on whether the corresponding memory instruction 7264 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7265 // instruction cost. 7266 return 0; 7267 case Instruction::Br: { 7268 // In cases of scalarized and predicated instructions, there will be VF 7269 // predicated blocks in the vectorized loop. Each branch around these 7270 // blocks requires also an extract of its vector compare i1 element. 7271 bool ScalarPredicatedBB = false; 7272 BranchInst *BI = cast<BranchInst>(I); 7273 if (VF.isVector() && BI->isConditional() && 7274 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7275 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7276 ScalarPredicatedBB = true; 7277 7278 if (ScalarPredicatedBB) { 7279 // Return cost for branches around scalarized and predicated blocks. 7280 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7281 auto *Vec_i1Ty = 7282 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7283 return (TTI.getScalarizationOverhead( 7284 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7285 false, true) + 7286 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7287 VF.getKnownMinValue())); 7288 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7289 // The back-edge branch will remain, as will all scalar branches. 7290 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7291 else 7292 // This branch will be eliminated by if-conversion. 7293 return 0; 7294 // Note: We currently assume zero cost for an unconditional branch inside 7295 // a predicated block since it will become a fall-through, although we 7296 // may decide in the future to call TTI for all branches. 7297 } 7298 case Instruction::PHI: { 7299 auto *Phi = cast<PHINode>(I); 7300 7301 // First-order recurrences are replaced by vector shuffles inside the loop. 7302 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7303 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7304 return TTI.getShuffleCost( 7305 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7306 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7307 7308 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7309 // converted into select instructions. We require N - 1 selects per phi 7310 // node, where N is the number of incoming values. 7311 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7312 return (Phi->getNumIncomingValues() - 1) * 7313 TTI.getCmpSelInstrCost( 7314 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7315 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7316 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7317 7318 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7319 } 7320 case Instruction::UDiv: 7321 case Instruction::SDiv: 7322 case Instruction::URem: 7323 case Instruction::SRem: 7324 // If we have a predicated instruction, it may not be executed for each 7325 // vector lane. Get the scalarization cost and scale this amount by the 7326 // probability of executing the predicated block. If the instruction is not 7327 // predicated, we fall through to the next case. 7328 if (VF.isVector() && isScalarWithPredication(I)) { 7329 InstructionCost Cost = 0; 7330 7331 // These instructions have a non-void type, so account for the phi nodes 7332 // that we will create. This cost is likely to be zero. The phi node 7333 // cost, if any, should be scaled by the block probability because it 7334 // models a copy at the end of each predicated block. 
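    // Sketching the arithmetic for VF = 4 under the default assumption that
    // the predicated block runs for about half of the lanes, a predicated
    // sdiv ends up costed roughly as
    //   (4 * phi + 4 * sdiv + insert/extract overhead) / 2.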
7335 Cost += VF.getKnownMinValue() * 7336 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7337 7338 // The cost of the non-predicated instruction. 7339 Cost += VF.getKnownMinValue() * 7340 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7341 7342 // The cost of insertelement and extractelement instructions needed for 7343 // scalarization. 7344 Cost += getScalarizationOverhead(I, VF); 7345 7346 // Scale the cost by the probability of executing the predicated blocks. 7347 // This assumes the predicated block for each vector lane is equally 7348 // likely. 7349 return Cost / getReciprocalPredBlockProb(); 7350 } 7351 LLVM_FALLTHROUGH; 7352 case Instruction::Add: 7353 case Instruction::FAdd: 7354 case Instruction::Sub: 7355 case Instruction::FSub: 7356 case Instruction::Mul: 7357 case Instruction::FMul: 7358 case Instruction::FDiv: 7359 case Instruction::FRem: 7360 case Instruction::Shl: 7361 case Instruction::LShr: 7362 case Instruction::AShr: 7363 case Instruction::And: 7364 case Instruction::Or: 7365 case Instruction::Xor: { 7366 // Since we will replace the stride by 1 the multiplication should go away. 7367 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7368 return 0; 7369 7370 // Detect reduction patterns 7371 InstructionCost RedCost; 7372 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7373 .isValid()) 7374 return RedCost; 7375 7376 // Certain instructions can be cheaper to vectorize if they have a constant 7377 // second vector operand. One example of this are shifts on x86. 7378 Value *Op2 = I->getOperand(1); 7379 TargetTransformInfo::OperandValueProperties Op2VP; 7380 TargetTransformInfo::OperandValueKind Op2VK = 7381 TTI.getOperandInfo(Op2, Op2VP); 7382 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7383 Op2VK = TargetTransformInfo::OK_UniformValue; 7384 7385 SmallVector<const Value *, 4> Operands(I->operand_values()); 7386 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7387 return N * TTI.getArithmeticInstrCost( 7388 I->getOpcode(), VectorTy, CostKind, 7389 TargetTransformInfo::OK_AnyValue, 7390 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7391 } 7392 case Instruction::FNeg: { 7393 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7394 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7395 return N * TTI.getArithmeticInstrCost( 7396 I->getOpcode(), VectorTy, CostKind, 7397 TargetTransformInfo::OK_AnyValue, 7398 TargetTransformInfo::OK_AnyValue, 7399 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7400 I->getOperand(0), I); 7401 } 7402 case Instruction::Select: { 7403 SelectInst *SI = cast<SelectInst>(I); 7404 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7405 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7406 Type *CondTy = SI->getCondition()->getType(); 7407 if (!ScalarCond) 7408 CondTy = VectorType::get(CondTy, VF); 7409 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7410 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7411 } 7412 case Instruction::ICmp: 7413 case Instruction::FCmp: { 7414 Type *ValTy = I->getOperand(0)->getType(); 7415 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7416 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7417 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7418 VectorTy = ToVectorTy(ValTy, VF); 7419 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7420 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7421 } 7422 case Instruction::Store: 7423 case Instruction::Load: { 7424 ElementCount Width = VF; 7425 if (Width.isVector()) { 7426 InstWidening Decision = getWideningDecision(I, Width); 7427 assert(Decision != CM_Unknown && 7428 "CM decision should be taken at this point"); 7429 if (Decision == CM_Scalarize) 7430 Width = ElementCount::getFixed(1); 7431 } 7432 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7433 return getMemoryInstructionCost(I, VF); 7434 } 7435 case Instruction::ZExt: 7436 case Instruction::SExt: 7437 case Instruction::FPToUI: 7438 case Instruction::FPToSI: 7439 case Instruction::FPExt: 7440 case Instruction::PtrToInt: 7441 case Instruction::IntToPtr: 7442 case Instruction::SIToFP: 7443 case Instruction::UIToFP: 7444 case Instruction::Trunc: 7445 case Instruction::FPTrunc: 7446 case Instruction::BitCast: { 7447 // Computes the CastContextHint from a Load/Store instruction. 7448 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7449 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7450 "Expected a load or a store!"); 7451 7452 if (VF.isScalar() || !TheLoop->contains(I)) 7453 return TTI::CastContextHint::Normal; 7454 7455 switch (getWideningDecision(I, VF)) { 7456 case LoopVectorizationCostModel::CM_GatherScatter: 7457 return TTI::CastContextHint::GatherScatter; 7458 case LoopVectorizationCostModel::CM_Interleave: 7459 return TTI::CastContextHint::Interleave; 7460 case LoopVectorizationCostModel::CM_Scalarize: 7461 case LoopVectorizationCostModel::CM_Widen: 7462 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7463 : TTI::CastContextHint::Normal; 7464 case LoopVectorizationCostModel::CM_Widen_Reverse: 7465 return TTI::CastContextHint::Reversed; 7466 case LoopVectorizationCostModel::CM_Unknown: 7467 llvm_unreachable("Instr did not go through cost modelling?"); 7468 } 7469 7470 llvm_unreachable("Unhandled case!"); 7471 }; 7472 7473 unsigned Opcode = I->getOpcode(); 7474 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7475 // For Trunc, the context is the only user, which must be a StoreInst. 
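    // e.g. a truncating store (invented names):
    //   %t = trunc i32 %v to i16
    //   store i16 %t, i16* %p
    // where the store is the trunc's only user and supplies the cast context.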
7476 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7477 if (I->hasOneUse()) 7478 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7479 CCH = ComputeCCH(Store); 7480 } 7481 // For Z/Sext, the context is the operand, which must be a LoadInst. 7482 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7483 Opcode == Instruction::FPExt) { 7484 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7485 CCH = ComputeCCH(Load); 7486 } 7487 7488 // We optimize the truncation of induction variables having constant 7489 // integer steps. The cost of these truncations is the same as the scalar 7490 // operation. 7491 if (isOptimizableIVTruncate(I, VF)) { 7492 auto *Trunc = cast<TruncInst>(I); 7493 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7494 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7495 } 7496 7497 // Detect reduction patterns 7498 InstructionCost RedCost; 7499 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7500 .isValid()) 7501 return RedCost; 7502 7503 Type *SrcScalarTy = I->getOperand(0)->getType(); 7504 Type *SrcVecTy = 7505 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7506 if (canTruncateToMinimalBitwidth(I, VF)) { 7507 // This cast is going to be shrunk. This may remove the cast or it might 7508 // turn it into slightly different cast. For example, if MinBW == 16, 7509 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7510 // 7511 // Calculate the modified src and dest types. 7512 Type *MinVecTy = VectorTy; 7513 if (Opcode == Instruction::Trunc) { 7514 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7515 VectorTy = 7516 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7517 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7518 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7519 VectorTy = 7520 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7521 } 7522 } 7523 7524 unsigned N; 7525 if (isScalarAfterVectorization(I, VF)) { 7526 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7527 N = VF.getKnownMinValue(); 7528 } else 7529 N = 1; 7530 return N * 7531 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7532 } 7533 case Instruction::Call: { 7534 bool NeedToScalarize; 7535 CallInst *CI = cast<CallInst>(I); 7536 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7537 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7538 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7539 return std::min(CallCost, IntrinsicCost); 7540 } 7541 return CallCost; 7542 } 7543 case Instruction::ExtractValue: 7544 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7545 default: 7546 // The cost of executing VF copies of the scalar instruction. This opcode 7547 // is unknown. Assume that it is the same as 'mul'. 7548 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7549 Instruction::Mul, VectorTy, CostKind) + 7550 getScalarizationOverhead(I, VF); 7551 } // end of switch. 
7552 } 7553 7554 char LoopVectorize::ID = 0; 7555 7556 static const char lv_name[] = "Loop Vectorization"; 7557 7558 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7559 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7560 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7561 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7562 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7563 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7564 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7565 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7566 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7567 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7568 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7569 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7570 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7571 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7572 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7573 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7574 7575 namespace llvm { 7576 7577 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7578 7579 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7580 bool VectorizeOnlyWhenForced) { 7581 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7582 } 7583 7584 } // end namespace llvm 7585 7586 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7587 // Check if the pointer operand of a load or store instruction is 7588 // consecutive. 7589 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7590 return Legal->isConsecutivePtr(Ptr); 7591 return false; 7592 } 7593 7594 void LoopVectorizationCostModel::collectValuesToIgnore() { 7595 // Ignore ephemeral values. 7596 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7597 7598 // Ignore type-promoting instructions we identified during reduction 7599 // detection. 7600 for (auto &Reduction : Legal->getReductionVars()) { 7601 RecurrenceDescriptor &RedDes = Reduction.second; 7602 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7603 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7604 } 7605 // Ignore type-casting instructions we identified during induction 7606 // detection. 7607 for (auto &Induction : Legal->getInductionVars()) { 7608 InductionDescriptor &IndDes = Induction.second; 7609 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7610 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7611 } 7612 } 7613 7614 void LoopVectorizationCostModel::collectInLoopReductions() { 7615 for (auto &Reduction : Legal->getReductionVars()) { 7616 PHINode *Phi = Reduction.first; 7617 RecurrenceDescriptor &RdxDesc = Reduction.second; 7618 7619 // We don't collect reductions that are type promoted (yet). 7620 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7621 continue; 7622 7623 // If the target would prefer this reduction to happen "in-loop", then we 7624 // want to record it as such. 7625 unsigned Opcode = RdxDesc.getOpcode(); 7626 if (!PreferInLoopReductions && 7627 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7628 TargetTransformInfo::ReductionFlags())) 7629 continue; 7630 7631 // Check that we can correctly put the reductions into the loop, by 7632 // finding the chain of operations that leads from the phi to the loop 7633 // exit value. 
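    // For a plain integer add reduction that chain is usually just the single
    // update between the phi and its back-edge value, e.g.:
    //   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
    //   %sum.next = add i32 %sum, %val
    // (block and value names invented for illustration).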
7634 SmallVector<Instruction *, 4> ReductionOperations = 7635 RdxDesc.getReductionOpChain(Phi, TheLoop); 7636 bool InLoop = !ReductionOperations.empty(); 7637 if (InLoop) { 7638 InLoopReductionChains[Phi] = ReductionOperations; 7639 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7640 Instruction *LastChain = Phi; 7641 for (auto *I : ReductionOperations) { 7642 InLoopReductionImmediateChains[I] = LastChain; 7643 LastChain = I; 7644 } 7645 } 7646 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7647 << " reduction for phi: " << *Phi << "\n"); 7648 } 7649 } 7650 7651 // TODO: we could return a pair of values that specify the max VF and 7652 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7653 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7654 // doesn't have a cost model that can choose which plan to execute if 7655 // more than one is generated. 7656 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7657 LoopVectorizationCostModel &CM) { 7658 unsigned WidestType; 7659 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7660 return WidestVectorRegBits / WidestType; 7661 } 7662 7663 VectorizationFactor 7664 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7665 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7666 ElementCount VF = UserVF; 7667 // Outer loop handling: They may require CFG and instruction level 7668 // transformations before even evaluating whether vectorization is profitable. 7669 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7670 // the vectorization pipeline. 7671 if (!OrigLoop->isInnermost()) { 7672 // If the user doesn't provide a vectorization factor, determine a 7673 // reasonable one. 7674 if (UserVF.isZero()) { 7675 VF = ElementCount::getFixed( 7676 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 7677 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7678 7679 // Make sure we have a VF > 1 for stress testing. 7680 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7681 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7682 << "overriding computed VF.\n"); 7683 VF = ElementCount::getFixed(4); 7684 } 7685 } 7686 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7687 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7688 "VF needs to be a power of two"); 7689 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7690 << "VF " << VF << " to build VPlans.\n"); 7691 buildVPlans(VF, VF); 7692 7693 // For VPlan build stress testing, we bail out after VPlan construction. 7694 if (VPlanBuildStressTest) 7695 return VectorizationFactor::Disabled(); 7696 7697 return {VF, 0 /*Cost*/}; 7698 } 7699 7700 LLVM_DEBUG( 7701 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7702 "VPlan-native path.\n"); 7703 return VectorizationFactor::Disabled(); 7704 } 7705 7706 Optional<VectorizationFactor> 7707 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7708 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7709 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7710 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7711 return None; 7712 7713 // Invalidate interleave groups if all blocks of loop will be predicated. 
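  // (Folding the tail by masking predicates every block, including the
  // header, so interleave groups can only be kept if the target supports
  // masked interleaved accesses.)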
7714 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7715 !useMaskedInterleavedAccesses(*TTI)) { 7716 LLVM_DEBUG( 7717 dbgs() 7718 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7719 "which requires masked-interleaved support.\n"); 7720 if (CM.InterleaveInfo.invalidateGroups()) 7721 // Invalidating interleave groups also requires invalidating all decisions 7722 // based on them, which includes widening decisions and uniform and scalar 7723 // values. 7724 CM.invalidateCostModelingDecisions(); 7725 } 7726 7727 ElementCount MaxVF = MaybeMaxVF.getValue(); 7728 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7729 7730 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7731 if (!UserVF.isZero() && 7732 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7733 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7734 // VFs here, this should be reverted to only use legal UserVFs once the 7735 // loop below supports scalable VFs. 7736 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7737 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7738 << " VF " << VF << ".\n"); 7739 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7740 "VF needs to be a power of two"); 7741 // Collect the instructions (and their associated costs) that will be more 7742 // profitable to scalarize. 7743 CM.selectUserVectorizationFactor(VF); 7744 CM.collectInLoopReductions(); 7745 buildVPlansWithVPRecipes(VF, VF); 7746 LLVM_DEBUG(printPlans(dbgs())); 7747 return {{VF, 0}}; 7748 } 7749 7750 assert(!MaxVF.isScalable() && 7751 "Scalable vectors not yet supported beyond this point"); 7752 7753 for (ElementCount VF = ElementCount::getFixed(1); 7754 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7755 // Collect Uniform and Scalar instructions after vectorization with VF. 7756 CM.collectUniformsAndScalars(VF); 7757 7758 // Collect the instructions (and their associated costs) that will be more 7759 // profitable to scalarize. 7760 if (VF.isVector()) 7761 CM.collectInstsToScalarize(VF); 7762 } 7763 7764 CM.collectInLoopReductions(); 7765 7766 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7767 LLVM_DEBUG(printPlans(dbgs())); 7768 if (MaxVF.isScalar()) 7769 return VectorizationFactor::Disabled(); 7770 7771 // Select the optimal vectorization factor. 7772 return CM.selectVectorizationFactor(MaxVF); 7773 } 7774 7775 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7776 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7777 << '\n'); 7778 BestVF = VF; 7779 BestUF = UF; 7780 7781 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7782 return !Plan->hasVF(VF); 7783 }); 7784 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7785 } 7786 7787 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7788 DominatorTree *DT) { 7789 // Perform the actual loop transformation. 7790 7791 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
7792 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7793 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7794 7795 VPTransformState State{ 7796 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 7797 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7798 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7799 State.CanonicalIV = ILV.Induction; 7800 7801 ILV.printDebugTracesAtStart(); 7802 7803 //===------------------------------------------------===// 7804 // 7805 // Notice: any optimization or new instruction that go 7806 // into the code below should also be implemented in 7807 // the cost-model. 7808 // 7809 //===------------------------------------------------===// 7810 7811 // 2. Copy and widen instructions from the old loop into the new loop. 7812 VPlans.front()->execute(&State); 7813 7814 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7815 // predication, updating analyses. 7816 ILV.fixVectorizedLoop(State); 7817 7818 ILV.printDebugTracesAtEnd(); 7819 } 7820 7821 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7822 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7823 for (const auto &Plan : VPlans) 7824 if (PrintVPlansInDotFormat) 7825 Plan->printDOT(O); 7826 else 7827 Plan->print(O); 7828 } 7829 #endif 7830 7831 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7832 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7833 7834 // We create new control-flow for the vectorized loop, so the original exit 7835 // conditions will be dead after vectorization if it's only used by the 7836 // terminator 7837 SmallVector<BasicBlock*> ExitingBlocks; 7838 OrigLoop->getExitingBlocks(ExitingBlocks); 7839 for (auto *BB : ExitingBlocks) { 7840 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7841 if (!Cmp || !Cmp->hasOneUse()) 7842 continue; 7843 7844 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7845 if (!DeadInstructions.insert(Cmp).second) 7846 continue; 7847 7848 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7849 // TODO: can recurse through operands in general 7850 for (Value *Op : Cmp->operands()) { 7851 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7852 DeadInstructions.insert(cast<Instruction>(Op)); 7853 } 7854 } 7855 7856 // We create new "steps" for induction variable updates to which the original 7857 // induction variables map. An original update instruction will be dead if 7858 // all its users except the induction variable are dead. 7859 auto *Latch = OrigLoop->getLoopLatch(); 7860 for (auto &Induction : Legal->getInductionVars()) { 7861 PHINode *Ind = Induction.first; 7862 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7863 7864 // If the tail is to be folded by masking, the primary induction variable, 7865 // if exists, isn't dead: it will be used for masking. Don't kill it. 7866 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7867 continue; 7868 7869 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7870 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7871 })) 7872 DeadInstructions.insert(IndUpdate); 7873 7874 // We record as "Dead" also the type-casting instructions we had identified 7875 // during induction analysis. 
We don't need any handling for them in the 7876 // vectorized loop because we have proven that, under a proper runtime 7877 // test guarding the vectorized loop, the value of the phi, and the casted 7878 // value of the phi, are the same. The last instruction in this casting chain 7879 // will get its scalar/vector/widened def from the scalar/vector/widened def 7880 // of the respective phi node. Any other casts in the induction def-use chain 7881 // have no other uses outside the phi update chain, and will be ignored. 7882 InductionDescriptor &IndDes = Induction.second; 7883 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7884 DeadInstructions.insert(Casts.begin(), Casts.end()); 7885 } 7886 } 7887 7888 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7889 7890 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7891 7892 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7893 Instruction::BinaryOps BinOp) { 7894 // When unrolling and the VF is 1, we only need to add a simple scalar. 7895 Type *Ty = Val->getType(); 7896 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7897 7898 if (Ty->isFloatingPointTy()) { 7899 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7900 7901 // Floating-point operations inherit FMF via the builder's flags. 7902 Value *MulOp = Builder.CreateFMul(C, Step); 7903 return Builder.CreateBinOp(BinOp, Val, MulOp); 7904 } 7905 Constant *C = ConstantInt::get(Ty, StartIdx); 7906 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7907 } 7908 7909 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7910 SmallVector<Metadata *, 4> MDs; 7911 // Reserve first location for self reference to the LoopID metadata node. 7912 MDs.push_back(nullptr); 7913 bool IsUnrollMetadata = false; 7914 MDNode *LoopID = L->getLoopID(); 7915 if (LoopID) { 7916 // First find existing loop unrolling disable metadata. 7917 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7918 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7919 if (MD) { 7920 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7921 IsUnrollMetadata = 7922 S && S->getString().startswith("llvm.loop.unroll.disable"); 7923 } 7924 MDs.push_back(LoopID->getOperand(i)); 7925 } 7926 } 7927 7928 if (!IsUnrollMetadata) { 7929 // Add runtime unroll disable metadata. 7930 LLVMContext &Context = L->getHeader()->getContext(); 7931 SmallVector<Metadata *, 1> DisableOperands; 7932 DisableOperands.push_back( 7933 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7934 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7935 MDs.push_back(DisableNode); 7936 MDNode *NewLoopID = MDNode::get(Context, MDs); 7937 // Set operand 0 to refer to the loop id itself. 7938 NewLoopID->replaceOperandWith(0, NewLoopID); 7939 L->setLoopID(NewLoopID); 7940 } 7941 } 7942 7943 //===--------------------------------------------------------------------===// 7944 // EpilogueVectorizerMainLoop 7945 //===--------------------------------------------------------------------===// 7946 7947 /// This function is partially responsible for generating the control flow 7948 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
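/// In rough outline (the linked documentation has the full picture): the
/// skeleton built here emits an "iter.check" against the epilogue VF * UF
/// first and a "vector.main.loop.iter.check" against the main VF * UF only
/// afterwards, so trip counts too small for any vector loop bail out early
/// and trip counts that only suffice for the epilogue reach it via a short
/// path.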
7949 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7950 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7951 Loop *Lp = createVectorLoopSkeleton(""); 7952 7953 // Generate the code to check the minimum iteration count of the vector 7954 // epilogue (see below). 7955 EPI.EpilogueIterationCountCheck = 7956 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7957 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7958 7959 // Generate the code to check any assumptions that we've made for SCEV 7960 // expressions. 7961 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 7962 7963 // Generate the code that checks at runtime if arrays overlap. We put the 7964 // checks into a separate block to make the more common case of few elements 7965 // faster. 7966 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 7967 7968 // Generate the iteration count check for the main loop, *after* the check 7969 // for the epilogue loop, so that the path-length is shorter for the case 7970 // that goes directly through the vector epilogue. The longer-path length for 7971 // the main loop is compensated for, by the gain from vectorizing the larger 7972 // trip count. Note: the branch will get updated later on when we vectorize 7973 // the epilogue. 7974 EPI.MainLoopIterationCountCheck = 7975 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 7976 7977 // Generate the induction variable. 7978 OldInduction = Legal->getPrimaryInduction(); 7979 Type *IdxTy = Legal->getWidestInductionType(); 7980 Value *StartIdx = ConstantInt::get(IdxTy, 0); 7981 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 7982 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 7983 EPI.VectorTripCount = CountRoundDown; 7984 Induction = 7985 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 7986 getDebugLocFromInstOrOperands(OldInduction)); 7987 7988 // Skip induction resume value creation here because they will be created in 7989 // the second pass. If we created them here, they wouldn't be used anyway, 7990 // because the vplan in the second pass still contains the inductions from the 7991 // original loop. 7992 7993 return completeLoopSkeleton(Lp, OrigLoopID); 7994 } 7995 7996 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7997 LLVM_DEBUG({ 7998 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7999 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8000 << ", Main Loop UF:" << EPI.MainLoopUF 8001 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8002 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8003 }); 8004 } 8005 8006 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8007 DEBUG_WITH_TYPE(VerboseDebug, { 8008 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8009 }); 8010 } 8011 8012 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8013 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8014 assert(L && "Expected valid Loop."); 8015 assert(Bypass && "Expected valid bypass basic block."); 8016 unsigned VFactor = 8017 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8018 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8019 Value *Count = getOrCreateTripCount(L); 8020 // Reuse existing vector loop preheader for TC checks. 8021 // Note that new preheader block is generated for vector loop. 
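  // (For example, when the applicable VF * UF product is 16, the min-iters
  // check built below branches to Bypass for any trip count below 16, or
  // below-or-equal to 16 when a scalar epilogue is required.)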
8022 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8023 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8024 8025 // Generate code to check if the loop's trip count is less than VF * UF of the 8026 // main vector loop. 8027 auto P = 8028 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8029 8030 Value *CheckMinIters = Builder.CreateICmp( 8031 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8032 "min.iters.check"); 8033 8034 if (!ForEpilogue) 8035 TCCheckBlock->setName("vector.main.loop.iter.check"); 8036 8037 // Create new preheader for vector loop. 8038 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8039 DT, LI, nullptr, "vector.ph"); 8040 8041 if (ForEpilogue) { 8042 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8043 DT->getNode(Bypass)->getIDom()) && 8044 "TC check is expected to dominate Bypass"); 8045 8046 // Update dominator for Bypass & LoopExit. 8047 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8048 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8049 8050 LoopBypassBlocks.push_back(TCCheckBlock); 8051 8052 // Save the trip count so we don't have to regenerate it in the 8053 // vec.epilog.iter.check. This is safe to do because the trip count 8054 // generated here dominates the vector epilog iter check. 8055 EPI.TripCount = Count; 8056 } 8057 8058 ReplaceInstWithInst( 8059 TCCheckBlock->getTerminator(), 8060 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8061 8062 return TCCheckBlock; 8063 } 8064 8065 //===--------------------------------------------------------------------===// 8066 // EpilogueVectorizerEpilogueLoop 8067 //===--------------------------------------------------------------------===// 8068 8069 /// This function is partially responsible for generating the control flow 8070 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8071 BasicBlock * 8072 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8073 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8074 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8075 8076 // Now, compare the remaining count and if there aren't enough iterations to 8077 // execute the vectorized epilogue skip to the scalar part. 8078 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8079 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8080 LoopVectorPreHeader = 8081 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8082 LI, nullptr, "vec.epilog.ph"); 8083 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8084 VecEpilogueIterationCountCheck); 8085 8086 // Adjust the control flow taking the state info from the main loop 8087 // vectorization into account. 
8088 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8089 "expected this to be saved from the previous pass."); 8090 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8091 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8092 8093 DT->changeImmediateDominator(LoopVectorPreHeader, 8094 EPI.MainLoopIterationCountCheck); 8095 8096 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8097 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8098 8099 if (EPI.SCEVSafetyCheck) 8100 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8101 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8102 if (EPI.MemSafetyCheck) 8103 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8104 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8105 8106 DT->changeImmediateDominator( 8107 VecEpilogueIterationCountCheck, 8108 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8109 8110 DT->changeImmediateDominator(LoopScalarPreHeader, 8111 EPI.EpilogueIterationCountCheck); 8112 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8113 8114 // Keep track of bypass blocks, as they feed start values to the induction 8115 // phis in the scalar loop preheader. 8116 if (EPI.SCEVSafetyCheck) 8117 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8118 if (EPI.MemSafetyCheck) 8119 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8120 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8121 8122 // Generate a resume induction for the vector epilogue and put it in the 8123 // vector epilogue preheader 8124 Type *IdxTy = Legal->getWidestInductionType(); 8125 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8126 LoopVectorPreHeader->getFirstNonPHI()); 8127 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8128 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8129 EPI.MainLoopIterationCountCheck); 8130 8131 // Generate the induction variable. 8132 OldInduction = Legal->getPrimaryInduction(); 8133 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8134 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8135 Value *StartIdx = EPResumeVal; 8136 Induction = 8137 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8138 getDebugLocFromInstOrOperands(OldInduction)); 8139 8140 // Generate induction resume values. These variables save the new starting 8141 // indexes for the scalar loop. They are used to test if there are any tail 8142 // iterations left once the vector loop has completed. 8143 // Note that when the vectorized epilogue is skipped due to iteration count 8144 // check, then the resume value for the induction variable comes from 8145 // the trip count of the main vector loop, hence passing the AdditionalBypass 8146 // argument. 
8147 createInductionResumeValues(Lp, CountRoundDown, 8148 {VecEpilogueIterationCountCheck, 8149 EPI.VectorTripCount} /* AdditionalBypass */); 8150 8151 AddRuntimeUnrollDisableMetaData(Lp); 8152 return completeLoopSkeleton(Lp, OrigLoopID); 8153 } 8154 8155 BasicBlock * 8156 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8157 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8158 8159 assert(EPI.TripCount && 8160 "Expected trip count to have been safed in the first pass."); 8161 assert( 8162 (!isa<Instruction>(EPI.TripCount) || 8163 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8164 "saved trip count does not dominate insertion point."); 8165 Value *TC = EPI.TripCount; 8166 IRBuilder<> Builder(Insert->getTerminator()); 8167 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8168 8169 // Generate code to check if the loop's trip count is less than VF * UF of the 8170 // vector epilogue loop. 8171 auto P = 8172 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8173 8174 Value *CheckMinIters = Builder.CreateICmp( 8175 P, Count, 8176 ConstantInt::get(Count->getType(), 8177 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8178 "min.epilog.iters.check"); 8179 8180 ReplaceInstWithInst( 8181 Insert->getTerminator(), 8182 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8183 8184 LoopBypassBlocks.push_back(Insert); 8185 return Insert; 8186 } 8187 8188 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8189 LLVM_DEBUG({ 8190 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8191 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8192 << ", Main Loop UF:" << EPI.MainLoopUF 8193 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8194 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8195 }); 8196 } 8197 8198 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8199 DEBUG_WITH_TYPE(VerboseDebug, { 8200 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8201 }); 8202 } 8203 8204 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8205 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8206 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8207 bool PredicateAtRangeStart = Predicate(Range.Start); 8208 8209 for (ElementCount TmpVF = Range.Start * 2; 8210 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8211 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8212 Range.End = TmpVF; 8213 break; 8214 } 8215 8216 return PredicateAtRangeStart; 8217 } 8218 8219 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8220 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8221 /// of VF's starting at a given VF and extending it as much as possible. Each 8222 /// vectorization decision can potentially shorten this sub-range during 8223 /// buildVPlan(). 8224 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8225 ElementCount MaxVF) { 8226 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8227 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8228 VFRange SubRange = {VF, MaxVFPlusOne}; 8229 VPlans.push_back(buildVPlan(SubRange)); 8230 VF = SubRange.End; 8231 } 8232 } 8233 8234 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8235 VPlanPtr &Plan) { 8236 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8237 8238 // Look for cached value. 
8239 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8240 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8241 if (ECEntryIt != EdgeMaskCache.end()) 8242 return ECEntryIt->second; 8243 8244 VPValue *SrcMask = createBlockInMask(Src, Plan); 8245 8246 // The terminator has to be a branch inst! 8247 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8248 assert(BI && "Unexpected terminator found"); 8249 8250 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8251 return EdgeMaskCache[Edge] = SrcMask; 8252 8253 // If source is an exiting block, we know the exit edge is dynamically dead 8254 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8255 // adding uses of an otherwise potentially dead instruction. 8256 if (OrigLoop->isLoopExiting(Src)) 8257 return EdgeMaskCache[Edge] = SrcMask; 8258 8259 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8260 assert(EdgeMask && "No Edge Mask found for condition"); 8261 8262 if (BI->getSuccessor(0) != Dst) 8263 EdgeMask = Builder.createNot(EdgeMask); 8264 8265 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8266 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8267 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8268 // The select version does not introduce new UB if SrcMask is false and 8269 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8270 VPValue *False = Plan->getOrAddVPValue( 8271 ConstantInt::getFalse(BI->getCondition()->getType())); 8272 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8273 } 8274 8275 return EdgeMaskCache[Edge] = EdgeMask; 8276 } 8277 8278 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8279 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8280 8281 // Look for cached value. 8282 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8283 if (BCEntryIt != BlockMaskCache.end()) 8284 return BCEntryIt->second; 8285 8286 // All-one mask is modelled as no-mask following the convention for masked 8287 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8288 VPValue *BlockMask = nullptr; 8289 8290 if (OrigLoop->getHeader() == BB) { 8291 if (!CM.blockNeedsPredication(BB)) 8292 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8293 8294 // Create the block in mask as the first non-phi instruction in the block. 8295 VPBuilder::InsertPointGuard Guard(Builder); 8296 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8297 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8298 8299 // Introduce the early-exit compare IV <= BTC to form header block mask. 8300 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8301 // Start by constructing the desired canonical IV. 
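    // (For illustration: with VF = 4, part 0 of the resulting mask is
    //   <i, i+1, i+2, i+3> ule <BTC, BTC, BTC, BTC>,
    // where i is the canonical IV and BTC the backedge-taken count; lanes
    // past BTC belong to the folded tail and are masked off.)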
8302 VPValue *IV = nullptr; 8303 if (Legal->getPrimaryInduction()) 8304 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8305 else { 8306 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8307 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8308 IV = IVRecipe->getVPValue(); 8309 } 8310 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8311 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8312 8313 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8314 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8315 // as a second argument, we only pass the IV here and extract the 8316 // tripcount from the transform state where codegen of the VP instructions 8317 // happen. 8318 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8319 } else { 8320 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8321 } 8322 return BlockMaskCache[BB] = BlockMask; 8323 } 8324 8325 // This is the block mask. We OR all incoming edges. 8326 for (auto *Predecessor : predecessors(BB)) { 8327 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8328 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8329 return BlockMaskCache[BB] = EdgeMask; 8330 8331 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8332 BlockMask = EdgeMask; 8333 continue; 8334 } 8335 8336 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8337 } 8338 8339 return BlockMaskCache[BB] = BlockMask; 8340 } 8341 8342 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8343 VPlanPtr &Plan) { 8344 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8345 "Must be called with either a load or store"); 8346 8347 auto willWiden = [&](ElementCount VF) -> bool { 8348 if (VF.isScalar()) 8349 return false; 8350 LoopVectorizationCostModel::InstWidening Decision = 8351 CM.getWideningDecision(I, VF); 8352 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8353 "CM decision should be taken at this point."); 8354 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8355 return true; 8356 if (CM.isScalarAfterVectorization(I, VF) || 8357 CM.isProfitableToScalarize(I, VF)) 8358 return false; 8359 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8360 }; 8361 8362 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8363 return nullptr; 8364 8365 VPValue *Mask = nullptr; 8366 if (Legal->isMaskRequired(I)) 8367 Mask = createBlockInMask(I->getParent(), Plan); 8368 8369 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8370 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8371 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8372 8373 StoreInst *Store = cast<StoreInst>(I); 8374 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8375 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8376 } 8377 8378 VPWidenIntOrFpInductionRecipe * 8379 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8380 // Check if this is an integer or fp induction. If so, build the recipe that 8381 // produces its scalar and vector values. 
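  // Header phis that are not integer or FP inductions (e.g. reductions or
  // first-order recurrences) fall through and return nullptr; the caller
  // handles them separately, typically via a VPWidenPHIRecipe.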
8382 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8383 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8384 II.getKind() == InductionDescriptor::IK_FpInduction) { 8385 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8386 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8387 return new VPWidenIntOrFpInductionRecipe( 8388 Phi, Start, Casts.empty() ? nullptr : Casts.front()); 8389 } 8390 8391 return nullptr; 8392 } 8393 8394 VPWidenIntOrFpInductionRecipe * 8395 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8396 VPlan &Plan) const { 8397 // Optimize the special case where the source is a constant integer 8398 // induction variable. Notice that we can only optimize the 'trunc' case 8399 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8400 // (c) other casts depend on pointer size. 8401 8402 // Determine whether \p K is a truncation based on an induction variable that 8403 // can be optimized. 8404 auto isOptimizableIVTruncate = 8405 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8406 return [=](ElementCount VF) -> bool { 8407 return CM.isOptimizableIVTruncate(K, VF); 8408 }; 8409 }; 8410 8411 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8412 isOptimizableIVTruncate(I), Range)) { 8413 8414 InductionDescriptor II = 8415 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8416 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8417 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8418 Start, nullptr, I); 8419 } 8420 return nullptr; 8421 } 8422 8423 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8424 // If all incoming values are equal, the incoming VPValue can be used directly 8425 // instead of creating a new VPBlendRecipe. 8426 Value *FirstIncoming = Phi->getIncomingValue(0); 8427 if (all_of(Phi->incoming_values(), [FirstIncoming](const Value *Inc) { 8428 return FirstIncoming == Inc; 8429 })) { 8430 return Plan->getOrAddVPValue(Phi->getIncomingValue(0)); 8431 } 8432 8433 // We know that all PHIs in non-header blocks are converted into selects, so 8434 // we don't have to worry about the insertion order and we can just use the 8435 // builder. At this point we generate the predication tree. There may be 8436 // duplications since this is a simple recursive scan, but future 8437 // optimizations will clean it up. 
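  // The blend's operand list interleaves each incoming value with the mask of
  // its incoming edge (when one exists), i.e. {In0, Mask0, In1, Mask1, ...}.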
8438 SmallVector<VPValue *, 2> Operands; 8439 unsigned NumIncoming = Phi->getNumIncomingValues(); 8440 8441 for (unsigned In = 0; In < NumIncoming; In++) { 8442 VPValue *EdgeMask = 8443 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8444 assert((EdgeMask || NumIncoming == 1) && 8445 "Multiple predecessors with one having a full mask"); 8446 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8447 if (EdgeMask) 8448 Operands.push_back(EdgeMask); 8449 } 8450 return toVPRecipeResult(new VPBlendRecipe(Phi, Operands)); 8451 } 8452 8453 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8454 VPlan &Plan) const { 8455 8456 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8457 [this, CI](ElementCount VF) { 8458 return CM.isScalarWithPredication(CI, VF); 8459 }, 8460 Range); 8461 8462 if (IsPredicated) 8463 return nullptr; 8464 8465 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8466 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8467 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8468 ID == Intrinsic::pseudoprobe || 8469 ID == Intrinsic::experimental_noalias_scope_decl)) 8470 return nullptr; 8471 8472 auto willWiden = [&](ElementCount VF) -> bool { 8473 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8474 // The following case may be scalarized depending on the VF. 8475 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8476 // version of the instruction. 8477 // Is it beneficial to perform intrinsic call compared to lib call? 8478 bool NeedToScalarize = false; 8479 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8480 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; 8481 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8482 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8483 "Cannot have invalid costs while widening"); 8484 return UseVectorIntrinsic || !NeedToScalarize; 8485 }; 8486 8487 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8488 return nullptr; 8489 8490 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8491 } 8492 8493 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8494 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8495 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8496 // Instruction should be widened, unless it is scalar after vectorization, 8497 // scalarization is profitable or it is predicated. 
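  // Note that getDecisionAndClampRange may shrink Range so that a single
  // widen-vs-scalarize answer holds for every VF in the remaining sub-range.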
8498 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8499 return CM.isScalarAfterVectorization(I, VF) || 8500 CM.isProfitableToScalarize(I, VF) || 8501 CM.isScalarWithPredication(I, VF); 8502 }; 8503 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8504 Range); 8505 } 8506 8507 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8508 auto IsVectorizableOpcode = [](unsigned Opcode) { 8509 switch (Opcode) { 8510 case Instruction::Add: 8511 case Instruction::And: 8512 case Instruction::AShr: 8513 case Instruction::BitCast: 8514 case Instruction::FAdd: 8515 case Instruction::FCmp: 8516 case Instruction::FDiv: 8517 case Instruction::FMul: 8518 case Instruction::FNeg: 8519 case Instruction::FPExt: 8520 case Instruction::FPToSI: 8521 case Instruction::FPToUI: 8522 case Instruction::FPTrunc: 8523 case Instruction::FRem: 8524 case Instruction::FSub: 8525 case Instruction::ICmp: 8526 case Instruction::IntToPtr: 8527 case Instruction::LShr: 8528 case Instruction::Mul: 8529 case Instruction::Or: 8530 case Instruction::PtrToInt: 8531 case Instruction::SDiv: 8532 case Instruction::Select: 8533 case Instruction::SExt: 8534 case Instruction::Shl: 8535 case Instruction::SIToFP: 8536 case Instruction::SRem: 8537 case Instruction::Sub: 8538 case Instruction::Trunc: 8539 case Instruction::UDiv: 8540 case Instruction::UIToFP: 8541 case Instruction::URem: 8542 case Instruction::Xor: 8543 case Instruction::ZExt: 8544 return true; 8545 } 8546 return false; 8547 }; 8548 8549 if (!IsVectorizableOpcode(I->getOpcode())) 8550 return nullptr; 8551 8552 // Success: widen this instruction. 8553 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8554 } 8555 8556 VPBasicBlock *VPRecipeBuilder::handleReplication( 8557 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8558 VPlanPtr &Plan) { 8559 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8560 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8561 Range); 8562 8563 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8564 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8565 Range); 8566 8567 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8568 IsUniform, IsPredicated); 8569 setRecipe(I, Recipe); 8570 Plan->addVPValue(I, Recipe); 8571 8572 // Find if I uses a predicated instruction. If so, it will use its scalar 8573 // value. Avoid hoisting the insert-element which packs the scalar value into 8574 // a vector value, as that happens iff all users use the vector value. 8575 for (VPValue *Op : Recipe->operands()) { 8576 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8577 if (!PredR) 8578 continue; 8579 auto *RepR = 8580 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8581 assert(RepR->isPredicated() && 8582 "expected Replicate recipe to be predicated"); 8583 RepR->setAlsoPack(false); 8584 } 8585 8586 // Finalize the recipe for Instr, first if it is not predicated. 8587 if (!IsPredicated) { 8588 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8589 VPBB->appendRecipe(Recipe); 8590 return VPBB; 8591 } 8592 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8593 assert(VPBB->getSuccessors().empty() && 8594 "VPBB has successors when handling predicated replication."); 8595 // Record predicated instructions for above packing optimizations. 
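  // The predicated recipe is wrapped in a triangle-shaped region
  // (pred.<opcode>.entry -> .if -> .continue); subsequent recipes for this
  // basic block are appended to a fresh VPBasicBlock placed after the region.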
8596 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8597 VPBlockUtils::insertBlockAfter(Region, VPBB); 8598 auto *RegSucc = new VPBasicBlock(); 8599 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8600 return RegSucc; 8601 } 8602 8603 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8604 VPRecipeBase *PredRecipe, 8605 VPlanPtr &Plan) { 8606 // Instructions marked for predication are replicated and placed under an 8607 // if-then construct to prevent side-effects. 8608 8609 // Generate recipes to compute the block mask for this region. 8610 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8611 8612 // Build the triangular if-then region. 8613 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8614 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8615 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8616 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8617 auto *PHIRecipe = Instr->getType()->isVoidTy() 8618 ? nullptr 8619 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8620 if (PHIRecipe) { 8621 Plan->removeVPValueFor(Instr); 8622 Plan->addVPValue(Instr, PHIRecipe); 8623 } 8624 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8625 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8626 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8627 8628 // Note: first set Entry as region entry and then connect successors starting 8629 // from it in order, to propagate the "parent" of each VPBasicBlock. 8630 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8631 VPBlockUtils::connectBlocks(Pred, Exit); 8632 8633 return Region; 8634 } 8635 8636 VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8637 VFRange &Range, 8638 VPlanPtr &Plan) { 8639 // First, check for specific widening recipes that deal with calls, memory 8640 // operations, inductions and Phi nodes. 
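  // The candidates below are tried in order; the first one that succeeds for
  // the (possibly clamped) VF range determines the recipe.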
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan));

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
      return toVPRecipeResult(Recipe);

    if (Legal->isReductionVariable(Phi)) {
      RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
      VPValue *StartV =
          Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
      return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
    }

    return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
  }

  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                    cast<TruncInst>(Instr), Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, Plan->mapToVPValues(SI->operands()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, *Plan));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially
  // dead in the vectorized loop. We don't need to vectorize these
  // instructions. For example, original induction update instructions can
  // become dead because we separately emit induction "steps" when generating
  // code for the new loop. Similarly, we create a new latch condition when
  // setting up the structure of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
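  // Using a reverse-post-order traversal (LoopBlocksDFS) guarantees that each
  // block is visited after its predecessors (ignoring the back edge), so the
  // masks and VPValues those predecessors define are already available.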
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      if (auto RecipeOrValue =
              RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
        // If Instr can be simplified to an existing VPValue, use it.
        if (RecipeOrValue.is<VPValue *>()) {
          Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
          continue;
        }
        // Otherwise, add the new recipe.
        VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB =
          RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    // If the target is in a replication region, make sure to move Sink to the
    // block after it, not into the replication region itself.
8856 if (auto *Region = 8857 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8858 if (Region->isReplicator()) { 8859 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8860 VPBasicBlock *NextBlock = 8861 cast<VPBasicBlock>(Region->getSuccessors().front()); 8862 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8863 continue; 8864 } 8865 } 8866 Sink->moveAfter(Target); 8867 } 8868 8869 // Interleave memory: for each Interleave Group we marked earlier as relevant 8870 // for this VPlan, replace the Recipes widening its memory instructions with a 8871 // single VPInterleaveRecipe at its insertion point. 8872 for (auto IG : InterleaveGroups) { 8873 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8874 RecipeBuilder.getRecipe(IG->getInsertPos())); 8875 SmallVector<VPValue *, 4> StoredValues; 8876 for (unsigned i = 0; i < IG->getFactor(); ++i) 8877 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8878 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8879 8880 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8881 Recipe->getMask()); 8882 VPIG->insertBefore(Recipe); 8883 unsigned J = 0; 8884 for (unsigned i = 0; i < IG->getFactor(); ++i) 8885 if (Instruction *Member = IG->getMember(i)) { 8886 if (!Member->getType()->isVoidTy()) { 8887 VPValue *OriginalV = Plan->getVPValue(Member); 8888 Plan->removeVPValueFor(Member); 8889 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8890 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8891 J++; 8892 } 8893 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8894 } 8895 } 8896 8897 // Adjust the recipes for any inloop reductions. 8898 if (Range.Start.isVector()) 8899 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8900 8901 // Finally, if tail is folded by masking, introduce selects between the phi 8902 // and the live-out instruction of each reduction, at the end of the latch. 8903 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8904 Builder.setInsertPoint(VPBB); 8905 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8906 for (auto &Reduction : Legal->getReductionVars()) { 8907 if (CM.isInLoopReduction(Reduction.first)) 8908 continue; 8909 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8910 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8911 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8912 } 8913 } 8914 8915 std::string PlanName; 8916 raw_string_ostream RSO(PlanName); 8917 ElementCount VF = Range.Start; 8918 Plan->addVF(VF); 8919 RSO << "Initial VPlan for VF={" << VF; 8920 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8921 Plan->addVF(VF); 8922 RSO << "," << VF; 8923 } 8924 RSO << "},UF>=1"; 8925 RSO.flush(); 8926 Plan->setName(PlanName); 8927 8928 return Plan; 8929 } 8930 8931 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8932 // Outer loop handling: They may require CFG and instruction level 8933 // transformations before even evaluating whether vectorization is profitable. 8934 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8935 // the vectorization pipeline. 
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
                                             Legal->getInductionVars(),
                                             DeadInstructions, *PSE.getSE());
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert(isa<VPWidenRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9003 : nullptr; 9004 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9005 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9006 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9007 Plan->removeVPValueFor(R); 9008 Plan->addVPValue(R, RedRecipe); 9009 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9010 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9011 WidenRecipe->eraseFromParent(); 9012 9013 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9014 VPRecipeBase *CompareRecipe = 9015 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9016 assert(isa<VPWidenRecipe>(CompareRecipe) && 9017 "Expected to replace a VPWidenSC"); 9018 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9019 "Expected no remaining users"); 9020 CompareRecipe->eraseFromParent(); 9021 } 9022 Chain = R; 9023 } 9024 } 9025 } 9026 9027 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9028 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9029 VPSlotTracker &SlotTracker) const { 9030 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9031 IG->getInsertPos()->printAsOperand(O, false); 9032 O << ", "; 9033 getAddr()->printAsOperand(O, SlotTracker); 9034 VPValue *Mask = getMask(); 9035 if (Mask) { 9036 O << ", "; 9037 Mask->printAsOperand(O, SlotTracker); 9038 } 9039 for (unsigned i = 0; i < IG->getFactor(); ++i) 9040 if (Instruction *I = IG->getMember(i)) 9041 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9042 } 9043 #endif 9044 9045 void VPWidenCallRecipe::execute(VPTransformState &State) { 9046 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9047 *this, State); 9048 } 9049 9050 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9051 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9052 this, *this, InvariantCond, State); 9053 } 9054 9055 void VPWidenRecipe::execute(VPTransformState &State) { 9056 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9057 } 9058 9059 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9060 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9061 *this, State.UF, State.VF, IsPtrLoopInvariant, 9062 IsIndexLoopInvariant, State); 9063 } 9064 9065 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9066 assert(!State.Instance && "Int or FP induction being replicated."); 9067 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9068 getTruncInst(), getVPValue(0), 9069 getCastValue(), State); 9070 } 9071 9072 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9073 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9074 getStartValue(), this, State); 9075 } 9076 9077 void VPBlendRecipe::execute(VPTransformState &State) { 9078 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9079 // We know that all PHIs in non-header blocks are converted into 9080 // selects, so we don't have to worry about the insertion order and we 9081 // can just use the builder. 9082 // At this point we generate the predication tree. There may be 9083 // duplications since this is a simple recursive scan, but future 9084 // optimizations will clean it up. 
9085 9086 unsigned NumIncoming = getNumIncomingValues(); 9087 9088 // Generate a sequence of selects of the form: 9089 // SELECT(Mask3, In3, 9090 // SELECT(Mask2, In2, 9091 // SELECT(Mask1, In1, 9092 // In0))) 9093 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9094 // are essentially undef are taken from In0. 9095 InnerLoopVectorizer::VectorParts Entry(State.UF); 9096 for (unsigned In = 0; In < NumIncoming; ++In) { 9097 for (unsigned Part = 0; Part < State.UF; ++Part) { 9098 // We might have single edge PHIs (blocks) - use an identity 9099 // 'select' for the first PHI operand. 9100 Value *In0 = State.get(getIncomingValue(In), Part); 9101 if (In == 0) 9102 Entry[Part] = In0; // Initialize with the first incoming value. 9103 else { 9104 // Select between the current value and the previous incoming edge 9105 // based on the incoming mask. 9106 Value *Cond = State.get(getMask(In), Part); 9107 Entry[Part] = 9108 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9109 } 9110 } 9111 } 9112 for (unsigned Part = 0; Part < State.UF; ++Part) 9113 State.set(this, Entry[Part], Part); 9114 } 9115 9116 void VPInterleaveRecipe::execute(VPTransformState &State) { 9117 assert(!State.Instance && "Interleave group being replicated."); 9118 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9119 getStoredValues(), getMask()); 9120 } 9121 9122 void VPReductionRecipe::execute(VPTransformState &State) { 9123 assert(!State.Instance && "Reduction being replicated."); 9124 for (unsigned Part = 0; Part < State.UF; ++Part) { 9125 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9126 Value *NewVecOp = State.get(getVecOp(), Part); 9127 if (VPValue *Cond = getCondOp()) { 9128 Value *NewCond = State.get(Cond, Part); 9129 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9130 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9131 Kind, VecTy->getElementType()); 9132 Constant *IdenVec = 9133 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9134 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9135 NewVecOp = Select; 9136 } 9137 Value *NewRed = 9138 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9139 Value *PrevInChain = State.get(getChainOp(), Part); 9140 Value *NextInChain; 9141 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9142 NextInChain = 9143 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9144 NewRed, PrevInChain); 9145 } else { 9146 NextInChain = State.Builder.CreateBinOp( 9147 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9148 PrevInChain); 9149 } 9150 State.set(this, NextInChain, Part); 9151 } 9152 } 9153 9154 void VPReplicateRecipe::execute(VPTransformState &State) { 9155 if (State.Instance) { // Generate a single instance. 9156 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9157 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9158 *State.Instance, IsPredicated, State); 9159 // Insert scalar instance packing it into a vector. 9160 if (AlsoPack && State.VF.isVector()) { 9161 // If we're constructing lane 0, initialize to start from poison. 
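      // The poison vector is then filled in lane by lane as each predicated
      // instance executes and packs its scalar result below.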
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
9228 unsigned Part = State.Instance->Part; 9229 if (State.hasVectorValue(getOperand(0), Part)) { 9230 Value *VectorValue = State.get(getOperand(0), Part); 9231 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9232 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9233 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9234 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9235 if (State.hasVectorValue(this, Part)) 9236 State.reset(this, VPhi, Part); 9237 else 9238 State.set(this, VPhi, Part); 9239 // NOTE: Currently we need to update the value of the operand, so the next 9240 // predicated iteration inserts its generated value in the correct vector. 9241 State.reset(getOperand(0), VPhi, Part); 9242 } else { 9243 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9244 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9245 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9246 PredicatingBB); 9247 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9248 if (State.hasScalarValue(this, *State.Instance)) 9249 State.reset(this, Phi, *State.Instance); 9250 else 9251 State.set(this, Phi, *State.Instance); 9252 // NOTE: Currently we need to update the value of the operand, so the next 9253 // predicated iteration inserts its generated value in the correct vector. 9254 State.reset(getOperand(0), Phi, *State.Instance); 9255 } 9256 } 9257 9258 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9259 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9260 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9261 StoredValue ? nullptr : getVPValue(), 9262 getAddr(), StoredValue, getMask()); 9263 } 9264 9265 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9266 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9267 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9268 // for predication. 9269 static ScalarEpilogueLowering getScalarEpilogueLowering( 9270 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9271 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9272 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9273 LoopVectorizationLegality &LVL) { 9274 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9275 // don't look at hints or options, and don't request a scalar epilogue. 9276 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9277 // LoopAccessInfo (due to code dependency and not being able to reliably get 9278 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9279 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9280 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9281 // back to the old way and vectorize with versioning when forced. See D81345.) 
9282 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9283 PGSOQueryType::IRPass) && 9284 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9285 return CM_ScalarEpilogueNotAllowedOptSize; 9286 9287 // 2) If set, obey the directives 9288 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9289 switch (PreferPredicateOverEpilogue) { 9290 case PreferPredicateTy::ScalarEpilogue: 9291 return CM_ScalarEpilogueAllowed; 9292 case PreferPredicateTy::PredicateElseScalarEpilogue: 9293 return CM_ScalarEpilogueNotNeededUsePredicate; 9294 case PreferPredicateTy::PredicateOrDontVectorize: 9295 return CM_ScalarEpilogueNotAllowedUsePredicate; 9296 }; 9297 } 9298 9299 // 3) If set, obey the hints 9300 switch (Hints.getPredicate()) { 9301 case LoopVectorizeHints::FK_Enabled: 9302 return CM_ScalarEpilogueNotNeededUsePredicate; 9303 case LoopVectorizeHints::FK_Disabled: 9304 return CM_ScalarEpilogueAllowed; 9305 }; 9306 9307 // 4) if the TTI hook indicates this is profitable, request predication. 9308 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9309 LVL.getLAI())) 9310 return CM_ScalarEpilogueNotNeededUsePredicate; 9311 9312 return CM_ScalarEpilogueAllowed; 9313 } 9314 9315 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9316 // If Values have been set for this Def return the one relevant for \p Part. 9317 if (hasVectorValue(Def, Part)) 9318 return Data.PerPartOutput[Def][Part]; 9319 9320 if (!hasScalarValue(Def, {Part, 0})) { 9321 Value *IRV = Def->getLiveInIRValue(); 9322 Value *B = ILV->getBroadcastInstrs(IRV); 9323 set(Def, B, Part); 9324 return B; 9325 } 9326 9327 Value *ScalarValue = get(Def, {Part, 0}); 9328 // If we aren't vectorizing, we can just copy the scalar map values over 9329 // to the vector map. 9330 if (VF.isScalar()) { 9331 set(Def, ScalarValue, Part); 9332 return ScalarValue; 9333 } 9334 9335 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9336 bool IsUniform = RepR && RepR->isUniform(); 9337 9338 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9339 // Check if there is a scalar value for the selected lane. 9340 if (!hasScalarValue(Def, {Part, LastLane})) { 9341 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9342 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9343 "unexpected recipe found to be invariant"); 9344 IsUniform = true; 9345 LastLane = 0; 9346 } 9347 9348 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9349 9350 // Set the insert point after the last scalarized instruction. This 9351 // ensures the insertelement sequence will directly follow the scalar 9352 // definitions. 9353 auto OldIP = Builder.saveIP(); 9354 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9355 Builder.SetInsertPoint(&*NewIP); 9356 9357 // However, if we are vectorizing, we need to construct the vector values. 9358 // If the value is known to be uniform after vectorization, we can just 9359 // broadcast the scalar value corresponding to lane zero for each unroll 9360 // iteration. Otherwise, we construct the vector values using 9361 // insertelement instructions. Since the resulting vectors are stored in 9362 // State, we will only generate the insertelements once. 9363 Value *VectorValue = nullptr; 9364 if (IsUniform) { 9365 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9366 set(Def, VectorValue, Part); 9367 } else { 9368 // Initialize packing with insertelements to start from undef. 
9369 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9370 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9371 set(Def, Undef, Part); 9372 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9373 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9374 VectorValue = get(Def, Part); 9375 } 9376 Builder.restoreIP(OldIP); 9377 return VectorValue; 9378 } 9379 9380 // Process the loop in the VPlan-native vectorization path. This path builds 9381 // VPlan upfront in the vectorization pipeline, which allows to apply 9382 // VPlan-to-VPlan transformations from the very beginning without modifying the 9383 // input LLVM IR. 9384 static bool processLoopInVPlanNativePath( 9385 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9386 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9387 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9388 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9389 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9390 9391 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9392 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9393 return false; 9394 } 9395 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9396 Function *F = L->getHeader()->getParent(); 9397 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9398 9399 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9400 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9401 9402 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9403 &Hints, IAI); 9404 // Use the planner for outer loop vectorization. 9405 // TODO: CM is not used at this point inside the planner. Turn CM into an 9406 // optional argument if we don't need it in the future. 9407 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9408 9409 // Get user vectorization factor. 9410 ElementCount UserVF = Hints.getWidth(); 9411 9412 // Plan how to best vectorize, return the best VF and its cost. 9413 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9414 9415 // If we are stress testing VPlan builds, do not attempt to generate vector 9416 // code. Masked vector code generation support will follow soon. 9417 // Also, do not attempt to vectorize if no vector code will be produced. 9418 if (VPlanBuildStressTest || EnableVPlanPredication || 9419 VectorizationFactor::Disabled() == VF) 9420 return false; 9421 9422 LVP.setBestPlan(VF.Width, 1); 9423 9424 { 9425 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9426 F->getParent()->getDataLayout()); 9427 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9428 &CM, BFI, PSI, Checks); 9429 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9430 << L->getHeader()->getParent()->getName() << "\"\n"); 9431 LVP.executePlan(LB, DT); 9432 } 9433 9434 // Mark the loop as already vectorized to avoid vectorizing again. 9435 Hints.setAlreadyVectorized(); 9436 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9437 return true; 9438 } 9439 9440 // Emit a remark if there are stores to floats that required a floating point 9441 // extension. If the vectorized loop was generated with floating point there 9442 // will be a performance penalty from the conversion overhead and the change in 9443 // the vector width. 
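// Only stores of 'float' values seed the worklist below; the traversal then
// follows operands upwards and reports any fpext (e.g. half -> float) feeding
// the stored value.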
9444 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9445 SmallVector<Instruction *, 4> Worklist; 9446 for (BasicBlock *BB : L->getBlocks()) { 9447 for (Instruction &Inst : *BB) { 9448 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9449 if (S->getValueOperand()->getType()->isFloatTy()) 9450 Worklist.push_back(S); 9451 } 9452 } 9453 } 9454 9455 // Traverse the floating point stores upwards searching, for floating point 9456 // conversions. 9457 SmallPtrSet<const Instruction *, 4> Visited; 9458 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9459 while (!Worklist.empty()) { 9460 auto *I = Worklist.pop_back_val(); 9461 if (!L->contains(I)) 9462 continue; 9463 if (!Visited.insert(I).second) 9464 continue; 9465 9466 // Emit a remark if the floating point store required a floating 9467 // point conversion. 9468 // TODO: More work could be done to identify the root cause such as a 9469 // constant or a function return type and point the user to it. 9470 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9471 ORE->emit([&]() { 9472 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9473 I->getDebugLoc(), L->getHeader()) 9474 << "floating point conversion changes vector width. " 9475 << "Mixed floating point precision requires an up/down " 9476 << "cast that will negatively impact performance."; 9477 }); 9478 9479 for (Use &Op : I->operands()) 9480 if (auto *OpI = dyn_cast<Instruction>(Op)) 9481 Worklist.push_back(OpI); 9482 } 9483 } 9484 9485 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9486 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9487 !EnableLoopInterleaving), 9488 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9489 !EnableLoopVectorization) {} 9490 9491 bool LoopVectorizePass::processLoop(Loop *L) { 9492 assert((EnableVPlanNativePath || L->isInnermost()) && 9493 "VPlan-native path is not enabled. Only process inner loops."); 9494 9495 #ifndef NDEBUG 9496 const std::string DebugLocStr = getDebugLocString(L); 9497 #endif /* NDEBUG */ 9498 9499 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9500 << L->getHeader()->getParent()->getName() << "\" from " 9501 << DebugLocStr << "\n"); 9502 9503 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9504 9505 LLVM_DEBUG( 9506 dbgs() << "LV: Loop hints:" 9507 << " force=" 9508 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9509 ? "disabled" 9510 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9511 ? "enabled" 9512 : "?")) 9513 << " width=" << Hints.getWidth() 9514 << " unroll=" << Hints.getInterleave() << "\n"); 9515 9516 // Function containing loop 9517 Function *F = L->getHeader()->getParent(); 9518 9519 // Looking at the diagnostic output is the only way to determine if a loop 9520 // was vectorized (other than looking at the IR or machine code), so it 9521 // is important to generate an optimization remark for each loop. Most of 9522 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9523 // generated as OptimizationRemark and OptimizationRemarkMissed are 9524 // less verbose reporting vectorized loops and unvectorized loops that may 9525 // benefit from vectorization, respectively. 9526 9527 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9528 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9529 return false; 9530 } 9531 9532 PredicatedScalarEvolution PSE(*SE, *L); 9533 9534 // Check if it is legal to vectorize the loop. 
9535 LoopVectorizationRequirements Requirements(*ORE); 9536 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9537 &Requirements, &Hints, DB, AC, BFI, PSI); 9538 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9539 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9540 Hints.emitRemarkWithHints(); 9541 return false; 9542 } 9543 9544 // Check the function attributes and profiles to find out if this function 9545 // should be optimized for size. 9546 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9547 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9548 9549 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9550 // here. They may require CFG and instruction level transformations before 9551 // even evaluating whether vectorization is profitable. Since we cannot modify 9552 // the incoming IR, we need to build VPlan upfront in the vectorization 9553 // pipeline. 9554 if (!L->isInnermost()) 9555 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9556 ORE, BFI, PSI, Hints); 9557 9558 assert(L->isInnermost() && "Inner loop expected."); 9559 9560 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9561 // count by optimizing for size, to minimize overheads. 9562 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9563 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9564 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9565 << "This loop is worth vectorizing only if no scalar " 9566 << "iteration overheads are incurred."); 9567 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9568 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9569 else { 9570 LLVM_DEBUG(dbgs() << "\n"); 9571 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9572 } 9573 } 9574 9575 // Check the function attributes to see if implicit floats are allowed. 9576 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9577 // an integer loop and the vector instructions selected are purely integer 9578 // vector instructions? 9579 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9580 reportVectorizationFailure( 9581 "Can't vectorize when the NoImplicitFloat attribute is used", 9582 "loop not vectorized due to NoImplicitFloat attribute", 9583 "NoImplicitFloat", ORE, L); 9584 Hints.emitRemarkWithHints(); 9585 return false; 9586 } 9587 9588 // Check if the target supports potentially unsafe FP vectorization. 9589 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9590 // for the target we're vectorizing for, to make sure none of the 9591 // additional fp-math flags can help. 
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!Requirements.canVectorizeFPMath(Hints)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get the user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize; return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Neither vectorize nor interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that vectorizing the loop is not profitable, then
      // interleave it.
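      // Interleaving without vectorization keeps every operation scalar but
      // unrolls the body IC times per iteration to expose more ILP; roughly
      // (illustrative, IC == 2):
      //   for (i = 0; i < n; i += 2) {
      //     a[i]     = b[i]     + c[i];
      //     a[i + 1] = b[i + 1] + c[i + 1];
      //   }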
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *profitable* to vectorize the loop, then do
      // it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar
        // epilogue to be vectorized by executing the plan (potentially with a
        // different factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // The second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of a scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
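    // setAlreadyVectorized() is expected to record this decision as loop
    // metadata (the "llvm.loop.isvectorized" marker), so subsequent runs of
    // the vectorizer leave the transformed loop alone.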
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/DominatorTree analyses with outer
  // loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
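
// For reference, this pass can be exercised in isolation with an invocation
// along the lines of (illustrative):
//   opt -passes=loop-vectorize -pass-remarks=loop-vectorize -S input.ll
// The remarks emitted above ("vectorized loop (vectorization width: ...)",
// "interleaved loop (...)") are what such a run reports for each transformed
// loop.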