//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one; see the illustrative example at
// the end of this header comment.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
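//
// As a rough illustration only (this example is not drawn from the code
// below), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is conceptually rewritten so that each iteration of the generated vector
// loop processes VF consecutive elements with wide loads, adds and stores,
// roughly:
//
//   for (int i = 0; i + VF <= n; i += VF)
//     A[i .. i+VF-1] = B[i .. i+VF-1] + 42;  // pseudocode: one wide add
//
// with any remaining iterations handled by a scalar (or vectorized) epilogue
// loop, or folded into the vector body via predication.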
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly.
// If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. "
             "Mostly useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path.
// It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// 1) Returns exact trip count if it is known.
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if it is known.
/// 4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPValue *StartV, VPValue *Def,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a scalar instance for the part and lane given by \p
  /// Instance. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step.
  /// If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar loop.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for
/// a number of reasons.
/// In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
1352 CM_Interleave, 1353 CM_GatherScatter, 1354 CM_Scalarize 1355 }; 1356 1357 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1358 /// instruction \p I and vector width \p VF. 1359 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1360 InstructionCost Cost) { 1361 assert(VF.isVector() && "Expected VF >=2"); 1362 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1363 } 1364 1365 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1366 /// interleaving group \p Grp and vector width \p VF. 1367 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1368 ElementCount VF, InstWidening W, 1369 InstructionCost Cost) { 1370 assert(VF.isVector() && "Expected VF >=2"); 1371 // Broadcast this decision to all instructions inside the group. 1372 // But the cost will be assigned to one instruction only. 1373 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1374 if (auto *I = Grp->getMember(i)) { 1375 if (Grp->getInsertPos() == I) 1376 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1377 else 1378 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1379 } 1380 } 1381 } 1382 1383 /// Return the cost model decision for the given instruction \p I and vector 1384 /// width \p VF. Return CM_Unknown if this instruction did not pass 1385 /// through the cost modeling. 1386 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1387 assert(VF.isVector() && "Expected VF to be a vector VF"); 1388 // Cost model is not run in the VPlan-native path - return conservative 1389 // result until this changes. 1390 if (EnableVPlanNativePath) 1391 return CM_GatherScatter; 1392 1393 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1394 auto Itr = WideningDecisions.find(InstOnVF); 1395 if (Itr == WideningDecisions.end()) 1396 return CM_Unknown; 1397 return Itr->second.first; 1398 } 1399 1400 /// Return the vectorization cost for the given instruction \p I and vector 1401 /// width \p VF. 1402 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1403 assert(VF.isVector() && "Expected VF >=2"); 1404 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1405 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1406 "The cost is not calculated"); 1407 return WideningDecisions[InstOnVF].second; 1408 } 1409 1410 /// Return true if instruction \p I is an optimizable truncate whose operand 1411 /// is an induction variable. Such a truncate will be removed by adding a new 1412 /// induction variable with the destination type. 1413 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1414 // If the instruction is not a truncate, return false. 1415 auto *Trunc = dyn_cast<TruncInst>(I); 1416 if (!Trunc) 1417 return false; 1418 1419 // Get the source and destination types of the truncate. 1420 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1421 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1422 1423 // If the truncate is free for the given types, return false. Replacing a 1424 // free truncate with an induction variable would add an induction variable 1425 // update instruction to each iteration of the loop. We exclude from this 1426 // check the primary induction variable since it will need an update 1427 // instruction regardless.
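// Illustrative example (hypothetical loop): an i64 counter truncated to i32
// solely to index a 32-bit array can instead be replaced by a separate i32
// induction variable; but if the target reports the i64-to-i32 trunc as free,
// introducing the extra induction update would only add work.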
1428 Value *Op = Trunc->getOperand(0); 1429 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1430 return false; 1431 1432 // If the truncated value is not an induction variable, return false. 1433 return Legal->isInductionPhi(Op); 1434 } 1435 1436 /// Collects the instructions to scalarize for each predicated instruction in 1437 /// the loop. 1438 void collectInstsToScalarize(ElementCount VF); 1439 1440 /// Collect Uniform and Scalar values for the given \p VF. 1441 /// The sets depend on the CM decision for Load/Store instructions 1442 /// that may be vectorized as interleave, gather-scatter or scalarized. 1443 void collectUniformsAndScalars(ElementCount VF) { 1444 // Do the analysis once. 1445 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1446 return; 1447 setCostBasedWideningDecision(VF); 1448 collectLoopUniforms(VF); 1449 collectLoopScalars(VF); 1450 } 1451 1452 /// Returns true if the target machine supports a masked store operation 1453 /// for the given \p DataType and kind of access to \p Ptr. 1454 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1455 return Legal->isConsecutivePtr(Ptr) && 1456 TTI.isLegalMaskedStore(DataType, Alignment); 1457 } 1458 1459 /// Returns true if the target machine supports a masked load operation 1460 /// for the given \p DataType and kind of access to \p Ptr. 1461 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1462 return Legal->isConsecutivePtr(Ptr) && 1463 TTI.isLegalMaskedLoad(DataType, Alignment); 1464 } 1465 1466 /// Returns true if the target machine supports a masked scatter operation 1467 /// for the given \p DataType. 1468 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { 1469 return TTI.isLegalMaskedScatter(DataType, Alignment); 1470 } 1471 1472 /// Returns true if the target machine supports a masked gather operation 1473 /// for the given \p DataType. 1474 bool isLegalMaskedGather(Type *DataType, Align Alignment) const { 1475 return TTI.isLegalMaskedGather(DataType, Alignment); 1476 } 1477 1478 /// Returns true if the target machine can represent \p V as a masked gather 1479 /// or scatter operation. 1480 bool isLegalGatherOrScatter(Value *V) { 1481 bool LI = isa<LoadInst>(V); 1482 bool SI = isa<StoreInst>(V); 1483 if (!LI && !SI) 1484 return false; 1485 auto *Ty = getMemInstValueType(V); 1486 Align Align = getLoadStoreAlignment(V); 1487 return (LI && isLegalMaskedGather(Ty, Align)) || 1488 (SI && isLegalMaskedScatter(Ty, Align)); 1489 } 1490 1491 /// Returns true if the target machine supports all of the reduction 1492 /// variables found for the given VF. 1493 bool canVectorizeReductions(ElementCount VF) { 1494 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1495 RecurrenceDescriptor RdxDesc = Reduction.second; 1496 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1497 })); 1498 } 1499 1500 /// Returns true if \p I is an instruction that will be scalarized with 1501 /// predication. Such instructions include conditional stores and 1502 /// instructions that may divide by zero. 1503 /// If a non-zero VF has been calculated, we check if \p I will be scalarized 1504 /// with predication for that VF. 1505 bool 1506 isScalarWithPredication(Instruction *I, 1507 ElementCount VF = ElementCount::getFixed(1)) const; 1508 1509 // Returns true if \p I is an instruction that will be predicated either 1510 // through scalar predication or masked load/store or masked gather/scatter.
1511 // Superset of instructions that return true for isScalarWithPredication. 1512 bool isPredicatedInst(Instruction *I) { 1513 if (!blockNeedsPredication(I->getParent())) 1514 return false; 1515 // Loads and stores that need some form of masked operation are predicated 1516 // instructions. 1517 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1518 return Legal->isMaskRequired(I); 1519 return isScalarWithPredication(I); 1520 } 1521 1522 /// Returns true if \p I is a memory instruction with consecutive memory 1523 /// access that can be widened. 1524 bool 1525 memoryInstructionCanBeWidened(Instruction *I, 1526 ElementCount VF = ElementCount::getFixed(1)); 1527 1528 /// Returns true if \p I is a memory instruction in an interleaved-group 1529 /// of memory accesses that can be vectorized with wide vector loads/stores 1530 /// and shuffles. 1531 bool 1532 interleavedAccessCanBeWidened(Instruction *I, 1533 ElementCount VF = ElementCount::getFixed(1)); 1534 1535 /// Check if \p Instr belongs to any interleaved access group. 1536 bool isAccessInterleaved(Instruction *Instr) { 1537 return InterleaveInfo.isInterleaved(Instr); 1538 } 1539 1540 /// Get the interleaved access group that \p Instr belongs to. 1541 const InterleaveGroup<Instruction> * 1542 getInterleavedAccessGroup(Instruction *Instr) { 1543 return InterleaveInfo.getInterleaveGroup(Instr); 1544 } 1545 1546 /// Returns true if we're required to use a scalar epilogue for at least 1547 /// the final iteration of the original loop. 1548 bool requiresScalarEpilogue() const { 1549 if (!isScalarEpilogueAllowed()) 1550 return false; 1551 // If we might exit from anywhere but the latch, must run the exiting 1552 // iteration in scalar form. 1553 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1554 return true; 1555 return InterleaveInfo.requiresScalarEpilogue(); 1556 } 1557 1558 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1559 /// loop hint annotation. 1560 bool isScalarEpilogueAllowed() const { 1561 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1562 } 1563 1564 /// Returns true if all loop blocks should be masked to fold tail loop. 1565 bool foldTailByMasking() const { return FoldTailByMasking; } 1566 1567 bool blockNeedsPredication(BasicBlock *BB) const { 1568 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1569 } 1570 1571 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1572 /// nodes to the chain of instructions representing the reductions. Uses a 1573 /// MapVector to ensure deterministic iteration order. 1574 using ReductionChainMap = 1575 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1576 1577 /// Return the chain of instructions representing an inloop reduction. 1578 const ReductionChainMap &getInLoopReductionChains() const { 1579 return InLoopReductionChains; 1580 } 1581 1582 /// Returns true if the Phi is part of an inloop reduction. 1583 bool isInLoopReduction(PHINode *Phi) const { 1584 return InLoopReductionChains.count(Phi); 1585 } 1586 1587 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1588 /// with factor VF. Return the cost of the instruction, including 1589 /// scalarization overhead if it's needed. 1590 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1591 1592 /// Estimate cost of a call instruction CI if it were vectorized with factor 1593 /// VF. Return the cost of the instruction, including scalarization overhead 1594 /// if it's needed. 
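/// (Illustrative example, assuming a hypothetical target and VF = 4: for a call
/// to sinf, this compares the cost of a single vector library call, when the
/// TLI provides a vector variant, against four scalar calls plus the
/// extract/insert overhead.)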
The flag NeedToScalarize shows if the call needs to be 1595 /// scalarized - 1596 /// i.e. either vector version isn't available, or is too expensive. 1597 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1598 bool &NeedToScalarize) const; 1599 1600 /// Invalidates decisions already taken by the cost model. 1601 void invalidateCostModelingDecisions() { 1602 WideningDecisions.clear(); 1603 Uniforms.clear(); 1604 Scalars.clear(); 1605 } 1606 1607 private: 1608 unsigned NumPredStores = 0; 1609 1610 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1611 /// than zero. One is returned if vectorization should best be avoided due 1612 /// to cost. 1613 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1614 ElementCount UserVF); 1615 1616 /// The vectorization cost is a combination of the cost itself and a boolean 1617 /// indicating whether any of the contributing operations will actually 1618 /// operate on 1619 /// vector values after type legalization in the backend. If this latter value 1620 /// is 1621 /// false, then all operations will be scalarized (i.e. no vectorization has 1622 /// actually taken place). 1623 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1624 1625 /// Returns the expected execution cost. The unit of the cost does 1626 /// not matter because we use the 'cost' units to compare different 1627 /// vector widths. The cost that is returned is *not* normalized by 1628 /// the factor width. 1629 VectorizationCostTy expectedCost(ElementCount VF); 1630 1631 /// Returns the execution time cost of an instruction for a given vector 1632 /// width. Vector width of one means scalar. 1633 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1634 1635 /// The cost-computation logic from getInstructionCost which provides 1636 /// the vector type as an output parameter. 1637 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1638 Type *&VectorTy); 1639 1640 /// Return the cost of instructions in an inloop reduction pattern, if I is 1641 /// part of that pattern. 1642 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, 1643 Type *VectorTy, 1644 TTI::TargetCostKind CostKind); 1645 1646 /// Calculate vectorization cost of memory instruction \p I. 1647 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1648 1649 /// The cost computation for scalarized memory instruction. 1650 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1651 1652 /// The cost computation for interleaving group of memory instructions. 1653 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1654 1655 /// The cost computation for Gather/Scatter instruction. 1656 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1657 1658 /// The cost computation for widening instruction \p I with consecutive 1659 /// memory access. 1660 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1661 1662 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1663 /// Load: scalar load + broadcast. 1664 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1665 /// element) 1666 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1667 1668 /// Estimate the overhead of scalarizing an instruction. This is a 1669 /// convenience wrapper for the type-based getScalarizationOverhead API. 
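/// (As a rough illustration, for an instruction scalarized at VF = 4 this
/// accounts for extracting the four lanes of each vector operand and, when the
/// result is needed as a vector, re-inserting the four scalar results.)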
1670 InstructionCost getScalarizationOverhead(Instruction *I, 1671 ElementCount VF) const; 1672 1673 /// Returns whether the instruction is a load or store and will be emitted 1674 /// as a vector operation. 1675 bool isConsecutiveLoadOrStore(Instruction *I); 1676 1677 /// Returns true if an artificially high cost for emulated masked memrefs 1678 /// should be used. 1679 bool useEmulatedMaskMemRefHack(Instruction *I); 1680 1681 /// Map of scalar integer values to the smallest bitwidth they can be legally 1682 /// represented as. The vector equivalents of these values should be truncated 1683 /// to this type. 1684 MapVector<Instruction *, uint64_t> MinBWs; 1685 1686 /// A type representing the costs for instructions if they were to be 1687 /// scalarized rather than vectorized. The entries are Instruction-Cost 1688 /// pairs. 1689 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1690 1691 /// A set containing all BasicBlocks that are known to be present after 1692 /// vectorization as predicated blocks. 1693 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1694 1695 /// Records whether it is allowed to have the original scalar loop execute at 1696 /// least once. This may be needed as a fallback loop in case runtime 1697 /// aliasing/dependence checks fail, or to handle the tail/remainder 1698 /// iterations when the trip count is unknown or is not a multiple of the VF, 1699 /// or as a peel-loop to handle gaps in interleave-groups. 1700 /// Under optsize, and when the trip count is very small, we don't allow any 1701 /// iterations to execute in the scalar loop. 1702 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1703 1704 /// All blocks of the loop are to be masked in order to fold the tail of the scalar iterations. 1705 bool FoldTailByMasking = false; 1706 1707 /// A map holding scalar costs for different vectorization factors. The 1708 /// presence of a cost for an instruction in the mapping indicates that the 1709 /// instruction will be scalarized when vectorizing with the associated 1710 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1711 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1712 1713 /// Holds the instructions known to be uniform after vectorization. 1714 /// The data is collected per VF. 1715 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1716 1717 /// Holds the instructions known to be scalar after vectorization. 1718 /// The data is collected per VF. 1719 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1720 1721 /// Holds the instructions (address computations) that are forced to be 1722 /// scalarized. 1723 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1724 1725 /// PHINodes of the reductions that should be expanded in-loop along with 1726 /// their associated chains of reduction operations, in program order from top 1727 /// (PHI) to bottom. 1728 ReductionChainMap InLoopReductionChains; 1729 1730 /// A map of inloop reduction operations and their immediate chain operand. 1731 /// FIXME: This can be removed once reductions can be costed correctly in 1732 /// vplan. This was added to allow quick lookup of the inloop operations, 1733 /// without having to loop through InLoopReductionChains. 1734 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1735 1736 /// Returns the expected difference in cost from scalarizing the expression 1737 /// feeding a predicated instruction \p PredInst.
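/// (Illustrative example, hypothetical operands: if \p PredInst is a
/// conditional store fed by a single-use division, scalarizing the division
/// together with the store can be cheaper than widening it and then extracting
/// the lanes the predicated store actually needs.)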
The instructions to 1738 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1739 /// non-negative return value implies the expression will be scalarized. 1740 /// Currently, only single-use chains are considered for scalarization. 1741 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1742 ElementCount VF); 1743 1744 /// Collect the instructions that are uniform after vectorization. An 1745 /// instruction is uniform if we represent it with a single scalar value in 1746 /// the vectorized loop corresponding to each vector iteration. Examples of 1747 /// uniform instructions include pointer operands of consecutive or 1748 /// interleaved memory accesses. Note that although uniformity implies an 1749 /// instruction will be scalar, the reverse is not true. In general, a 1750 /// scalarized instruction will be represented by VF scalar values in the 1751 /// vectorized loop, each corresponding to an iteration of the original 1752 /// scalar loop. 1753 void collectLoopUniforms(ElementCount VF); 1754 1755 /// Collect the instructions that are scalar after vectorization. An 1756 /// instruction is scalar if it is known to be uniform or will be scalarized 1757 /// during vectorization. Non-uniform scalarized instructions will be 1758 /// represented by VF values in the vectorized loop, each corresponding to an 1759 /// iteration of the original scalar loop. 1760 void collectLoopScalars(ElementCount VF); 1761 1762 /// Keeps cost model vectorization decision and cost for instructions. 1763 /// Right now it is used for memory instructions only. 1764 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1765 std::pair<InstWidening, InstructionCost>>; 1766 1767 DecisionList WideningDecisions; 1768 1769 /// Returns true if \p V is expected to be vectorized and it needs to be 1770 /// extracted. 1771 bool needsExtract(Value *V, ElementCount VF) const { 1772 Instruction *I = dyn_cast<Instruction>(V); 1773 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1774 TheLoop->isLoopInvariant(I)) 1775 return false; 1776 1777 // Assume we can vectorize V (and hence we need extraction) if the 1778 // scalars are not computed yet. This can happen, because it is called 1779 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1780 // the scalars are collected. That should be a safe assumption in most 1781 // cases, because we check if the operands have vectorizable types 1782 // beforehand in LoopVectorizationLegality. 1783 return Scalars.find(VF) == Scalars.end() || 1784 !isScalarAfterVectorization(I, VF); 1785 }; 1786 1787 /// Returns a range containing only operands needing to be extracted. 1788 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1789 ElementCount VF) const { 1790 return SmallVector<Value *, 4>(make_filter_range( 1791 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1792 } 1793 1794 /// Determines if we have the infrastructure to vectorize loop \p L and its 1795 /// epilogue, assuming the main loop is vectorized by \p VF. 1796 bool isCandidateForEpilogueVectorization(const Loop &L, 1797 const ElementCount VF) const; 1798 1799 /// Returns true if epilogue vectorization is considered profitable, and 1800 /// false otherwise. 1801 /// \p VF is the vectorization factor chosen for the original loop. 1802 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1803 1804 public: 1805 /// The loop that we evaluate. 
1806 Loop *TheLoop; 1807 1808 /// Predicated scalar evolution analysis. 1809 PredicatedScalarEvolution &PSE; 1810 1811 /// Loop Info analysis. 1812 LoopInfo *LI; 1813 1814 /// Vectorization legality. 1815 LoopVectorizationLegality *Legal; 1816 1817 /// Vector target information. 1818 const TargetTransformInfo &TTI; 1819 1820 /// Target Library Info. 1821 const TargetLibraryInfo *TLI; 1822 1823 /// Demanded bits analysis. 1824 DemandedBits *DB; 1825 1826 /// Assumption cache. 1827 AssumptionCache *AC; 1828 1829 /// Interface to emit optimization remarks. 1830 OptimizationRemarkEmitter *ORE; 1831 1832 const Function *TheFunction; 1833 1834 /// Loop Vectorize Hints. 1835 const LoopVectorizeHints *Hints; 1836 1837 /// The interleaved access information contains groups of interleaved accesses 1838 /// with the same stride that are close to each other. 1839 InterleavedAccessInfo &InterleaveInfo; 1840 1841 /// Values to ignore in the cost model. 1842 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1843 1844 /// Values to ignore in the cost model when VF > 1. 1845 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1846 1847 /// Profitable vector factors. 1848 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1849 }; 1850 } // end namespace llvm 1851 1852 /// Helper struct to manage generating runtime checks for vectorization. 1853 /// 1854 /// The runtime checks are created up-front in temporary blocks, un-linked from 1855 /// the existing IR, to allow a more accurate estimate of their cost. After 1856 /// deciding to vectorize, the checks are moved back into the IR. If we decide 1857 /// not to vectorize, the temporary blocks are removed completely. 1858 class GeneratedRTChecks { 1859 /// Basic block which contains the generated SCEV checks, if any. 1860 BasicBlock *SCEVCheckBlock = nullptr; 1861 1862 /// The value representing the result of the generated SCEV checks. If it is 1863 /// nullptr, either no SCEV checks have been generated or they have been used. 1864 Value *SCEVCheckCond = nullptr; 1865 1866 /// Basic block which contains the generated memory runtime checks, if any. 1867 BasicBlock *MemCheckBlock = nullptr; 1868 1869 /// The value representing the result of the generated memory runtime checks. 1870 /// If it is nullptr, either no memory runtime checks have been generated or 1871 /// they have been used. 1872 Instruction *MemRuntimeCheckCond = nullptr; 1873 1874 DominatorTree *DT; 1875 LoopInfo *LI; 1876 1877 SCEVExpander SCEVExp; 1878 SCEVExpander MemCheckExp; 1879 1880 public: 1881 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1882 const DataLayout &DL) 1883 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1884 MemCheckExp(SE, DL, "scev.check") {} 1885 1886 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1887 /// accurately estimate the cost of the runtime checks. The blocks are 1888 /// un-linked from the IR and are added back during vector code generation. If 1889 /// there is no vector code generation, the check blocks are removed 1890 /// completely. 1891 void Create(Loop *L, const LoopAccessInfo &LAI, 1892 const SCEVUnionPredicate &UnionPred) { 1893 1894 BasicBlock *LoopHeader = L->getHeader(); 1895 BasicBlock *Preheader = L->getLoopPreheader(); 1896 1897 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1898 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1899 // may be used by SCEVExpander. The blocks will be un-linked from their 1900 // predecessors and removed from LI & DT at the end of the function.
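// As a rough illustration only: the SCEV block may end up checking, e.g., that
// a narrowed induction variable does not wrap, while the memory block compares
// pointer ranges such as [A, A + N*4) and [B, B + N*4) for overlap. Both are
// created here purely so that their cost can be estimated before committing to
// vectorization.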
1901 if (!UnionPred.isAlwaysTrue()) { 1902 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1903 nullptr, "vector.scevcheck"); 1904 1905 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1906 &UnionPred, SCEVCheckBlock->getTerminator()); 1907 } 1908 1909 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1910 if (RtPtrChecking.Need) { 1911 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1912 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1913 "vector.memcheck"); 1914 1915 std::tie(std::ignore, MemRuntimeCheckCond) = 1916 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1917 RtPtrChecking.getChecks(), MemCheckExp); 1918 assert(MemRuntimeCheckCond && 1919 "no RT checks generated although RtPtrChecking " 1920 "claimed checks are required"); 1921 } 1922 1923 if (!MemCheckBlock && !SCEVCheckBlock) 1924 return; 1925 1926 // Unhook the temporary block with the checks, update various places 1927 // accordingly. 1928 if (SCEVCheckBlock) 1929 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1930 if (MemCheckBlock) 1931 MemCheckBlock->replaceAllUsesWith(Preheader); 1932 1933 if (SCEVCheckBlock) { 1934 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1935 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1936 Preheader->getTerminator()->eraseFromParent(); 1937 } 1938 if (MemCheckBlock) { 1939 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1940 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1941 Preheader->getTerminator()->eraseFromParent(); 1942 } 1943 1944 DT->changeImmediateDominator(LoopHeader, Preheader); 1945 if (MemCheckBlock) { 1946 DT->eraseNode(MemCheckBlock); 1947 LI->removeBlock(MemCheckBlock); 1948 } 1949 if (SCEVCheckBlock) { 1950 DT->eraseNode(SCEVCheckBlock); 1951 LI->removeBlock(SCEVCheckBlock); 1952 } 1953 } 1954 1955 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1956 /// unused. 1957 ~GeneratedRTChecks() { 1958 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1959 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1960 if (!SCEVCheckCond) 1961 SCEVCleaner.markResultUsed(); 1962 1963 if (!MemRuntimeCheckCond) 1964 MemCheckCleaner.markResultUsed(); 1965 1966 if (MemRuntimeCheckCond) { 1967 auto &SE = *MemCheckExp.getSE(); 1968 // Memory runtime check generation creates compares that use expanded 1969 // values. Remove them before running the SCEVExpanderCleaners. 1970 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 1971 if (MemCheckExp.isInsertedInstruction(&I)) 1972 continue; 1973 SE.forgetValue(&I); 1974 SE.eraseValueFromMap(&I); 1975 I.eraseFromParent(); 1976 } 1977 } 1978 MemCheckCleaner.cleanup(); 1979 SCEVCleaner.cleanup(); 1980 1981 if (SCEVCheckCond) 1982 SCEVCheckBlock->eraseFromParent(); 1983 if (MemRuntimeCheckCond) 1984 MemCheckBlock->eraseFromParent(); 1985 } 1986 1987 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 1988 /// adjusts the branches to branch to the vector preheader or \p Bypass, 1989 /// depending on the generated condition. 
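/// (Illustrative resulting CFG: the preheader's single predecessor now branches
/// to SCEVCheckBlock, which branches on the check condition either to \p Bypass,
/// i.e. the scalar loop, or to \p LoopVectorPreHeader.)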
1990 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 1991 BasicBlock *LoopVectorPreHeader, 1992 BasicBlock *LoopExitBlock) { 1993 if (!SCEVCheckCond) 1994 return nullptr; 1995 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 1996 if (C->isZero()) 1997 return nullptr; 1998 1999 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2000 2001 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2002 // Create new preheader for vector loop. 2003 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2004 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2005 2006 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2007 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2008 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2009 SCEVCheckBlock); 2010 2011 DT->addNewBlock(SCEVCheckBlock, Pred); 2012 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2013 2014 ReplaceInstWithInst( 2015 SCEVCheckBlock->getTerminator(), 2016 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2017 // Mark the check as used, to prevent it from being removed during cleanup. 2018 SCEVCheckCond = nullptr; 2019 return SCEVCheckBlock; 2020 } 2021 2022 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2023 /// the branches to branch to the vector preheader or \p Bypass, depending on 2024 /// the generated condition. 2025 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2026 BasicBlock *LoopVectorPreHeader) { 2027 // Check if we generated code that checks at runtime whether arrays overlap. 2028 if (!MemRuntimeCheckCond) 2029 return nullptr; 2030 2031 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2032 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2033 MemCheckBlock); 2034 2035 DT->addNewBlock(MemCheckBlock, Pred); 2036 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2037 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2038 2039 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2040 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2041 2042 ReplaceInstWithInst( 2043 MemCheckBlock->getTerminator(), 2044 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2045 MemCheckBlock->getTerminator()->setDebugLoc( 2046 Pred->getTerminator()->getDebugLoc()); 2047 2048 // Mark the check as used, to prevent it from being removed during cleanup. 2049 MemRuntimeCheckCond = nullptr; 2050 return MemCheckBlock; 2051 } 2052 }; 2053 2054 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2055 // vectorization. The loop needs to be annotated with #pragma omp simd 2056 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the 2057 // vector length information is not provided, vectorization is not considered 2058 // explicit. Interleave hints are not allowed either. These limitations will be 2059 // relaxed in the future. 2060 // Please note that we are currently forced to abuse the pragma 'clang 2061 // loop vectorize' semantics. This pragma provides *auto-vectorization hints* 2062 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2063 // provides *explicit vectorization hints* (LV can bypass legality checks and 2064 // assume that vectorization is legal). However, both hints are implemented 2065 // using the same metadata (llvm.loop.vectorize, processed by 2066 // LoopVectorizeHints). This will be fixed in the future when the native IR 2067 // representation for pragma 'omp simd' is introduced.
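// For illustration (hypothetical C source), an outer loop such as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// carries the required vector-length annotation and is treated as explicitly
// vectorized below; the same loop without vectorize_width, or with an
// interleave hint, is rejected.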
2068 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2069 OptimizationRemarkEmitter *ORE) { 2070 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2071 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2072 2073 // Only outer loops with an explicit vectorization hint are supported. 2074 // Unannotated outer loops are ignored. 2075 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2076 return false; 2077 2078 Function *Fn = OuterLp->getHeader()->getParent(); 2079 if (!Hints.allowVectorization(Fn, OuterLp, 2080 true /*VectorizeOnlyWhenForced*/)) { 2081 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2082 return false; 2083 } 2084 2085 if (Hints.getInterleave() > 1) { 2086 // TODO: Interleave support is future work. 2087 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2088 "outer loops.\n"); 2089 Hints.emitRemarkWithHints(); 2090 return false; 2091 } 2092 2093 return true; 2094 } 2095 2096 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2097 OptimizationRemarkEmitter *ORE, 2098 SmallVectorImpl<Loop *> &V) { 2099 // Collect inner loops and outer loops without irreducible control flow. For 2100 // now, only collect outer loops that have explicit vectorization hints. If we 2101 // are stress testing the VPlan H-CFG construction, we collect the outermost 2102 // loop of every loop nest. 2103 if (L.isInnermost() || VPlanBuildStressTest || 2104 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2105 LoopBlocksRPO RPOT(&L); 2106 RPOT.perform(LI); 2107 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2108 V.push_back(&L); 2109 // TODO: Collect inner loops inside marked outer loops in case 2110 // vectorization fails for the outer loop. Do not invoke 2111 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2112 // already known to be reducible. We can use an inherited attribute for 2113 // that. 2114 return; 2115 } 2116 } 2117 for (Loop *InnerL : L) 2118 collectSupportedLoops(*InnerL, LI, ORE, V); 2119 } 2120 2121 namespace { 2122 2123 /// The LoopVectorize Pass. 2124 struct LoopVectorize : public FunctionPass { 2125 /// Pass identification, replacement for typeid 2126 static char ID; 2127 2128 LoopVectorizePass Impl; 2129 2130 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2131 bool VectorizeOnlyWhenForced = false) 2132 : FunctionPass(ID), 2133 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2134 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2135 } 2136 2137 bool runOnFunction(Function &F) override { 2138 if (skipFunction(F)) 2139 return false; 2140 2141 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2142 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2143 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2144 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2145 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2146 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2147 auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; 2148 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2149 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2150 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2151 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2152 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2153 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2154 2155 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2156 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2157 2158 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2159 GetLAA, *ORE, PSI).MadeAnyChange; 2160 } 2161 2162 void getAnalysisUsage(AnalysisUsage &AU) const override { 2163 AU.addRequired<AssumptionCacheTracker>(); 2164 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2165 AU.addRequired<DominatorTreeWrapperPass>(); 2166 AU.addRequired<LoopInfoWrapperPass>(); 2167 AU.addRequired<ScalarEvolutionWrapperPass>(); 2168 AU.addRequired<TargetTransformInfoWrapperPass>(); 2169 AU.addRequired<AAResultsWrapperPass>(); 2170 AU.addRequired<LoopAccessLegacyAnalysis>(); 2171 AU.addRequired<DemandedBitsWrapperPass>(); 2172 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2173 AU.addRequired<InjectTLIMappingsLegacy>(); 2174 2175 // We currently do not preserve loopinfo/dominator analyses with outer loop 2176 // vectorization. Until this is addressed, mark these analyses as preserved 2177 // only for non-VPlan-native path. 2178 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 2179 if (!EnableVPlanNativePath) { 2180 AU.addPreserved<LoopInfoWrapperPass>(); 2181 AU.addPreserved<DominatorTreeWrapperPass>(); 2182 } 2183 2184 AU.addPreserved<BasicAAWrapperPass>(); 2185 AU.addPreserved<GlobalsAAWrapperPass>(); 2186 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2187 } 2188 }; 2189 2190 } // end anonymous namespace 2191 2192 //===----------------------------------------------------------------------===// 2193 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2194 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2195 //===----------------------------------------------------------------------===// 2196 2197 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2198 // We need to place the broadcast of invariant variables outside the loop, 2199 // but only if it's proven safe to do so. Else, broadcast will be inside 2200 // vector loop body. 2201 Instruction *Instr = dyn_cast<Instruction>(V); 2202 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2203 (!Instr || 2204 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2205 // Place the code for broadcasting invariant variables in the new preheader. 2206 IRBuilder<>::InsertPointGuard Guard(Builder); 2207 if (SafeToHoist) 2208 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2209 2210 // Broadcast the scalar into all locations in the vector. 
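// (Roughly, for VF = 4 and an i32 scalar %x, the splat below is equivalent to:
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer
// exact value names and poison/undef placeholders depend on the IRBuilder.)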
2211 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2212 2213 return Shuf; 2214 } 2215 2216 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2217 const InductionDescriptor &II, Value *Step, Value *Start, 2218 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2219 VPTransformState &State) { 2220 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2221 "Expected either an induction phi-node or a truncate of it!"); 2222 2223 // Construct the initial value of the vector IV in the vector loop preheader 2224 auto CurrIP = Builder.saveIP(); 2225 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2226 if (isa<TruncInst>(EntryVal)) { 2227 assert(Start->getType()->isIntegerTy() && 2228 "Truncation requires an integer type"); 2229 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2230 Step = Builder.CreateTrunc(Step, TruncType); 2231 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2232 } 2233 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2234 Value *SteppedStart = 2235 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2236 2237 // We create vector phi nodes for both integer and floating-point induction 2238 // variables. Here, we determine the kind of arithmetic we will perform. 2239 Instruction::BinaryOps AddOp; 2240 Instruction::BinaryOps MulOp; 2241 if (Step->getType()->isIntegerTy()) { 2242 AddOp = Instruction::Add; 2243 MulOp = Instruction::Mul; 2244 } else { 2245 AddOp = II.getInductionOpcode(); 2246 MulOp = Instruction::FMul; 2247 } 2248 2249 // Multiply the vectorization factor by the step using integer or 2250 // floating-point arithmetic as appropriate. 2251 Value *ConstVF = 2252 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2253 Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF); 2254 2255 // Create a vector splat to use in the induction update. 2256 // 2257 // FIXME: If the step is non-constant, we create the vector splat with 2258 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2259 // handle a constant vector splat. 2260 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2261 Value *SplatVF = isa<Constant>(Mul) 2262 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2263 : Builder.CreateVectorSplat(VF, Mul); 2264 Builder.restoreIP(CurrIP); 2265 2266 // We may need to add the step a number of times, depending on the unroll 2267 // factor. The last of those goes into the PHI. 2268 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2269 &*LoopVectorBody->getFirstInsertionPt()); 2270 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2271 Instruction *LastInduction = VecInd; 2272 for (unsigned Part = 0; Part < UF; ++Part) { 2273 State.set(Def, LastInduction, Part); 2274 2275 if (isa<TruncInst>(EntryVal)) 2276 addMetadata(LastInduction, EntryVal); 2277 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2278 State, Part); 2279 2280 LastInduction = cast<Instruction>( 2281 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2282 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2283 } 2284 2285 // Move the last step to the end of the latch block. This ensures consistent 2286 // placement of all induction updates. 
2287 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2288 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2289 auto *ICmp = cast<Instruction>(Br->getCondition()); 2290 LastInduction->moveBefore(ICmp); 2291 LastInduction->setName("vec.ind.next"); 2292 2293 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2294 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2295 } 2296 2297 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2298 return Cost->isScalarAfterVectorization(I, VF) || 2299 Cost->isProfitableToScalarize(I, VF); 2300 } 2301 2302 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2303 if (shouldScalarizeInstruction(IV)) 2304 return true; 2305 auto isScalarInst = [&](User *U) -> bool { 2306 auto *I = cast<Instruction>(U); 2307 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2308 }; 2309 return llvm::any_of(IV->users(), isScalarInst); 2310 } 2311 2312 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2313 const InductionDescriptor &ID, const Instruction *EntryVal, 2314 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2315 unsigned Part, unsigned Lane) { 2316 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2317 "Expected either an induction phi-node or a truncate of it!"); 2318 2319 // This induction variable is not the phi from the original loop but the 2320 // newly-created IV based on the proof that casted Phi is equal to the 2321 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2322 // re-uses the same InductionDescriptor that original IV uses but we don't 2323 // have to do any recording in this case - that is done when original IV is 2324 // processed. 2325 if (isa<TruncInst>(EntryVal)) 2326 return; 2327 2328 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2329 if (Casts.empty()) 2330 return; 2331 // Only the first Cast instruction in the Casts vector is of interest. 2332 // The rest of the Casts (if exist) have no uses outside the 2333 // induction update chain itself. 2334 if (Lane < UINT_MAX) 2335 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2336 else 2337 State.set(CastDef, VectorLoopVal, Part); 2338 } 2339 2340 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2341 TruncInst *Trunc, VPValue *Def, 2342 VPValue *CastDef, 2343 VPTransformState &State) { 2344 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2345 "Primary induction variable must have an integer type"); 2346 2347 auto II = Legal->getInductionVars().find(IV); 2348 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2349 2350 auto ID = II->second; 2351 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2352 2353 // The value from the original loop to which we are mapping the new induction 2354 // variable. 2355 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2356 2357 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2358 2359 // Generate code for the induction step. 
Note that induction steps are 2360 // required to be loop-invariant 2361 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2362 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2363 "Induction step should be loop invariant"); 2364 if (PSE.getSE()->isSCEVable(IV->getType())) { 2365 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2366 return Exp.expandCodeFor(Step, Step->getType(), 2367 LoopVectorPreHeader->getTerminator()); 2368 } 2369 return cast<SCEVUnknown>(Step)->getValue(); 2370 }; 2371 2372 // The scalar value to broadcast. This is derived from the canonical 2373 // induction variable. If a truncation type is given, truncate the canonical 2374 // induction variable and step. Otherwise, derive these values from the 2375 // induction descriptor. 2376 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2377 Value *ScalarIV = Induction; 2378 if (IV != OldInduction) { 2379 ScalarIV = IV->getType()->isIntegerTy() 2380 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2381 : Builder.CreateCast(Instruction::SIToFP, Induction, 2382 IV->getType()); 2383 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2384 ScalarIV->setName("offset.idx"); 2385 } 2386 if (Trunc) { 2387 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2388 assert(Step->getType()->isIntegerTy() && 2389 "Truncation requires an integer step"); 2390 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2391 Step = Builder.CreateTrunc(Step, TruncType); 2392 } 2393 return ScalarIV; 2394 }; 2395 2396 // Create the vector values from the scalar IV, in the absence of creating a 2397 // vector IV. 2398 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2399 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2400 for (unsigned Part = 0; Part < UF; ++Part) { 2401 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2402 Value *EntryPart = 2403 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2404 ID.getInductionOpcode()); 2405 State.set(Def, EntryPart, Part); 2406 if (Trunc) 2407 addMetadata(EntryPart, Trunc); 2408 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2409 State, Part); 2410 } 2411 }; 2412 2413 // Fast-math-flags propagate from the original induction instruction. 2414 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2415 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2416 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2417 2418 // Now do the actual transformations, and start with creating the step value. 2419 Value *Step = CreateStepValue(ID.getStep()); 2420 if (VF.isZero() || VF.isScalar()) { 2421 Value *ScalarIV = CreateScalarIV(Step); 2422 CreateSplatIV(ScalarIV, Step); 2423 return; 2424 } 2425 2426 // Determine if we want a scalar version of the induction variable. This is 2427 // true if the induction variable itself is not widened, or if it has at 2428 // least one user in the loop that is not widened. 2429 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2430 if (!NeedsScalarIV) { 2431 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2432 State); 2433 return; 2434 } 2435 2436 // Try to create a new independent vector induction variable. If we can't 2437 // create the phi node, we will splat the scalar induction variable in each 2438 // loop iteration. 
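// (Illustrative example for a fixed VF = 4 and an i32 IV starting at 0 with
// step 1: the vector IV built by createVectorIntOrFpInductionPHI is a phi
// starting at <0, 1, 2, 3> and advanced by adding the splat <4, 4, 4, 4> in
// each vector iteration, while buildScalarSteps materializes the per-lane
// scalar values for any users that remain scalar.)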
2439 if (!shouldScalarizeInstruction(EntryVal)) { 2440 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2441 State); 2442 Value *ScalarIV = CreateScalarIV(Step); 2443 // Create scalar steps that can be used by instructions we will later 2444 // scalarize. Note that the addition of the scalar steps will not increase 2445 // the number of instructions in the loop in the common case prior to 2446 // InstCombine. We will be trading one vector extract for each scalar step. 2447 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2448 return; 2449 } 2450 2451 // All IV users are scalar instructions, so only emit a scalar IV, not a 2452 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2453 // predicate used by the masked loads/stores. 2454 Value *ScalarIV = CreateScalarIV(Step); 2455 if (!Cost->isScalarEpilogueAllowed()) 2456 CreateSplatIV(ScalarIV, Step); 2457 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2458 } 2459 2460 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2461 Instruction::BinaryOps BinOp) { 2462 // Create and check the types. 2463 assert(isa<FixedVectorType>(Val->getType()) && 2464 "Creation of scalable step vector not yet supported"); 2465 auto *ValVTy = cast<VectorType>(Val->getType()); 2466 ElementCount VLen = ValVTy->getElementCount(); 2467 2468 Type *STy = Val->getType()->getScalarType(); 2469 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2470 "Induction Step must be an integer or FP"); 2471 assert(Step->getType() == STy && "Step has wrong type"); 2472 2473 SmallVector<Constant *, 8> Indices; 2474 2475 // Create a vector of consecutive numbers from zero to VF. 2476 VectorType *InitVecValVTy = ValVTy; 2477 Type *InitVecValSTy = STy; 2478 if (STy->isFloatingPointTy()) { 2479 InitVecValSTy = 2480 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2481 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2482 } 2483 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2484 2485 // Add on StartIdx 2486 Value *StartIdxSplat = Builder.CreateVectorSplat( 2487 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2488 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2489 2490 if (STy->isIntegerTy()) { 2491 Step = Builder.CreateVectorSplat(VLen, Step); 2492 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2493 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2494 // which can be found from the original scalar operations. 2495 Step = Builder.CreateMul(InitVec, Step); 2496 return Builder.CreateAdd(Val, Step, "induction"); 2497 } 2498 2499 // Floating point induction. 2500 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2501 "Binary Opcode should be specified for FP induction"); 2502 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2503 Step = Builder.CreateVectorSplat(VLen, Step); 2504 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2505 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2506 } 2507 2508 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2509 Instruction *EntryVal, 2510 const InductionDescriptor &ID, 2511 VPValue *Def, VPValue *CastDef, 2512 VPTransformState &State) { 2513 // We shouldn't have to build scalar steps if we aren't vectorizing. 2514 assert(VF.isVector() && "VF should be greater than one"); 2515 // Get the value type and ensure it and the step have the same integer type. 
2516 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2517 assert(ScalarIVTy == Step->getType() && 2518 "Val and Step should have the same type"); 2519 2520 // We build scalar steps for both integer and floating-point induction 2521 // variables. Here, we determine the kind of arithmetic we will perform. 2522 Instruction::BinaryOps AddOp; 2523 Instruction::BinaryOps MulOp; 2524 if (ScalarIVTy->isIntegerTy()) { 2525 AddOp = Instruction::Add; 2526 MulOp = Instruction::Mul; 2527 } else { 2528 AddOp = ID.getInductionOpcode(); 2529 MulOp = Instruction::FMul; 2530 } 2531 2532 // Determine the number of scalars we need to generate for each unroll 2533 // iteration. If EntryVal is uniform, we only need to generate the first 2534 // lane. Otherwise, we generate all VF values. 2535 unsigned Lanes = 2536 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2537 ? 1 2538 : VF.getKnownMinValue(); 2539 assert((!VF.isScalable() || Lanes == 1) && 2540 "Should never scalarize a scalable vector"); 2541 // Compute the scalar steps and save the results in State. 2542 for (unsigned Part = 0; Part < UF; ++Part) { 2543 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2544 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2545 ScalarIVTy->getScalarSizeInBits()); 2546 Value *StartIdx = 2547 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2548 if (ScalarIVTy->isFloatingPointTy()) 2549 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2550 StartIdx = Builder.CreateBinOp( 2551 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2552 // The step returned by `createStepForVF` is a runtime-evaluated value 2553 // when VF is scalable. Otherwise, it should be folded into a Constant. 2554 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2555 "Expected StartIdx to be folded to a constant when VF is not " 2556 "scalable"); 2557 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2558 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2559 State.set(Def, Add, VPIteration(Part, Lane)); 2560 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2561 Part, Lane); 2562 } 2563 } 2564 } 2565 2566 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2567 const VPIteration &Instance, 2568 VPTransformState &State) { 2569 Value *ScalarInst = State.get(Def, Instance); 2570 Value *VectorValue = State.get(Def, Instance.Part); 2571 VectorValue = Builder.CreateInsertElement( 2572 VectorValue, ScalarInst, 2573 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2574 State.set(Def, VectorValue, Instance.Part); 2575 } 2576 2577 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2578 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2579 return Builder.CreateVectorReverse(Vec, "reverse"); 2580 } 2581 2582 // Return whether we allow using masked interleave-groups (for dealing with 2583 // strided loads/stores that reside in predicated blocks, or for dealing 2584 // with gaps). 2585 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2586 // If an override option has been passed in for interleaved accesses, use it. 2587 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2588 return EnableMaskedInterleavedMemAccesses; 2589 2590 return TTI.enableMaskedInterleavedAccessVectorization(); 2591 } 2592 2593 // Try to vectorize the interleave group that \p Instr belongs to. 2594 // 2595 // E.g. 
Translate following interleaved load group (factor = 3): 2596 // for (i = 0; i < N; i+=3) { 2597 // R = Pic[i]; // Member of index 0 2598 // G = Pic[i+1]; // Member of index 1 2599 // B = Pic[i+2]; // Member of index 2 2600 // ... // do something to R, G, B 2601 // } 2602 // To: 2603 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2604 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2605 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2606 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2607 // 2608 // Or translate following interleaved store group (factor = 3): 2609 // for (i = 0; i < N; i+=3) { 2610 // ... do something to R, G, B 2611 // Pic[i] = R; // Member of index 0 2612 // Pic[i+1] = G; // Member of index 1 2613 // Pic[i+2] = B; // Member of index 2 2614 // } 2615 // To: 2616 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2617 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2618 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2619 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2620 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2621 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2622 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2623 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2624 VPValue *BlockInMask) { 2625 Instruction *Instr = Group->getInsertPos(); 2626 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2627 2628 // Prepare for the vector type of the interleaved load/store. 2629 Type *ScalarTy = getMemInstValueType(Instr); 2630 unsigned InterleaveFactor = Group->getFactor(); 2631 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2632 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2633 2634 // Prepare for the new pointers. 2635 SmallVector<Value *, 2> AddrParts; 2636 unsigned Index = Group->getIndex(Instr); 2637 2638 // TODO: extend the masked interleaved-group support to reversed access. 2639 assert((!BlockInMask || !Group->isReverse()) && 2640 "Reversed masked interleave-group not supported."); 2641 2642 // If the group is reverse, adjust the index to refer to the last vector lane 2643 // instead of the first. We adjust the index from the first vector lane, 2644 // rather than directly getting the pointer for lane VF - 1, because the 2645 // pointer operand of the interleaved access is supposed to be uniform. For 2646 // uniform instructions, we're only required to generate a value for the 2647 // first vector lane in each unroll iteration. 2648 assert(!VF.isScalable() && 2649 "scalable vector reverse operation is not implemented"); 2650 if (Group->isReverse()) 2651 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2652 2653 for (unsigned Part = 0; Part < UF; Part++) { 2654 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2655 setDebugLocFromInst(Builder, AddrPart); 2656 2657 // Notice current instruction could be any index. Need to adjust the address 2658 // to the member of index 0. 2659 // 2660 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2661 // b = A[i]; // Member of index 0 2662 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2663 // 2664 // E.g. A[i+1] = a; // Member of index 1 2665 // A[i] = b; // Member of index 0 2666 // A[i+2] = c; // Member of index 2 (Current instruction) 2667 // Current pointer is pointed to A[i+2], adjust it to A[i]. 
2668 2669 bool InBounds = false; 2670 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2671 InBounds = gep->isInBounds(); 2672 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2673 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2674 2675 // Cast to the vector pointer type. 2676 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2677 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2678 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2679 } 2680 2681 setDebugLocFromInst(Builder, Instr); 2682 Value *PoisonVec = PoisonValue::get(VecTy); 2683 2684 Value *MaskForGaps = nullptr; 2685 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2686 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2687 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2688 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2689 } 2690 2691 // Vectorize the interleaved load group. 2692 if (isa<LoadInst>(Instr)) { 2693 // For each unroll part, create a wide load for the group. 2694 SmallVector<Value *, 2> NewLoads; 2695 for (unsigned Part = 0; Part < UF; Part++) { 2696 Instruction *NewLoad; 2697 if (BlockInMask || MaskForGaps) { 2698 assert(useMaskedInterleavedAccesses(*TTI) && 2699 "masked interleaved groups are not allowed."); 2700 Value *GroupMask = MaskForGaps; 2701 if (BlockInMask) { 2702 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2703 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2704 Value *ShuffledMask = Builder.CreateShuffleVector( 2705 BlockInMaskPart, 2706 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2707 "interleaved.mask"); 2708 GroupMask = MaskForGaps 2709 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2710 MaskForGaps) 2711 : ShuffledMask; 2712 } 2713 NewLoad = 2714 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2715 GroupMask, PoisonVec, "wide.masked.vec"); 2716 } 2717 else 2718 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2719 Group->getAlign(), "wide.vec"); 2720 Group->addMetadata(NewLoad); 2721 NewLoads.push_back(NewLoad); 2722 } 2723 2724 // For each member in the group, shuffle out the appropriate data from the 2725 // wide loads. 2726 unsigned J = 0; 2727 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2728 Instruction *Member = Group->getMember(I); 2729 2730 // Skip the gaps in the group. 2731 if (!Member) 2732 continue; 2733 2734 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2735 auto StrideMask = 2736 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2737 for (unsigned Part = 0; Part < UF; Part++) { 2738 Value *StridedVec = Builder.CreateShuffleVector( 2739 NewLoads[Part], StrideMask, "strided.vec"); 2740 2741 // If this member has different type, cast the result type. 2742 if (Member->getType() != ScalarTy) { 2743 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2744 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2745 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2746 } 2747 2748 if (Group->isReverse()) 2749 StridedVec = reverseVector(StridedVec); 2750 2751 State.set(VPDefs[J], StridedVec, Part); 2752 } 2753 ++J; 2754 } 2755 return; 2756 } 2757 2758 // The sub vector type for current instruction. 
2759 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2760 auto *SubVT = VectorType::get(ScalarTy, VF); 2761 2762 // Vectorize the interleaved store group. 2763 for (unsigned Part = 0; Part < UF; Part++) { 2764 // Collect the stored vector from each member. 2765 SmallVector<Value *, 4> StoredVecs; 2766 for (unsigned i = 0; i < InterleaveFactor; i++) { 2767 // Interleaved store group doesn't allow a gap, so each index has a member 2768 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2769 2770 Value *StoredVec = State.get(StoredValues[i], Part); 2771 2772 if (Group->isReverse()) 2773 StoredVec = reverseVector(StoredVec); 2774 2775 // If this member has different type, cast it to a unified type. 2776 2777 if (StoredVec->getType() != SubVT) 2778 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2779 2780 StoredVecs.push_back(StoredVec); 2781 } 2782 2783 // Concatenate all vectors into a wide vector. 2784 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2785 2786 // Interleave the elements in the wide vector. 2787 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2788 Value *IVec = Builder.CreateShuffleVector( 2789 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2790 "interleaved.vec"); 2791 2792 Instruction *NewStoreInstr; 2793 if (BlockInMask) { 2794 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2795 Value *ShuffledMask = Builder.CreateShuffleVector( 2796 BlockInMaskPart, 2797 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2798 "interleaved.mask"); 2799 NewStoreInstr = Builder.CreateMaskedStore( 2800 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2801 } 2802 else 2803 NewStoreInstr = 2804 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2805 2806 Group->addMetadata(NewStoreInstr); 2807 } 2808 } 2809 2810 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2811 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2812 VPValue *StoredValue, VPValue *BlockInMask) { 2813 // Attempt to issue a wide load. 2814 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2815 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2816 2817 assert((LI || SI) && "Invalid Load/Store instruction"); 2818 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2819 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2820 2821 LoopVectorizationCostModel::InstWidening Decision = 2822 Cost->getWideningDecision(Instr, VF); 2823 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2824 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2825 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2826 "CM decision is not to widen the memory instruction"); 2827 2828 Type *ScalarDataTy = getMemInstValueType(Instr); 2829 2830 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2831 const Align Alignment = getLoadStoreAlignment(Instr); 2832 2833 // Determine if the pointer operand of the access is either consecutive or 2834 // reverse consecutive. 2835 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2836 bool ConsecutiveStride = 2837 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2838 bool CreateGatherScatter = 2839 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2840 2841 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2842 // gather/scatter. Otherwise Decision should have been to Scalarize. 
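  // Illustrative sketch (assuming VF = 4 and i32 elements): a consecutive
  // widening decision becomes a single wide access through a bitcast
  // pointer, e.g.
  //   %wide.load = load <4 x i32>, <4 x i32>* %vec.ptr
  // while a gather/scatter decision keeps a vector of pointers and uses the
  // llvm.masked.gather / llvm.masked.scatter intrinsics. A scalarize
  // decision never reaches this code path.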
2843 assert((ConsecutiveStride || CreateGatherScatter) && 2844 "The instruction should be scalarized"); 2845 (void)ConsecutiveStride; 2846 2847 VectorParts BlockInMaskParts(UF); 2848 bool isMaskRequired = BlockInMask; 2849 if (isMaskRequired) 2850 for (unsigned Part = 0; Part < UF; ++Part) 2851 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2852 2853 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2854 // Calculate the pointer for the specific unroll-part. 2855 GetElementPtrInst *PartPtr = nullptr; 2856 2857 bool InBounds = false; 2858 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2859 InBounds = gep->isInBounds(); 2860 if (Reverse) { 2861 // If the address is consecutive but reversed, then the 2862 // wide store needs to start at the last vector element. 2863 // RunTimeVF = VScale * VF.getKnownMinValue() 2864 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2865 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2866 // NumElt = -Part * RunTimeVF 2867 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2868 // LastLane = 1 - RunTimeVF 2869 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2870 PartPtr = 2871 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2872 PartPtr->setIsInBounds(InBounds); 2873 PartPtr = cast<GetElementPtrInst>( 2874 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2875 PartPtr->setIsInBounds(InBounds); 2876 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2877 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2878 } else { 2879 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2880 PartPtr = cast<GetElementPtrInst>( 2881 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2882 PartPtr->setIsInBounds(InBounds); 2883 } 2884 2885 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2886 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2887 }; 2888 2889 // Handle Stores: 2890 if (SI) { 2891 setDebugLocFromInst(Builder, SI); 2892 2893 for (unsigned Part = 0; Part < UF; ++Part) { 2894 Instruction *NewSI = nullptr; 2895 Value *StoredVal = State.get(StoredValue, Part); 2896 if (CreateGatherScatter) { 2897 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2898 Value *VectorGep = State.get(Addr, Part); 2899 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2900 MaskPart); 2901 } else { 2902 if (Reverse) { 2903 // If we store to reverse consecutive memory locations, then we need 2904 // to reverse the order of elements in the stored value. 2905 StoredVal = reverseVector(StoredVal); 2906 // We don't want to update the value in the map as it might be used in 2907 // another expression. So don't call resetVectorValue(StoredVal). 2908 } 2909 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2910 if (isMaskRequired) 2911 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2912 BlockInMaskParts[Part]); 2913 else 2914 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2915 } 2916 addMetadata(NewSI, SI); 2917 } 2918 return; 2919 } 2920 2921 // Handle loads. 2922 assert(LI && "Must have a load instruction"); 2923 setDebugLocFromInst(Builder, LI); 2924 for (unsigned Part = 0; Part < UF; ++Part) { 2925 Value *NewLI; 2926 if (CreateGatherScatter) { 2927 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2928 Value *VectorGep = State.get(Addr, Part); 2929 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2930 nullptr, "wide.masked.gather"); 2931 addMetadata(NewLI, LI); 2932 } else { 2933 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2934 if (isMaskRequired) 2935 NewLI = Builder.CreateMaskedLoad( 2936 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2937 "wide.masked.load"); 2938 else 2939 NewLI = 2940 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2941 2942 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2943 addMetadata(NewLI, LI); 2944 if (Reverse) 2945 NewLI = reverseVector(NewLI); 2946 } 2947 2948 State.set(Def, NewLI, Part); 2949 } 2950 } 2951 2952 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 2953 VPUser &User, 2954 const VPIteration &Instance, 2955 bool IfPredicateInstr, 2956 VPTransformState &State) { 2957 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2958 2959 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2960 // the first lane and part. 2961 if (isa<NoAliasScopeDeclInst>(Instr)) 2962 if (!Instance.isFirstIteration()) 2963 return; 2964 2965 setDebugLocFromInst(Builder, Instr); 2966 2967 // Does this instruction return a value ? 2968 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2969 2970 Instruction *Cloned = Instr->clone(); 2971 if (!IsVoidRetTy) 2972 Cloned->setName(Instr->getName() + ".cloned"); 2973 2974 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 2975 Builder.GetInsertPoint()); 2976 // Replace the operands of the cloned instructions with their scalar 2977 // equivalents in the new loop. 2978 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2979 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2980 auto InputInstance = Instance; 2981 if (!Operand || !OrigLoop->contains(Operand) || 2982 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2983 InputInstance.Lane = VPLane::getFirstLane(); 2984 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2985 Cloned->setOperand(op, NewOp); 2986 } 2987 addNewMetadata(Cloned, Instr); 2988 2989 // Place the cloned scalar in the new loop. 2990 Builder.Insert(Cloned); 2991 2992 State.set(Def, Cloned, Instance); 2993 2994 // If we just cloned a new assumption, add it the assumption cache. 2995 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2996 if (II->getIntrinsicID() == Intrinsic::assume) 2997 AC->registerAssumption(II); 2998 2999 // End if-block. 3000 if (IfPredicateInstr) 3001 PredicatedInstructions.push_back(Cloned); 3002 } 3003 3004 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3005 Value *End, Value *Step, 3006 Instruction *DL) { 3007 BasicBlock *Header = L->getHeader(); 3008 BasicBlock *Latch = L->getLoopLatch(); 3009 // As we're just creating this loop, it's possible no latch exists 3010 // yet. If so, use the header as this will be a single block loop. 3011 if (!Latch) 3012 Latch = Header; 3013 3014 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3015 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3016 setDebugLocFromInst(Builder, OldInst); 3017 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3018 3019 Builder.SetInsertPoint(Latch->getTerminator()); 3020 setDebugLocFromInst(Builder, OldInst); 3021 3022 // Create i+1 and fill the PHINode. 
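  // Rough sketch of the IR produced here (names are illustrative):
  //   %index = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
  //   %index.next = add i64 %index, %step
  //   %cmp = icmp eq i64 %index.next, %end
  //   br i1 %cmp, label %exit, label %header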
3023 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3024 Induction->addIncoming(Start, L->getLoopPreheader()); 3025 Induction->addIncoming(Next, Latch); 3026 // Create the compare. 3027 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3028 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3029 3030 // Now we have two terminators. Remove the old one from the block. 3031 Latch->getTerminator()->eraseFromParent(); 3032 3033 return Induction; 3034 } 3035 3036 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3037 if (TripCount) 3038 return TripCount; 3039 3040 assert(L && "Create Trip Count for null loop."); 3041 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3042 // Find the loop boundaries. 3043 ScalarEvolution *SE = PSE.getSE(); 3044 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3045 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3046 "Invalid loop count"); 3047 3048 Type *IdxTy = Legal->getWidestInductionType(); 3049 assert(IdxTy && "No type for induction"); 3050 3051 // The exit count might have the type of i64 while the phi is i32. This can 3052 // happen if we have an induction variable that is sign extended before the 3053 // compare. The only way that we get a backedge taken count is that the 3054 // induction variable was signed and as such will not overflow. In such a case 3055 // truncation is legal. 3056 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3057 IdxTy->getPrimitiveSizeInBits()) 3058 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3059 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3060 3061 // Get the total trip count from the count by adding 1. 3062 const SCEV *ExitCount = SE->getAddExpr( 3063 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3064 3065 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3066 3067 // Expand the trip count and place the new instructions in the preheader. 3068 // Notice that the pre-header does not change, only the loop body. 3069 SCEVExpander Exp(*SE, DL, "induction"); 3070 3071 // Count holds the overall loop count (N). 3072 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3073 L->getLoopPreheader()->getTerminator()); 3074 3075 if (TripCount->getType()->isPointerTy()) 3076 TripCount = 3077 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3078 L->getLoopPreheader()->getTerminator()); 3079 3080 return TripCount; 3081 } 3082 3083 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3084 if (VectorTripCount) 3085 return VectorTripCount; 3086 3087 Value *TC = getOrCreateTripCount(L); 3088 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3089 3090 Type *Ty = TC->getType(); 3091 // This is where we can make the step a runtime constant. 3092 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3093 3094 // If the tail is to be folded by masking, round the number of iterations N 3095 // up to a multiple of Step instead of rounding down. This is done by first 3096 // adding Step-1 and then rounding down. Note that it's ok if this addition 3097 // overflows: the vector induction variable will eventually wrap to zero given 3098 // that it starts at zero and its Step is a power of two; the loop will then 3099 // exit, with the last early-exit vector comparison also producing all-true. 
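  // Worked example with illustrative values (VF = 4, UF = 2, so Step = 8):
  // for a trip count N = 10 with tail folding, N is first rounded up to
  // 10 + 7 = 17, and 17 - (17 % 8) = 16 lanes' worth of iterations run in
  // the vector loop with the excess lanes masked off. Without tail folding,
  // 10 - (10 % 8) = 8 iterations run in the vector loop and 2 remain for
  // the scalar loop.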
3100 if (Cost->foldTailByMasking()) { 3101 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3102 "VF*UF must be a power of 2 when folding tail by masking"); 3103 assert(!VF.isScalable() && 3104 "Tail folding not yet supported for scalable vectors"); 3105 TC = Builder.CreateAdd( 3106 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3107 } 3108 3109 // Now we need to generate the expression for the part of the loop that the 3110 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3111 // iterations are not required for correctness, or N - Step, otherwise. Step 3112 // is equal to the vectorization factor (number of SIMD elements) times the 3113 // unroll factor (number of SIMD instructions). 3114 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3115 3116 // There are two cases where we need to ensure (at least) the last iteration 3117 // runs in the scalar remainder loop. Thus, if the step evenly divides 3118 // the trip count, we set the remainder to be equal to the step. If the step 3119 // does not evenly divide the trip count, no adjustment is necessary since 3120 // there will already be scalar iterations. Note that the minimum iterations 3121 // check ensures that N >= Step. The cases are: 3122 // 1) If there is a non-reversed interleaved group that may speculatively 3123 // access memory out-of-bounds. 3124 // 2) If any instruction may follow a conditionally taken exit. That is, if 3125 // the loop contains multiple exiting blocks, or a single exiting block 3126 // which is not the latch. 3127 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3128 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3129 R = Builder.CreateSelect(IsZero, Step, R); 3130 } 3131 3132 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3133 3134 return VectorTripCount; 3135 } 3136 3137 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3138 const DataLayout &DL) { 3139 // Verify that V is a vector type with same number of elements as DstVTy. 3140 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3141 unsigned VF = DstFVTy->getNumElements(); 3142 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3143 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3144 Type *SrcElemTy = SrcVecTy->getElementType(); 3145 Type *DstElemTy = DstFVTy->getElementType(); 3146 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3147 "Vector elements must have same size"); 3148 3149 // Do a direct cast if element types are castable. 3150 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3151 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3152 } 3153 // V cannot be directly casted to desired vector type. 3154 // May happen when V is a floating point vector but DstVTy is a vector of 3155 // pointers or vice-versa. Handle this using a two-step bitcast using an 3156 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 
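  // Illustrative example (assuming 64-bit pointers and VF = 4): <4 x double>
  // cannot be bitcast directly to <4 x i8*>, so the cast is performed as
  //   <4 x double> --bitcast--> <4 x i64> --inttoptr--> <4 x i8*>
  // where i64 matches the element size reported by the DataLayout.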
3157 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3158 "Only one type should be a pointer type"); 3159 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3160 "Only one type should be a floating point type"); 3161 Type *IntTy = 3162 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3163 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3164 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3165 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3166 } 3167 3168 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3169 BasicBlock *Bypass) { 3170 Value *Count = getOrCreateTripCount(L); 3171 // Reuse existing vector loop preheader for TC checks. 3172 // Note that new preheader block is generated for vector loop. 3173 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3174 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3175 3176 // Generate code to check if the loop's trip count is less than VF * UF, or 3177 // equal to it in case a scalar epilogue is required; this implies that the 3178 // vector trip count is zero. This check also covers the case where adding one 3179 // to the backedge-taken count overflowed leading to an incorrect trip count 3180 // of zero. In this case we will also jump to the scalar loop. 3181 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3182 : ICmpInst::ICMP_ULT; 3183 3184 // If tail is to be folded, vector loop takes care of all iterations. 3185 Value *CheckMinIters = Builder.getFalse(); 3186 if (!Cost->foldTailByMasking()) { 3187 Value *Step = 3188 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3189 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3190 } 3191 // Create new preheader for vector loop. 3192 LoopVectorPreHeader = 3193 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3194 "vector.ph"); 3195 3196 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3197 DT->getNode(Bypass)->getIDom()) && 3198 "TC check is expected to dominate Bypass"); 3199 3200 // Update dominator for Bypass & LoopExit. 3201 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3202 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3203 3204 ReplaceInstWithInst( 3205 TCCheckBlock->getTerminator(), 3206 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3207 LoopBypassBlocks.push_back(TCCheckBlock); 3208 } 3209 3210 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3211 3212 BasicBlock *const SCEVCheckBlock = 3213 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3214 if (!SCEVCheckBlock) 3215 return nullptr; 3216 3217 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3218 (OptForSizeBasedOnProfile && 3219 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3220 "Cannot SCEV check stride or overflow when optimizing for size"); 3221 3222 3223 // Update dominator only if this is first RT check. 3224 if (LoopBypassBlocks.empty()) { 3225 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3226 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3227 } 3228 3229 LoopBypassBlocks.push_back(SCEVCheckBlock); 3230 AddedSafetyChecks = true; 3231 return SCEVCheckBlock; 3232 } 3233 3234 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3235 BasicBlock *Bypass) { 3236 // VPlan-native path does not do any analysis for runtime checks currently. 
3237 if (EnableVPlanNativePath) 3238 return nullptr; 3239 3240 BasicBlock *const MemCheckBlock = 3241 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3242 3243 // Check if we generated code that checks in runtime if arrays overlap. We put 3244 // the checks into a separate block to make the more common case of few 3245 // elements faster. 3246 if (!MemCheckBlock) 3247 return nullptr; 3248 3249 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3250 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3251 "Cannot emit memory checks when optimizing for size, unless forced " 3252 "to vectorize."); 3253 ORE->emit([&]() { 3254 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3255 L->getStartLoc(), L->getHeader()) 3256 << "Code-size may be reduced by not forcing " 3257 "vectorization, or by source-code modifications " 3258 "eliminating the need for runtime checks " 3259 "(e.g., adding 'restrict')."; 3260 }); 3261 } 3262 3263 LoopBypassBlocks.push_back(MemCheckBlock); 3264 3265 AddedSafetyChecks = true; 3266 3267 // We currently don't use LoopVersioning for the actual loop cloning but we 3268 // still use it to add the noalias metadata. 3269 LVer = std::make_unique<LoopVersioning>( 3270 *Legal->getLAI(), 3271 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3272 DT, PSE.getSE()); 3273 LVer->prepareNoAliasMetadata(); 3274 return MemCheckBlock; 3275 } 3276 3277 Value *InnerLoopVectorizer::emitTransformedIndex( 3278 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3279 const InductionDescriptor &ID) const { 3280 3281 SCEVExpander Exp(*SE, DL, "induction"); 3282 auto Step = ID.getStep(); 3283 auto StartValue = ID.getStartValue(); 3284 assert(Index->getType() == Step->getType() && 3285 "Index type does not match StepValue type"); 3286 3287 // Note: the IR at this point is broken. We cannot use SE to create any new 3288 // SCEV and then expand it, hoping that SCEV's simplification will give us 3289 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3290 // lead to various SCEV crashes. So all we can do is to use builder and rely 3291 // on InstCombine for future simplifications. Here we handle some trivial 3292 // cases only. 3293 auto CreateAdd = [&B](Value *X, Value *Y) { 3294 assert(X->getType() == Y->getType() && "Types don't match!"); 3295 if (auto *CX = dyn_cast<ConstantInt>(X)) 3296 if (CX->isZero()) 3297 return Y; 3298 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3299 if (CY->isZero()) 3300 return X; 3301 return B.CreateAdd(X, Y); 3302 }; 3303 3304 auto CreateMul = [&B](Value *X, Value *Y) { 3305 assert(X->getType() == Y->getType() && "Types don't match!"); 3306 if (auto *CX = dyn_cast<ConstantInt>(X)) 3307 if (CX->isOne()) 3308 return Y; 3309 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3310 if (CY->isOne()) 3311 return X; 3312 return B.CreateMul(X, Y); 3313 }; 3314 3315 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3316 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3317 // the DomTree is not kept up-to-date for additional blocks generated in the 3318 // vector loop. By using the header as insertion point, we guarantee that the 3319 // expanded instructions dominate all their uses. 
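  // For illustration of the cases handled in the switch below (values are
  // hypothetical): an integer induction with Start = 7 and Step = 3 maps
  // Index to 7 + Index * 3; a pointer induction becomes a GEP of
  // Index * Step elements off StartValue; an FP induction reuses the
  // original fadd/fsub, e.g. Start fsub (Step * Index).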
3320 auto GetInsertPoint = [this, &B]() { 3321 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3322 if (InsertBB != LoopVectorBody && 3323 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3324 return LoopVectorBody->getTerminator(); 3325 return &*B.GetInsertPoint(); 3326 }; 3327 3328 switch (ID.getKind()) { 3329 case InductionDescriptor::IK_IntInduction: { 3330 assert(Index->getType() == StartValue->getType() && 3331 "Index type does not match StartValue type"); 3332 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3333 return B.CreateSub(StartValue, Index); 3334 auto *Offset = CreateMul( 3335 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3336 return CreateAdd(StartValue, Offset); 3337 } 3338 case InductionDescriptor::IK_PtrInduction: { 3339 assert(isa<SCEVConstant>(Step) && 3340 "Expected constant step for pointer induction"); 3341 return B.CreateGEP( 3342 StartValue->getType()->getPointerElementType(), StartValue, 3343 CreateMul(Index, 3344 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3345 } 3346 case InductionDescriptor::IK_FpInduction: { 3347 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3348 auto InductionBinOp = ID.getInductionBinOp(); 3349 assert(InductionBinOp && 3350 (InductionBinOp->getOpcode() == Instruction::FAdd || 3351 InductionBinOp->getOpcode() == Instruction::FSub) && 3352 "Original bin op should be defined for FP induction"); 3353 3354 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3355 Value *MulExp = B.CreateFMul(StepValue, Index); 3356 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3357 "induction"); 3358 } 3359 case InductionDescriptor::IK_NoInduction: 3360 return nullptr; 3361 } 3362 llvm_unreachable("invalid enum"); 3363 } 3364 3365 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3366 LoopScalarBody = OrigLoop->getHeader(); 3367 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3368 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3369 assert(LoopExitBlock && "Must have an exit block"); 3370 assert(LoopVectorPreHeader && "Invalid loop structure"); 3371 3372 LoopMiddleBlock = 3373 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3374 LI, nullptr, Twine(Prefix) + "middle.block"); 3375 LoopScalarPreHeader = 3376 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3377 nullptr, Twine(Prefix) + "scalar.ph"); 3378 3379 // Set up branch from middle block to the exit and scalar preheader blocks. 3380 // completeLoopSkeleton will update the condition to use an iteration check, 3381 // if required to decide whether to execute the remainder. 3382 BranchInst *BrInst = 3383 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3384 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3385 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3386 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3387 3388 // We intentionally don't let SplitBlock to update LoopInfo since 3389 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3390 // LoopVectorBody is explicitly added to the correct place few lines later. 3391 LoopVectorBody = 3392 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3393 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3394 3395 // Update dominator for loop exit. 
3396 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3397 3398 // Create and register the new vector loop. 3399 Loop *Lp = LI->AllocateLoop(); 3400 Loop *ParentLoop = OrigLoop->getParentLoop(); 3401 3402 // Insert the new loop into the loop nest and register the new basic blocks 3403 // before calling any utilities such as SCEV that require valid LoopInfo. 3404 if (ParentLoop) { 3405 ParentLoop->addChildLoop(Lp); 3406 } else { 3407 LI->addTopLevelLoop(Lp); 3408 } 3409 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3410 return Lp; 3411 } 3412 3413 void InnerLoopVectorizer::createInductionResumeValues( 3414 Loop *L, Value *VectorTripCount, 3415 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3416 assert(VectorTripCount && L && "Expected valid arguments"); 3417 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3418 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3419 "Inconsistent information about additional bypass."); 3420 // We are going to resume the execution of the scalar loop. 3421 // Go over all of the induction variables that we found and fix the 3422 // PHIs that are left in the scalar version of the loop. 3423 // The starting values of PHI nodes depend on the counter of the last 3424 // iteration in the vectorized loop. 3425 // If we come from a bypass edge then we need to start from the original 3426 // start value. 3427 for (auto &InductionEntry : Legal->getInductionVars()) { 3428 PHINode *OrigPhi = InductionEntry.first; 3429 InductionDescriptor II = InductionEntry.second; 3430 3431 // Create phi nodes to merge from the backedge-taken check block. 3432 PHINode *BCResumeVal = 3433 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3434 LoopScalarPreHeader->getTerminator()); 3435 // Copy original phi DL over to the new one. 3436 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3437 Value *&EndValue = IVEndValues[OrigPhi]; 3438 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3439 if (OrigPhi == OldInduction) { 3440 // We know what the end value is. 3441 EndValue = VectorTripCount; 3442 } else { 3443 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3444 3445 // Fast-math-flags propagate from the original induction instruction. 3446 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3447 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3448 3449 Type *StepType = II.getStep()->getType(); 3450 Instruction::CastOps CastOp = 3451 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3452 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3453 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3454 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3455 EndValue->setName("ind.end"); 3456 3457 // Compute the end value for the additional bypass (if applicable). 3458 if (AdditionalBypass.first) { 3459 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3460 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3461 StepType, true); 3462 CRD = 3463 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3464 EndValueFromAdditionalBypass = 3465 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3466 EndValueFromAdditionalBypass->setName("ind.end"); 3467 } 3468 } 3469 // The new PHI merges the original incoming value, in case of a bypass, 3470 // or the value at the end of the vectorized loop. 
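    // Rough sketch of the resume phi produced here (block and value names
    // are illustrative):
    //   %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
    //                            [ %original.start, %bypass.block ], ...
    // i.e. the end value when arriving from the vector loop's middle block,
    // and the original start value when arriving over a bypass edge.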
3471 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3472 3473 // Fix the scalar body counter (PHI node). 3474 // The old induction's phi node in the scalar body needs the truncated 3475 // value. 3476 for (BasicBlock *BB : LoopBypassBlocks) 3477 BCResumeVal->addIncoming(II.getStartValue(), BB); 3478 3479 if (AdditionalBypass.first) 3480 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3481 EndValueFromAdditionalBypass); 3482 3483 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3484 } 3485 } 3486 3487 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3488 MDNode *OrigLoopID) { 3489 assert(L && "Expected valid loop."); 3490 3491 // The trip counts should be cached by now. 3492 Value *Count = getOrCreateTripCount(L); 3493 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3494 3495 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3496 3497 // Add a check in the middle block to see if we have completed 3498 // all of the iterations in the first vector loop. 3499 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3500 // If tail is to be folded, we know we don't need to run the remainder. 3501 if (!Cost->foldTailByMasking()) { 3502 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3503 Count, VectorTripCount, "cmp.n", 3504 LoopMiddleBlock->getTerminator()); 3505 3506 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3507 // of the corresponding compare because they may have ended up with 3508 // different line numbers and we want to avoid awkward line stepping while 3509 // debugging. Eg. if the compare has got a line number inside the loop. 3510 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3511 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3512 } 3513 3514 // Get ready to start creating new instructions into the vectorized body. 3515 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3516 "Inconsistent vector loop preheader"); 3517 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3518 3519 Optional<MDNode *> VectorizedLoopID = 3520 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3521 LLVMLoopVectorizeFollowupVectorized}); 3522 if (VectorizedLoopID.hasValue()) { 3523 L->setLoopID(VectorizedLoopID.getValue()); 3524 3525 // Do not setAlreadyVectorized if loop attributes have been defined 3526 // explicitly. 3527 return LoopVectorPreHeader; 3528 } 3529 3530 // Keep all loop hints from the original loop on the vector loop (we'll 3531 // replace the vectorizer-specific hints below). 3532 if (MDNode *LID = OrigLoop->getLoopID()) 3533 L->setLoopID(LID); 3534 3535 LoopVectorizeHints Hints(L, true, *ORE); 3536 Hints.setAlreadyVectorized(); 3537 3538 #ifdef EXPENSIVE_CHECKS 3539 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3540 LI->verify(*DT); 3541 #endif 3542 3543 return LoopVectorPreHeader; 3544 } 3545 3546 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3547 /* 3548 In this function we generate a new loop. The new loop will contain 3549 the vectorized instructions while the old loop will continue to run the 3550 scalar remainder. 3551 3552 [ ] <-- loop iteration number check. 3553 / | 3554 / v 3555 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3556 | / | 3557 | / v 3558 || [ ] <-- vector pre header. 3559 |/ | 3560 | v 3561 | [ ] \ 3562 | [ ]_| <-- vector loop. 3563 | | 3564 | v 3565 | -[ ] <--- middle-block. 
3566 | / | 3567 | / v 3568 -|- >[ ] <--- new preheader. 3569 | | 3570 | v 3571 | [ ] \ 3572 | [ ]_| <-- old scalar loop to handle remainder. 3573 \ | 3574 \ v 3575 >[ ] <-- exit block. 3576 ... 3577 */ 3578 3579 // Get the metadata of the original loop before it gets modified. 3580 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3581 3582 // Create an empty vector loop, and prepare basic blocks for the runtime 3583 // checks. 3584 Loop *Lp = createVectorLoopSkeleton(""); 3585 3586 // Now, compare the new count to zero. If it is zero skip the vector loop and 3587 // jump to the scalar loop. This check also covers the case where the 3588 // backedge-taken count is uint##_max: adding one to it will overflow leading 3589 // to an incorrect trip count of zero. In this (rare) case we will also jump 3590 // to the scalar loop. 3591 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3592 3593 // Generate the code to check any assumptions that we've made for SCEV 3594 // expressions. 3595 emitSCEVChecks(Lp, LoopScalarPreHeader); 3596 3597 // Generate the code that checks in runtime if arrays overlap. We put the 3598 // checks into a separate block to make the more common case of few elements 3599 // faster. 3600 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3601 3602 // Some loops have a single integer induction variable, while other loops 3603 // don't. One example is c++ iterators that often have multiple pointer 3604 // induction variables. In the code below we also support a case where we 3605 // don't have a single induction variable. 3606 // 3607 // We try to obtain an induction variable from the original loop as hard 3608 // as possible. However if we don't find one that: 3609 // - is an integer 3610 // - counts from zero, stepping by one 3611 // - is the size of the widest induction variable type 3612 // then we create a new one. 3613 OldInduction = Legal->getPrimaryInduction(); 3614 Type *IdxTy = Legal->getWidestInductionType(); 3615 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3616 // The loop step is equal to the vectorization factor (num of SIMD elements) 3617 // times the unroll factor (num of SIMD instructions). 3618 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3619 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3620 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3621 Induction = 3622 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3623 getDebugLocFromInstOrOperands(OldInduction)); 3624 3625 // Emit phis for the new starting index of the scalar loop. 3626 createInductionResumeValues(Lp, CountRoundDown); 3627 3628 return completeLoopSkeleton(Lp, OrigLoopID); 3629 } 3630 3631 // Fix up external users of the induction variable. At this point, we are 3632 // in LCSSA form, with all external PHIs that use the IV having one input value, 3633 // coming from the remainder loop. We need those PHIs to also have a correct 3634 // value for the IV when arriving directly from the middle block. 3635 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3636 const InductionDescriptor &II, 3637 Value *CountRoundDown, Value *EndValue, 3638 BasicBlock *MiddleBlock) { 3639 // There are two kinds of external IV usages - those that use the value 3640 // computed in the last iteration (the PHI) and those that use the penultimate 3641 // value (the value that feeds into the phi from the loop latch). 3642 // We allow both, but they, obviously, have different values. 
3643 3644 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3645 3646 DenseMap<Value *, Value *> MissingVals; 3647 3648 // An external user of the last iteration's value should see the value that 3649 // the remainder loop uses to initialize its own IV. 3650 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3651 for (User *U : PostInc->users()) { 3652 Instruction *UI = cast<Instruction>(U); 3653 if (!OrigLoop->contains(UI)) { 3654 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3655 MissingVals[UI] = EndValue; 3656 } 3657 } 3658 3659 // An external user of the penultimate value need to see EndValue - Step. 3660 // The simplest way to get this is to recompute it from the constituent SCEVs, 3661 // that is Start + (Step * (CRD - 1)). 3662 for (User *U : OrigPhi->users()) { 3663 auto *UI = cast<Instruction>(U); 3664 if (!OrigLoop->contains(UI)) { 3665 const DataLayout &DL = 3666 OrigLoop->getHeader()->getModule()->getDataLayout(); 3667 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3668 3669 IRBuilder<> B(MiddleBlock->getTerminator()); 3670 3671 // Fast-math-flags propagate from the original induction instruction. 3672 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3673 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3674 3675 Value *CountMinusOne = B.CreateSub( 3676 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3677 Value *CMO = 3678 !II.getStep()->getType()->isIntegerTy() 3679 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3680 II.getStep()->getType()) 3681 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3682 CMO->setName("cast.cmo"); 3683 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3684 Escape->setName("ind.escape"); 3685 MissingVals[UI] = Escape; 3686 } 3687 } 3688 3689 for (auto &I : MissingVals) { 3690 PHINode *PHI = cast<PHINode>(I.first); 3691 // One corner case we have to handle is two IVs "chasing" each-other, 3692 // that is %IV2 = phi [...], [ %IV1, %latch ] 3693 // In this case, if IV1 has an external use, we need to avoid adding both 3694 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3695 // don't already have an incoming value for the middle block. 3696 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3697 PHI->addIncoming(I.second, MiddleBlock); 3698 } 3699 } 3700 3701 namespace { 3702 3703 struct CSEDenseMapInfo { 3704 static bool canHandle(const Instruction *I) { 3705 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3706 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3707 } 3708 3709 static inline Instruction *getEmptyKey() { 3710 return DenseMapInfo<Instruction *>::getEmptyKey(); 3711 } 3712 3713 static inline Instruction *getTombstoneKey() { 3714 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3715 } 3716 3717 static unsigned getHashValue(const Instruction *I) { 3718 assert(canHandle(I) && "Unknown instruction!"); 3719 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3720 I->value_op_end())); 3721 } 3722 3723 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3724 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3725 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3726 return LHS == RHS; 3727 return LHS->isIdenticalTo(RHS); 3728 } 3729 }; 3730 3731 } // end anonymous namespace 3732 3733 ///Perform cse of induction variable instructions. 
3734 static void cse(BasicBlock *BB) { 3735 // Perform simple cse. 3736 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3737 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3738 Instruction *In = &*I++; 3739 3740 if (!CSEDenseMapInfo::canHandle(In)) 3741 continue; 3742 3743 // Check if we can replace this instruction with any of the 3744 // visited instructions. 3745 if (Instruction *V = CSEMap.lookup(In)) { 3746 In->replaceAllUsesWith(V); 3747 In->eraseFromParent(); 3748 continue; 3749 } 3750 3751 CSEMap[In] = In; 3752 } 3753 } 3754 3755 InstructionCost 3756 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3757 bool &NeedToScalarize) const { 3758 Function *F = CI->getCalledFunction(); 3759 Type *ScalarRetTy = CI->getType(); 3760 SmallVector<Type *, 4> Tys, ScalarTys; 3761 for (auto &ArgOp : CI->arg_operands()) 3762 ScalarTys.push_back(ArgOp->getType()); 3763 3764 // Estimate cost of scalarized vector call. The source operands are assumed 3765 // to be vectors, so we need to extract individual elements from there, 3766 // execute VF scalar calls, and then gather the result into the vector return 3767 // value. 3768 InstructionCost ScalarCallCost = 3769 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3770 if (VF.isScalar()) 3771 return ScalarCallCost; 3772 3773 // Compute corresponding vector type for return value and arguments. 3774 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3775 for (Type *ScalarTy : ScalarTys) 3776 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3777 3778 // Compute costs of unpacking argument values for the scalar calls and 3779 // packing the return values to a vector. 3780 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3781 3782 InstructionCost Cost = 3783 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3784 3785 // If we can't emit a vector call for this function, then the currently found 3786 // cost is the cost we need to return. 3787 NeedToScalarize = true; 3788 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3789 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3790 3791 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3792 return Cost; 3793 3794 // If the corresponding vector cost is cheaper, return its cost. 
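  // Worked example with hypothetical costs: for VF = 4, a scalar call cost
  // of 10 and a scalarization overhead of 6 give 4 * 10 + 6 = 46 for the
  // scalarized form; if a vector variant of the callee costs 20, it is
  // cheaper, so NeedToScalarize is cleared and 20 is returned.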
3795 InstructionCost VectorCallCost = 3796 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3797 if (VectorCallCost < Cost) { 3798 NeedToScalarize = false; 3799 Cost = VectorCallCost; 3800 } 3801 return Cost; 3802 } 3803 3804 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3805 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3806 return Elt; 3807 return VectorType::get(Elt, VF); 3808 } 3809 3810 InstructionCost 3811 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3812 ElementCount VF) const { 3813 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3814 assert(ID && "Expected intrinsic call!"); 3815 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3816 FastMathFlags FMF; 3817 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3818 FMF = FPMO->getFastMathFlags(); 3819 3820 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3821 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3822 SmallVector<Type *> ParamTys; 3823 std::transform(FTy->param_begin(), FTy->param_end(), 3824 std::back_inserter(ParamTys), 3825 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3826 3827 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3828 dyn_cast<IntrinsicInst>(CI)); 3829 return TTI.getIntrinsicInstrCost(CostAttrs, 3830 TargetTransformInfo::TCK_RecipThroughput); 3831 } 3832 3833 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3834 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3835 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3836 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3837 } 3838 3839 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3840 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3841 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3842 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3843 } 3844 3845 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3846 // For every instruction `I` in MinBWs, truncate the operands, create a 3847 // truncated version of `I` and reextend its result. InstCombine runs 3848 // later and will remove any ext/trunc pairs. 3849 SmallPtrSet<Value *, 4> Erased; 3850 for (const auto &KV : Cost->getMinimalBitwidths()) { 3851 // If the value wasn't vectorized, we must maintain the original scalar 3852 // type. The absence of the value from State indicates that it 3853 // wasn't vectorized. 3854 VPValue *Def = State.Plan->getVPValue(KV.first); 3855 if (!State.hasAnyVectorValue(Def)) 3856 continue; 3857 for (unsigned Part = 0; Part < UF; ++Part) { 3858 Value *I = State.get(Def, Part); 3859 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3860 continue; 3861 Type *OriginalTy = I->getType(); 3862 Type *ScalarTruncatedTy = 3863 IntegerType::get(OriginalTy->getContext(), KV.second); 3864 auto *TruncatedTy = FixedVectorType::get( 3865 ScalarTruncatedTy, 3866 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3867 if (TruncatedTy == OriginalTy) 3868 continue; 3869 3870 IRBuilder<> B(cast<Instruction>(I)); 3871 auto ShrinkOperand = [&](Value *V) -> Value * { 3872 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3873 if (ZI->getSrcTy() == TruncatedTy) 3874 return ZI->getOperand(0); 3875 return B.CreateZExtOrTrunc(V, TruncatedTy); 3876 }; 3877 3878 // The actual instruction modification depends on the instruction type, 3879 // unfortunately. 
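      // Illustrative example (assuming VF = 4 and a minimal bit width of 8):
      //   %a = add <4 x i32> %x, %y
      // is rewritten as
      //   %x.tr = trunc <4 x i32> %x to <4 x i8>
      //   %y.tr = trunc <4 x i32> %y to <4 x i8>
      //   %a.tr = add <4 x i8> %x.tr, %y.tr
      //   %a.ext = zext <4 x i8> %a.tr to <4 x i32>
      // with InstCombine expected to remove redundant ext/trunc pairs later.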
3880 Value *NewI = nullptr; 3881 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3882 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3883 ShrinkOperand(BO->getOperand(1))); 3884 3885 // Any wrapping introduced by shrinking this operation shouldn't be 3886 // considered undefined behavior. So, we can't unconditionally copy 3887 // arithmetic wrapping flags to NewI. 3888 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3889 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3890 NewI = 3891 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3892 ShrinkOperand(CI->getOperand(1))); 3893 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3894 NewI = B.CreateSelect(SI->getCondition(), 3895 ShrinkOperand(SI->getTrueValue()), 3896 ShrinkOperand(SI->getFalseValue())); 3897 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3898 switch (CI->getOpcode()) { 3899 default: 3900 llvm_unreachable("Unhandled cast!"); 3901 case Instruction::Trunc: 3902 NewI = ShrinkOperand(CI->getOperand(0)); 3903 break; 3904 case Instruction::SExt: 3905 NewI = B.CreateSExtOrTrunc( 3906 CI->getOperand(0), 3907 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3908 break; 3909 case Instruction::ZExt: 3910 NewI = B.CreateZExtOrTrunc( 3911 CI->getOperand(0), 3912 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3913 break; 3914 } 3915 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3916 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3917 ->getNumElements(); 3918 auto *O0 = B.CreateZExtOrTrunc( 3919 SI->getOperand(0), 3920 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3921 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3922 ->getNumElements(); 3923 auto *O1 = B.CreateZExtOrTrunc( 3924 SI->getOperand(1), 3925 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3926 3927 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3928 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3929 // Don't do anything with the operands, just extend the result. 3930 continue; 3931 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3932 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3933 ->getNumElements(); 3934 auto *O0 = B.CreateZExtOrTrunc( 3935 IE->getOperand(0), 3936 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3937 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3938 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3939 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3940 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3941 ->getNumElements(); 3942 auto *O0 = B.CreateZExtOrTrunc( 3943 EE->getOperand(0), 3944 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3945 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3946 } else { 3947 // If we don't know what to do, be conservative and don't do anything. 3948 continue; 3949 } 3950 3951 // Lastly, extend the result. 3952 NewI->takeName(cast<Instruction>(I)); 3953 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3954 I->replaceAllUsesWith(Res); 3955 cast<Instruction>(I)->eraseFromParent(); 3956 Erased.insert(I); 3957 State.reset(Def, Res, Part); 3958 } 3959 } 3960 3961 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3962 for (const auto &KV : Cost->getMinimalBitwidths()) { 3963 // If the value wasn't vectorized, we must maintain the original scalar 3964 // type. The absence of the value from State indicates that it 3965 // wasn't vectorized. 
3966 VPValue *Def = State.Plan->getVPValue(KV.first); 3967 if (!State.hasAnyVectorValue(Def)) 3968 continue; 3969 for (unsigned Part = 0; Part < UF; ++Part) { 3970 Value *I = State.get(Def, Part); 3971 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3972 if (Inst && Inst->use_empty()) { 3973 Value *NewI = Inst->getOperand(0); 3974 Inst->eraseFromParent(); 3975 State.reset(Def, NewI, Part); 3976 } 3977 } 3978 } 3979 } 3980 3981 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 3982 // Insert truncates and extends for any truncated instructions as hints to 3983 // InstCombine. 3984 if (VF.isVector()) 3985 truncateToMinimalBitwidths(State); 3986 3987 // Fix widened non-induction PHIs by setting up the PHI operands. 3988 if (OrigPHIsToFix.size()) { 3989 assert(EnableVPlanNativePath && 3990 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3991 fixNonInductionPHIs(State); 3992 } 3993 3994 // At this point every instruction in the original loop is widened to a 3995 // vector form. Now we need to fix the recurrences in the loop. These PHI 3996 // nodes are currently empty because we did not want to introduce cycles. 3997 // This is the second stage of vectorizing recurrences. 3998 fixCrossIterationPHIs(State); 3999 4000 // Forget the original basic block. 4001 PSE.getSE()->forgetLoop(OrigLoop); 4002 4003 // Fix-up external users of the induction variables. 4004 for (auto &Entry : Legal->getInductionVars()) 4005 fixupIVUsers(Entry.first, Entry.second, 4006 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4007 IVEndValues[Entry.first], LoopMiddleBlock); 4008 4009 fixLCSSAPHIs(State); 4010 for (Instruction *PI : PredicatedInstructions) 4011 sinkScalarOperands(&*PI); 4012 4013 // Remove redundant induction instructions. 4014 cse(LoopVectorBody); 4015 4016 // Set/update profile weights for the vector and remainder loops as original 4017 // loop iterations are now distributed among them. Note that original loop 4018 // represented by LoopScalarBody becomes remainder loop after vectorization. 4019 // 4020 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 4021 // end up getting slightly roughened result but that should be OK since 4022 // profile is not inherently precise anyway. Note also possible bypass of 4023 // vector code caused by legality checks is ignored, assigning all the weight 4024 // to the vector loop, optimistically. 4025 // 4026 // For scalable vectorization we can't know at compile time how many iterations 4027 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4028 // vscale of '1'. 4029 setProfileInfoAfterUnrolling( 4030 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4031 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4032 } 4033 4034 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4035 // In order to support recurrences we need to be able to vectorize Phi nodes. 4036 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4037 // stage #2: We now need to fix the recurrences by adding incoming edges to 4038 // the currently empty PHI nodes. At this point every instruction in the 4039 // original loop is widened to a vector form so we can use them to construct 4040 // the incoming edges. 4041 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 4042 // Handle first-order recurrences and reductions that need to be fixed. 
4043 if (Legal->isFirstOrderRecurrence(&Phi)) 4044 fixFirstOrderRecurrence(&Phi, State); 4045 else if (Legal->isReductionVariable(&Phi)) 4046 fixReduction(&Phi, State); 4047 } 4048 } 4049 4050 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4051 VPTransformState &State) { 4052 // This is the second phase of vectorizing first-order recurrences. An 4053 // overview of the transformation is described below. Suppose we have the 4054 // following loop. 4055 // 4056 // for (int i = 0; i < n; ++i) 4057 // b[i] = a[i] - a[i - 1]; 4058 // 4059 // There is a first-order recurrence on "a". For this loop, the shorthand 4060 // scalar IR looks like: 4061 // 4062 // scalar.ph: 4063 // s_init = a[-1] 4064 // br scalar.body 4065 // 4066 // scalar.body: 4067 // i = phi [0, scalar.ph], [i+1, scalar.body] 4068 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4069 // s2 = a[i] 4070 // b[i] = s2 - s1 4071 // br cond, scalar.body, ... 4072 // 4073 // In this example, s1 is a recurrence because its value depends on the 4074 // previous iteration. In the first phase of vectorization, we created a 4075 // temporary value for s1. We now complete the vectorization and produce the 4076 // shorthand vector IR shown below (for VF = 4, UF = 1). 4077 // 4078 // vector.ph: 4079 // v_init = vector(..., ..., ..., a[-1]) 4080 // br vector.body 4081 // 4082 // vector.body 4083 // i = phi [0, vector.ph], [i+4, vector.body] 4084 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4085 // v2 = a[i, i+1, i+2, i+3]; 4086 // v3 = vector(v1(3), v2(0, 1, 2)) 4087 // b[i, i+1, i+2, i+3] = v2 - v3 4088 // br cond, vector.body, middle.block 4089 // 4090 // middle.block: 4091 // x = v2(3) 4092 // br scalar.ph 4093 // 4094 // scalar.ph: 4095 // s_init = phi [x, middle.block], [a[-1], otherwise] 4096 // br scalar.body 4097 // 4098 // After the vector loop finishes executing, we extract the next value of 4099 // the recurrence (x) to use as the initial value in the scalar loop. 4100 4101 // Get the original loop preheader and single loop latch. 4102 auto *Preheader = OrigLoop->getLoopPreheader(); 4103 auto *Latch = OrigLoop->getLoopLatch(); 4104 4105 // Get the initial and previous values of the scalar recurrence. 4106 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4107 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4108 4109 // Create a vector from the initial value. 4110 auto *VectorInit = ScalarInit; 4111 if (VF.isVector()) { 4112 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4113 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4114 VectorInit = Builder.CreateInsertElement( 4115 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4116 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4117 } 4118 4119 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4120 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4121 // We constructed a temporary phi node in the first phase of vectorization. 4122 // This phi node will eventually be deleted. 4123 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4124 4125 // Create a phi node for the new recurrence. The current value will either be 4126 // the initial value inserted into a vector or a loop-varying vector value. 4127 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4128 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4129 4130 // Get the vectorized previous value of the last part UF - 1.
It appears last 4131 // among all unrolled iterations, due to the order of their construction. 4132 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4133 4134 // Find and set the insertion point after the previous value if it is an 4135 // instruction. 4136 BasicBlock::iterator InsertPt; 4137 // Note that the previous value may have been constant-folded so it is not 4138 // guaranteed to be an instruction in the vector loop. 4139 // FIXME: Loop invariant values do not form recurrences. We should deal with 4140 // them earlier. 4141 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4142 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4143 else { 4144 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4145 if (isa<PHINode>(PreviousLastPart)) 4146 // If the previous value is a phi node, we should insert after all the phi 4147 // nodes in the block containing the PHI to avoid breaking basic block 4148 // verification. Note that the basic block may be different to 4149 // LoopVectorBody, in case we predicate the loop. 4150 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4151 else 4152 InsertPt = ++PreviousInst->getIterator(); 4153 } 4154 Builder.SetInsertPoint(&*InsertPt); 4155 4156 // We will construct a vector for the recurrence by combining the values for 4157 // the current and previous iterations. This is the required shuffle mask. 4158 assert(!VF.isScalable()); 4159 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4160 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4161 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4162 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4163 4164 // The vector from which to take the initial value for the current iteration 4165 // (actual or unrolled). Initially, this is the vector phi node. 4166 Value *Incoming = VecPhi; 4167 4168 // Shuffle the current and previous vector and update the vector parts. 4169 for (unsigned Part = 0; Part < UF; ++Part) { 4170 Value *PreviousPart = State.get(PreviousDef, Part); 4171 Value *PhiPart = State.get(PhiDef, Part); 4172 auto *Shuffle = 4173 VF.isVector() 4174 ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4175 : Incoming; 4176 PhiPart->replaceAllUsesWith(Shuffle); 4177 cast<Instruction>(PhiPart)->eraseFromParent(); 4178 State.reset(PhiDef, Shuffle, Part); 4179 Incoming = PreviousPart; 4180 } 4181 4182 // Fix the latch value of the new recurrence in the vector loop. 4183 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4184 4185 // Extract the last vector element in the middle block. This will be the 4186 // initial value for the recurrence when jumping to the scalar loop. 4187 auto *ExtractForScalar = Incoming; 4188 if (VF.isVector()) { 4189 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4190 ExtractForScalar = Builder.CreateExtractElement( 4191 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4192 "vector.recur.extract"); 4193 } 4194 // Extract the second last element in the middle block if the 4195 // Phi is used outside the loop. We need to extract the phi itself 4196 // and not the last element (the phi update in the current iteration). This 4197 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4198 // when the scalar loop is not run at all. 
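  // In the VF = 4, UF = 1 shorthand example earlier in this function, the
  // extract created above is v2(3) (the last lane) and the extract created
  // below is v2(2) (the second-to-last lane). This is only an illustrative
  // sketch of the values involved.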
4199 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4200 if (VF.isVector()) 4201 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4202 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4203 "vector.recur.extract.for.phi"); 4204 // When the loop is unrolled without vectorizing, initialize 4205 // ExtractForPhiUsedOutsideLoop with the unrolled value of `Incoming` just 4206 // prior to the last one. This is analogous to the vectorized case above: 4207 // extracting the second-to-last element when VF > 1. 4208 else if (UF > 1) 4209 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4210 4211 // Fix the initial value of the original recurrence in the scalar loop. 4212 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4213 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4214 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4215 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4216 Start->addIncoming(Incoming, BB); 4217 } 4218 4219 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4220 Phi->setName("scalar.recur"); 4221 4222 // Finally, fix users of the recurrence outside the loop. The users will need 4223 // either the last value of the scalar recurrence or the last value of the 4224 // vector recurrence we extracted in the middle block. Since the loop is in 4225 // LCSSA form, we just need to find all the phi nodes for the original scalar 4226 // recurrence in the exit block, and then add an edge for the middle block. 4227 // Note that LCSSA does not imply single entry when the original scalar loop 4228 // had multiple exiting edges (as we always run the last iteration in the 4229 // scalar epilogue); in that case, the exiting path through middle will be 4230 // dynamically dead and the value picked for the phi doesn't matter. 4231 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4232 if (any_of(LCSSAPhi.incoming_values(), 4233 [Phi](Value *V) { return V == Phi; })) 4234 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4235 } 4236 4237 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) { 4238 // Get its reduction variable descriptor. 4239 assert(Legal->isReductionVariable(Phi) && 4240 "Unable to find the reduction variable"); 4241 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4242 4243 RecurKind RK = RdxDesc.getRecurrenceKind(); 4244 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4245 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4246 setDebugLocFromInst(Builder, ReductionStartValue); 4247 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4248 4249 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); 4250 // This is the vector-clone of the value that leaves the loop. 4251 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4252 4253 // Wrap flags are in general invalid after vectorization, clear them. 4254 clearReductionWrapFlags(RdxDesc, State); 4255 4256 // Fix the vector-loop phi. 4257 4258 // Reductions do not have to start at zero. They can start with 4259 // any loop-invariant value.
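  // Illustrative sketch (VF = 4, UF = 1, integer add reduction starting at
  // %init): after the latch edge is added below, the vector-loop phi looks
  // roughly like
  //   %vec.phi = phi <4 x i32> [ <%init, 0, 0, 0>, %vector.ph ],
  //                            [ %vec.add, %vector.latch ]
  // with the start value in lane 0 and the add identity in the other lanes.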
4260 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4261 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4262 4263 for (unsigned Part = 0; Part < UF; ++Part) { 4264 Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part); 4265 Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part); 4266 cast<PHINode>(VecRdxPhi) 4267 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4268 } 4269 4270 // Before each round, move the insertion point right between 4271 // the PHIs and the values we are going to write. 4272 // This allows us to write both PHINodes and the extractelement 4273 // instructions. 4274 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4275 4276 setDebugLocFromInst(Builder, LoopExitInst); 4277 4278 Type *PhiTy = Phi->getType(); 4279 // If tail is folded by masking, the vector value to leave the loop should be 4280 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4281 // instead of the former. For an inloop reduction the reduction will already 4282 // be predicated, and does not need to be handled here. 4283 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4284 for (unsigned Part = 0; Part < UF; ++Part) { 4285 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4286 Value *Sel = nullptr; 4287 for (User *U : VecLoopExitInst->users()) { 4288 if (isa<SelectInst>(U)) { 4289 assert(!Sel && "Reduction exit feeding two selects"); 4290 Sel = U; 4291 } else 4292 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4293 } 4294 assert(Sel && "Reduction exit feeds no select"); 4295 State.reset(LoopExitInstDef, Sel, Part); 4296 4297 // If the target can create a predicated operator for the reduction at no 4298 // extra cost in the loop (for example a predicated vadd), it can be 4299 // cheaper for the select to remain in the loop than be sunk out of it, 4300 // and so use the select value for the phi instead of the old 4301 // LoopExitValue. 4302 if (PreferPredicatedReductionSelect || 4303 TTI->preferPredicatedReductionSelect( 4304 RdxDesc.getOpcode(), PhiTy, 4305 TargetTransformInfo::ReductionFlags())) { 4306 auto *VecRdxPhi = 4307 cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part)); 4308 VecRdxPhi->setIncomingValueForBlock( 4309 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4310 } 4311 } 4312 } 4313 4314 // If the vector reduction can be performed in a smaller type, we truncate 4315 // then extend the loop exit value to enable InstCombine to evaluate the 4316 // entire expression in the smaller type. 4317 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4318 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4319 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4320 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4321 Builder.SetInsertPoint( 4322 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4323 VectorParts RdxParts(UF); 4324 for (unsigned Part = 0; Part < UF; ++Part) { 4325 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4326 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4327 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 4328 : Builder.CreateZExt(Trunc, VecTy); 4329 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4330 UI != RdxParts[Part]->user_end();) 4331 if (*UI != Trunc) { 4332 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4333 RdxParts[Part] = Extnd; 4334 } else { 4335 ++UI; 4336 } 4337 } 4338 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4339 for (unsigned Part = 0; Part < UF; ++Part) { 4340 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4341 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4342 } 4343 } 4344 4345 // Reduce all of the unrolled parts into a single vector. 4346 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4347 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4348 4349 // The middle block terminator has already been assigned a DebugLoc here (the 4350 // OrigLoop's single latch terminator). We want the whole middle block to 4351 // appear to execute on this line because: (a) it is all compiler generated, 4352 // (b) these instructions are always executed after evaluating the latch 4353 // conditional branch, and (c) other passes may add new predecessors which 4354 // terminate on this line. This is the easiest way to ensure we don't 4355 // accidentally cause an extra step back into the loop while debugging. 4356 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4357 { 4358 // Floating-point operations should have some FMF to enable the reduction. 4359 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4360 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4361 for (unsigned Part = 1; Part < UF; ++Part) { 4362 Value *RdxPart = State.get(LoopExitInstDef, Part); 4363 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4364 ReducedPartRdx = Builder.CreateBinOp( 4365 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4366 } else { 4367 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4368 } 4369 } 4370 } 4371 4372 // Create the reduction after the loop. Note that inloop reductions create the 4373 // target reduction in the loop using a Reduction recipe. 4374 if (VF.isVector() && !IsInLoopReductionPhi) { 4375 ReducedPartRdx = 4376 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4377 // If the reduction can be performed in a smaller type, we need to extend 4378 // the reduction to the wider type before we branch to the original loop. 4379 if (PhiTy != RdxDesc.getRecurrenceType()) 4380 ReducedPartRdx = RdxDesc.isSigned() 4381 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4382 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4383 } 4384 4385 // Create a phi node that merges control-flow from the backedge-taken check 4386 // block and the middle block. 4387 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4388 LoopScalarPreHeader->getTerminator()); 4389 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4390 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4391 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4392 4393 // Now, we need to fix the users of the reduction variable 4394 // inside and outside of the scalar remainder loop. 4395 4396 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4397 // in the exit blocks. See comment on analogous loop in 4398 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
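  // For example (sketch only): an exit-block LCSSA phi such as
  //   %sum.lcssa = phi i32 [ %sum.next, %loop.latch ]
  // simply gains an extra incoming value from the middle block:
  //   %sum.lcssa = phi i32 [ %sum.next, %loop.latch ], [ %rdx, %middle.block ]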
4399 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4400 if (any_of(LCSSAPhi.incoming_values(), 4401 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4402 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4403 4404 // Fix the scalar loop reduction variable with the incoming reduction sum 4405 // from the vector body and from the backedge value. 4406 int IncomingEdgeBlockIdx = 4407 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4408 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4409 // Pick the other block. 4410 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4411 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4412 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4413 } 4414 4415 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4416 VPTransformState &State) { 4417 RecurKind RK = RdxDesc.getRecurrenceKind(); 4418 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4419 return; 4420 4421 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4422 assert(LoopExitInstr && "null loop exit instruction"); 4423 SmallVector<Instruction *, 8> Worklist; 4424 SmallPtrSet<Instruction *, 8> Visited; 4425 Worklist.push_back(LoopExitInstr); 4426 Visited.insert(LoopExitInstr); 4427 4428 while (!Worklist.empty()) { 4429 Instruction *Cur = Worklist.pop_back_val(); 4430 if (isa<OverflowingBinaryOperator>(Cur)) 4431 for (unsigned Part = 0; Part < UF; ++Part) { 4432 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4433 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4434 } 4435 4436 for (User *U : Cur->users()) { 4437 Instruction *UI = cast<Instruction>(U); 4438 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4439 Visited.insert(UI).second) 4440 Worklist.push_back(UI); 4441 } 4442 } 4443 } 4444 4445 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4446 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4447 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4448 // Some phis were already hand updated by the reduction and recurrence 4449 // code above, leave them alone. 4450 continue; 4451 4452 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4453 // Non-instruction incoming values will have only one value. 4454 4455 VPLane Lane = VPLane::getFirstLane(); 4456 if (isa<Instruction>(IncomingValue) && 4457 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4458 VF)) 4459 Lane = VPLane::getLastLaneForVF(VF); 4460 4461 // Can be a loop invariant incoming value or the last scalar value to be 4462 // extracted from the vectorized loop. 4463 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4464 Value *lastIncomingValue = 4465 OrigLoop->isLoopInvariant(IncomingValue) 4466 ? IncomingValue 4467 : State.get(State.Plan->getVPValue(IncomingValue), 4468 VPIteration(UF - 1, Lane)); 4469 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4470 } 4471 } 4472 4473 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4474 // The basic block and loop containing the predicated instruction. 4475 auto *PredBB = PredInst->getParent(); 4476 auto *VectorLoop = LI->getLoopFor(PredBB); 4477 4478 // Initialize a worklist with the operands of the predicated instruction. 4479 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4480 4481 // Holds instructions that we need to analyze again. An instruction may be 4482 // reanalyzed if we don't yet know if we can sink it or not. 
4483 SmallVector<Instruction *, 8> InstsToReanalyze; 4484 4485 // Returns true if a given use occurs in the predicated block. Phi nodes use 4486 // their operands in their corresponding predecessor blocks. 4487 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4488 auto *I = cast<Instruction>(U.getUser()); 4489 BasicBlock *BB = I->getParent(); 4490 if (auto *Phi = dyn_cast<PHINode>(I)) 4491 BB = Phi->getIncomingBlock( 4492 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4493 return BB == PredBB; 4494 }; 4495 4496 // Iteratively sink the scalarized operands of the predicated instruction 4497 // into the block we created for it. When an instruction is sunk, it's 4498 // operands are then added to the worklist. The algorithm ends after one pass 4499 // through the worklist doesn't sink a single instruction. 4500 bool Changed; 4501 do { 4502 // Add the instructions that need to be reanalyzed to the worklist, and 4503 // reset the changed indicator. 4504 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4505 InstsToReanalyze.clear(); 4506 Changed = false; 4507 4508 while (!Worklist.empty()) { 4509 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4510 4511 // We can't sink an instruction if it is a phi node, is already in the 4512 // predicated block, is not in the loop, or may have side effects. 4513 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4514 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4515 continue; 4516 4517 // It's legal to sink the instruction if all its uses occur in the 4518 // predicated block. Otherwise, there's nothing to do yet, and we may 4519 // need to reanalyze the instruction. 4520 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4521 InstsToReanalyze.push_back(I); 4522 continue; 4523 } 4524 4525 // Move the instruction to the beginning of the predicated block, and add 4526 // it's operands to the worklist. 4527 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4528 Worklist.insert(I->op_begin(), I->op_end()); 4529 4530 // The sinking may have enabled other instructions to be sunk, so we will 4531 // need to iterate. 4532 Changed = true; 4533 } 4534 } while (Changed); 4535 } 4536 4537 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4538 for (PHINode *OrigPhi : OrigPHIsToFix) { 4539 VPWidenPHIRecipe *VPPhi = 4540 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4541 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4542 // Make sure the builder has a valid insert point. 4543 Builder.SetInsertPoint(NewPhi); 4544 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4545 VPValue *Inc = VPPhi->getIncomingValue(i); 4546 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4547 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4548 } 4549 } 4550 } 4551 4552 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4553 VPUser &Operands, unsigned UF, 4554 ElementCount VF, bool IsPtrLoopInvariant, 4555 SmallBitVector &IsIndexLoopInvariant, 4556 VPTransformState &State) { 4557 // Construct a vector GEP by widening the operands of the scalar GEP as 4558 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4559 // results in a vector of pointers when at least one operand of the GEP 4560 // is vector-typed. Thus, to keep the representation compact, we only use 4561 // vector-typed operands for loop-varying values. 
4562 4563 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4564 // If we are vectorizing, but the GEP has only loop-invariant operands, 4565 // the GEP we build (by only using vector-typed operands for 4566 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4567 // produce a vector of pointers, we need to either arbitrarily pick an 4568 // operand to broadcast, or broadcast a clone of the original GEP. 4569 // Here, we broadcast a clone of the original. 4570 // 4571 // TODO: If at some point we decide to scalarize instructions having 4572 // loop-invariant operands, this special case will no longer be 4573 // required. We would add the scalarization decision to 4574 // collectLoopScalars() and teach getVectorValue() to broadcast 4575 // the lane-zero scalar value. 4576 auto *Clone = Builder.Insert(GEP->clone()); 4577 for (unsigned Part = 0; Part < UF; ++Part) { 4578 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4579 State.set(VPDef, EntryPart, Part); 4580 addMetadata(EntryPart, GEP); 4581 } 4582 } else { 4583 // If the GEP has at least one loop-varying operand, we are sure to 4584 // produce a vector of pointers. But if we are only unrolling, we want 4585 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4586 // produce with the code below will be scalar (if VF == 1) or vector 4587 // (otherwise). Note that for the unroll-only case, we still maintain 4588 // values in the vector mapping with initVector, as we do for other 4589 // instructions. 4590 for (unsigned Part = 0; Part < UF; ++Part) { 4591 // The pointer operand of the new GEP. If it's loop-invariant, we 4592 // won't broadcast it. 4593 auto *Ptr = IsPtrLoopInvariant 4594 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4595 : State.get(Operands.getOperand(0), Part); 4596 4597 // Collect all the indices for the new GEP. If any index is 4598 // loop-invariant, we won't broadcast it. 4599 SmallVector<Value *, 4> Indices; 4600 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4601 VPValue *Operand = Operands.getOperand(I); 4602 if (IsIndexLoopInvariant[I - 1]) 4603 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4604 else 4605 Indices.push_back(State.get(Operand, Part)); 4606 } 4607 4608 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4609 // but it should be a vector, otherwise. 4610 auto *NewGEP = 4611 GEP->isInBounds() 4612 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4613 Indices) 4614 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4615 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4616 "NewGEP is not a pointer vector"); 4617 State.set(VPDef, NewGEP, Part); 4618 addMetadata(NewGEP, GEP); 4619 } 4620 } 4621 } 4622 4623 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4624 RecurrenceDescriptor *RdxDesc, 4625 VPValue *StartVPV, VPValue *Def, 4626 VPTransformState &State) { 4627 PHINode *P = cast<PHINode>(PN); 4628 if (EnableVPlanNativePath) { 4629 // Currently we enter here in the VPlan-native path for non-induction 4630 // PHIs where all control flow is uniform. We simply widen these PHIs. 4631 // Create a vector phi with no operands - the vector phi operands will be 4632 // set at the end of vector code generation. 4633 Type *VecTy = (State.VF.isScalar()) 4634 ? 
PN->getType() 4635 : VectorType::get(PN->getType(), State.VF); 4636 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4637 State.set(Def, VecPhi, 0); 4638 OrigPHIsToFix.push_back(P); 4639 4640 return; 4641 } 4642 4643 assert(PN->getParent() == OrigLoop->getHeader() && 4644 "Non-header phis should have been handled elsewhere"); 4645 4646 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr; 4647 // In order to support recurrences we need to be able to vectorize Phi nodes. 4648 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4649 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4650 // this value when we vectorize all of the instructions that use the PHI. 4651 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { 4652 Value *Iden = nullptr; 4653 bool ScalarPHI = 4654 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4655 Type *VecTy = 4656 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); 4657 4658 if (RdxDesc) { 4659 assert(Legal->isReductionVariable(P) && StartV && 4660 "RdxDesc should only be set for reduction variables; in that case " 4661 "a StartV is also required"); 4662 RecurKind RK = RdxDesc->getRecurrenceKind(); 4663 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 4664 // MinMax reductions have the start value as their identity. 4665 if (ScalarPHI) { 4666 Iden = StartV; 4667 } else { 4668 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4669 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4670 StartV = Iden = 4671 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); 4672 } 4673 } else { 4674 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4675 RK, VecTy->getScalarType()); 4676 Iden = IdenC; 4677 4678 if (!ScalarPHI) { 4679 Iden = ConstantVector::getSplat(State.VF, IdenC); 4680 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4681 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4682 Constant *Zero = Builder.getInt32(0); 4683 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4684 } 4685 } 4686 } 4687 4688 for (unsigned Part = 0; Part < State.UF; ++Part) { 4689 // This is phase one of vectorizing PHIs. 4690 Value *EntryPart = PHINode::Create( 4691 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4692 State.set(Def, EntryPart, Part); 4693 if (StartV) { 4694 // Make sure to add the reduction start value only to the 4695 // first unroll part. 4696 Value *StartVal = (Part == 0) ? StartV : Iden; 4697 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4698 } 4699 } 4700 return; 4701 } 4702 4703 assert(!Legal->isReductionVariable(P) && 4704 "reductions should be handled above"); 4705 4706 setDebugLocFromInst(Builder, P); 4707 4708 // This PHINode must be an induction variable. 4709 // Make sure that we know about it. 4710 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4711 4712 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4713 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4714 4715 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4716 // which can be found from the original scalar operations.
4717 switch (II.getKind()) { 4718 case InductionDescriptor::IK_NoInduction: 4719 llvm_unreachable("Unknown induction"); 4720 case InductionDescriptor::IK_IntInduction: 4721 case InductionDescriptor::IK_FpInduction: 4722 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4723 case InductionDescriptor::IK_PtrInduction: { 4724 // Handle the pointer induction variable case. 4725 assert(P->getType()->isPointerTy() && "Unexpected type."); 4726 4727 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4728 // This is the normalized GEP that starts counting at zero. 4729 Value *PtrInd = 4730 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4731 // Determine the number of scalars we need to generate for each unroll 4732 // iteration. If the instruction is uniform, we only need to generate the 4733 // first lane. Otherwise, we generate all VF values. 4734 unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF) 4735 ? 1 4736 : State.VF.getKnownMinValue(); 4737 for (unsigned Part = 0; Part < UF; ++Part) { 4738 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4739 Constant *Idx = ConstantInt::get( 4740 PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue()); 4741 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4742 Value *SclrGep = 4743 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4744 SclrGep->setName("next.gep"); 4745 State.set(Def, SclrGep, VPIteration(Part, Lane)); 4746 } 4747 } 4748 return; 4749 } 4750 assert(isa<SCEVConstant>(II.getStep()) && 4751 "Induction step not a SCEV constant!"); 4752 Type *PhiType = II.getStep()->getType(); 4753 4754 // Build a pointer phi 4755 Value *ScalarStartValue = II.getStartValue(); 4756 Type *ScStValueType = ScalarStartValue->getType(); 4757 PHINode *NewPointerPhi = 4758 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4759 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4760 4761 // A pointer induction, performed by using a gep 4762 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4763 Instruction *InductionLoc = LoopLatch->getTerminator(); 4764 const SCEV *ScalarStep = II.getStep(); 4765 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4766 Value *ScalarStepValue = 4767 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4768 Value *InductionGEP = GetElementPtrInst::Create( 4769 ScStValueType->getPointerElementType(), NewPointerPhi, 4770 Builder.CreateMul( 4771 ScalarStepValue, 4772 ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)), 4773 "ptr.ind", InductionLoc); 4774 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4775 4776 // Create UF many actual address geps that use the pointer 4777 // phi as base and a vectorized version of the step value 4778 // (<step*0, ..., step*N>) as offset. 4779 for (unsigned Part = 0; Part < State.UF; ++Part) { 4780 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4781 Value *StartOffset = 4782 ConstantInt::get(VecPhiType, Part * State.VF.getKnownMinValue()); 4783 // Create a vector of consecutive numbers from zero to VF. 
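      // e.g. (sketch) for VF = 4 and Part = 1: <4, 4, 4, 4> + <0, 1, 2, 3>
      // produces the lane offsets <4, 5, 6, 7> used below.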
4784 StartOffset = 4785 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4786 4787 Value *GEP = Builder.CreateGEP( 4788 ScStValueType->getPointerElementType(), NewPointerPhi, 4789 Builder.CreateMul(StartOffset, 4790 Builder.CreateVectorSplat( 4791 State.VF.getKnownMinValue(), ScalarStepValue), 4792 "vector.gep")); 4793 State.set(Def, GEP, Part); 4794 } 4795 } 4796 } 4797 } 4798 4799 /// A helper function for checking whether an integer division-related 4800 /// instruction may divide by zero (in which case it must be predicated if 4801 /// executed conditionally in the scalar code). 4802 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4803 /// Non-zero divisors that are non compile-time constants will not be 4804 /// converted into multiplication, so we will still end up scalarizing 4805 /// the division, but can do so w/o predication. 4806 static bool mayDivideByZero(Instruction &I) { 4807 assert((I.getOpcode() == Instruction::UDiv || 4808 I.getOpcode() == Instruction::SDiv || 4809 I.getOpcode() == Instruction::URem || 4810 I.getOpcode() == Instruction::SRem) && 4811 "Unexpected instruction"); 4812 Value *Divisor = I.getOperand(1); 4813 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4814 return !CInt || CInt->isZero(); 4815 } 4816 4817 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4818 VPUser &User, 4819 VPTransformState &State) { 4820 switch (I.getOpcode()) { 4821 case Instruction::Call: 4822 case Instruction::Br: 4823 case Instruction::PHI: 4824 case Instruction::GetElementPtr: 4825 case Instruction::Select: 4826 llvm_unreachable("This instruction is handled by a different recipe."); 4827 case Instruction::UDiv: 4828 case Instruction::SDiv: 4829 case Instruction::SRem: 4830 case Instruction::URem: 4831 case Instruction::Add: 4832 case Instruction::FAdd: 4833 case Instruction::Sub: 4834 case Instruction::FSub: 4835 case Instruction::FNeg: 4836 case Instruction::Mul: 4837 case Instruction::FMul: 4838 case Instruction::FDiv: 4839 case Instruction::FRem: 4840 case Instruction::Shl: 4841 case Instruction::LShr: 4842 case Instruction::AShr: 4843 case Instruction::And: 4844 case Instruction::Or: 4845 case Instruction::Xor: { 4846 // Just widen unops and binops. 4847 setDebugLocFromInst(Builder, &I); 4848 4849 for (unsigned Part = 0; Part < UF; ++Part) { 4850 SmallVector<Value *, 2> Ops; 4851 for (VPValue *VPOp : User.operands()) 4852 Ops.push_back(State.get(VPOp, Part)); 4853 4854 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4855 4856 if (auto *VecOp = dyn_cast<Instruction>(V)) 4857 VecOp->copyIRFlags(&I); 4858 4859 // Use this vector value for all users of the original instruction. 4860 State.set(Def, V, Part); 4861 addMetadata(V, &I); 4862 } 4863 4864 break; 4865 } 4866 case Instruction::ICmp: 4867 case Instruction::FCmp: { 4868 // Widen compares. Generate vector compares. 4869 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4870 auto *Cmp = cast<CmpInst>(&I); 4871 setDebugLocFromInst(Builder, Cmp); 4872 for (unsigned Part = 0; Part < UF; ++Part) { 4873 Value *A = State.get(User.getOperand(0), Part); 4874 Value *B = State.get(User.getOperand(1), Part); 4875 Value *C = nullptr; 4876 if (FCmp) { 4877 // Propagate fast math flags. 
4878 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4879 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4880 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4881 } else { 4882 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4883 } 4884 State.set(Def, C, Part); 4885 addMetadata(C, &I); 4886 } 4887 4888 break; 4889 } 4890 4891 case Instruction::ZExt: 4892 case Instruction::SExt: 4893 case Instruction::FPToUI: 4894 case Instruction::FPToSI: 4895 case Instruction::FPExt: 4896 case Instruction::PtrToInt: 4897 case Instruction::IntToPtr: 4898 case Instruction::SIToFP: 4899 case Instruction::UIToFP: 4900 case Instruction::Trunc: 4901 case Instruction::FPTrunc: 4902 case Instruction::BitCast: { 4903 auto *CI = cast<CastInst>(&I); 4904 setDebugLocFromInst(Builder, CI); 4905 4906 /// Vectorize casts. 4907 Type *DestTy = 4908 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4909 4910 for (unsigned Part = 0; Part < UF; ++Part) { 4911 Value *A = State.get(User.getOperand(0), Part); 4912 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4913 State.set(Def, Cast, Part); 4914 addMetadata(Cast, &I); 4915 } 4916 break; 4917 } 4918 default: 4919 // This instruction is not vectorized by simple widening. 4920 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4921 llvm_unreachable("Unhandled instruction!"); 4922 } // end of switch. 4923 } 4924 4925 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4926 VPUser &ArgOperands, 4927 VPTransformState &State) { 4928 assert(!isa<DbgInfoIntrinsic>(I) && 4929 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4930 setDebugLocFromInst(Builder, &I); 4931 4932 Module *M = I.getParent()->getParent()->getParent(); 4933 auto *CI = cast<CallInst>(&I); 4934 4935 SmallVector<Type *, 4> Tys; 4936 for (Value *ArgOperand : CI->arg_operands()) 4937 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4938 4939 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4940 4941 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4942 // version of the instruction. 4943 // Is it beneficial to perform intrinsic call compared to lib call? 4944 bool NeedToScalarize = false; 4945 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4946 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4947 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4948 assert((UseVectorIntrinsic || !NeedToScalarize) && 4949 "Instruction should be scalarized elsewhere."); 4950 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4951 "Cannot have invalid costs while widening"); 4952 4953 for (unsigned Part = 0; Part < UF; ++Part) { 4954 SmallVector<Value *, 4> Args; 4955 for (auto &I : enumerate(ArgOperands.operands())) { 4956 // Some intrinsics have a scalar argument - don't replace it with a 4957 // vector. 4958 Value *Arg; 4959 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4960 Arg = State.get(I.value(), Part); 4961 else 4962 Arg = State.get(I.value(), VPIteration(0, 0)); 4963 Args.push_back(Arg); 4964 } 4965 4966 Function *VectorF; 4967 if (UseVectorIntrinsic) { 4968 // Use vector version of the intrinsic. 
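      // e.g. (sketch): with VF = 4, a scalar call to llvm.fabs.f32 is widened
      // here into a call to llvm.fabs.v4f32.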
4969 Type *TysForDecl[] = {CI->getType()}; 4970 if (VF.isVector()) 4971 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4972 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4973 assert(VectorF && "Can't retrieve vector intrinsic."); 4974 } else { 4975 // Use vector version of the function call. 4976 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4977 #ifndef NDEBUG 4978 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4979 "Can't create vector function."); 4980 #endif 4981 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4982 } 4983 SmallVector<OperandBundleDef, 1> OpBundles; 4984 CI->getOperandBundlesAsDefs(OpBundles); 4985 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4986 4987 if (isa<FPMathOperator>(V)) 4988 V->copyFastMathFlags(CI); 4989 4990 State.set(Def, V, Part); 4991 addMetadata(V, &I); 4992 } 4993 } 4994 4995 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4996 VPUser &Operands, 4997 bool InvariantCond, 4998 VPTransformState &State) { 4999 setDebugLocFromInst(Builder, &I); 5000 5001 // The condition can be loop invariant but still defined inside the 5002 // loop. This means that we can't just use the original 'cond' value. 5003 // We have to take the 'vectorized' value and pick the first lane. 5004 // Instcombine will make this a no-op. 5005 auto *InvarCond = InvariantCond 5006 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5007 : nullptr; 5008 5009 for (unsigned Part = 0; Part < UF; ++Part) { 5010 Value *Cond = 5011 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5012 Value *Op0 = State.get(Operands.getOperand(1), Part); 5013 Value *Op1 = State.get(Operands.getOperand(2), Part); 5014 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5015 State.set(VPDef, Sel, Part); 5016 addMetadata(Sel, &I); 5017 } 5018 } 5019 5020 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5021 // We should not collect Scalars more than once per VF. Right now, this 5022 // function is called from collectUniformsAndScalars(), which already does 5023 // this check. Collecting Scalars for VF=1 does not make any sense. 5024 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5025 "This function should not be visited twice for the same VF"); 5026 5027 SmallSetVector<Instruction *, 8> Worklist; 5028 5029 // These sets are used to seed the analysis with pointers used by memory 5030 // accesses that will remain scalar. 5031 SmallSetVector<Instruction *, 8> ScalarPtrs; 5032 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5033 auto *Latch = TheLoop->getLoopLatch(); 5034 5035 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5036 // The pointer operands of loads and stores will be scalar as long as the 5037 // memory access is not a gather or scatter operation. The value operand of a 5038 // store will remain scalar if the store is scalarized. 
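  // For example (sketch): in "store i32 %v, i32* %p", if the store is widened
  // to a consecutive vector store, %p is only needed to form the lane-zero
  // address and therefore remains scalar; if the store became a scatter, %p
  // would instead be needed as a vector of pointers.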
5039 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5040 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5041 assert(WideningDecision != CM_Unknown && 5042 "Widening decision should be ready at this moment"); 5043 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5044 if (Ptr == Store->getValueOperand()) 5045 return WideningDecision == CM_Scalarize; 5046 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5047 "Ptr is neither a value nor a pointer operand"); 5048 return WideningDecision != CM_GatherScatter; 5049 }; 5050 5051 // A helper that returns true if the given value is a bitcast or 5052 // getelementptr instruction contained in the loop. 5053 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5054 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5055 isa<GetElementPtrInst>(V)) && 5056 !TheLoop->isLoopInvariant(V); 5057 }; 5058 5059 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5060 if (!isa<PHINode>(Ptr) || 5061 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5062 return false; 5063 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5064 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5065 return false; 5066 return isScalarUse(MemAccess, Ptr); 5067 }; 5068 5069 // A helper that evaluates a memory access's use of a pointer. If the 5070 // pointer is actually the pointer induction of a loop, it is inserted 5071 // into the Worklist. If the use will be a scalar use, and the 5072 // pointer is only used by memory accesses, we place the pointer in 5073 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5074 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5075 if (isScalarPtrInduction(MemAccess, Ptr)) { 5076 Worklist.insert(cast<Instruction>(Ptr)); 5077 Instruction *Update = cast<Instruction>( 5078 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5079 Worklist.insert(Update); 5080 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5081 << "\n"); 5082 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5083 << "\n"); 5084 return; 5085 } 5086 // We only care about bitcast and getelementptr instructions contained in 5087 // the loop. 5088 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5089 return; 5090 5091 // If the pointer has already been identified as scalar (e.g., if it was 5092 // also identified as uniform), there's nothing to do. 5093 auto *I = cast<Instruction>(Ptr); 5094 if (Worklist.count(I)) 5095 return; 5096 5097 // If the use of the pointer will be a scalar use, and all users of the 5098 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5099 // place the pointer in PossibleNonScalarPtrs. 5100 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5101 return isa<LoadInst>(U) || isa<StoreInst>(U); 5102 })) 5103 ScalarPtrs.insert(I); 5104 else 5105 PossibleNonScalarPtrs.insert(I); 5106 }; 5107 5108 // We seed the scalars analysis with two classes of instructions: (1) 5109 // instructions marked uniform-after-vectorization and (2) bitcast, 5110 // getelementptr and (pointer) phi instructions used by memory accesses 5111 // requiring a scalar use. 5112 // 5113 // (1) Add to the worklist all instructions that have been identified as 5114 // uniform-after-vectorization.
5115 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5116 5117 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5118 // memory accesses requiring a scalar use. The pointer operands of loads and 5119 // stores will be scalar as long as the memory accesses is not a gather or 5120 // scatter operation. The value operand of a store will remain scalar if the 5121 // store is scalarized. 5122 for (auto *BB : TheLoop->blocks()) 5123 for (auto &I : *BB) { 5124 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5125 evaluatePtrUse(Load, Load->getPointerOperand()); 5126 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5127 evaluatePtrUse(Store, Store->getPointerOperand()); 5128 evaluatePtrUse(Store, Store->getValueOperand()); 5129 } 5130 } 5131 for (auto *I : ScalarPtrs) 5132 if (!PossibleNonScalarPtrs.count(I)) { 5133 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5134 Worklist.insert(I); 5135 } 5136 5137 // Insert the forced scalars. 5138 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5139 // induction variable when the PHI user is scalarized. 5140 auto ForcedScalar = ForcedScalars.find(VF); 5141 if (ForcedScalar != ForcedScalars.end()) 5142 for (auto *I : ForcedScalar->second) 5143 Worklist.insert(I); 5144 5145 // Expand the worklist by looking through any bitcasts and getelementptr 5146 // instructions we've already identified as scalar. This is similar to the 5147 // expansion step in collectLoopUniforms(); however, here we're only 5148 // expanding to include additional bitcasts and getelementptr instructions. 5149 unsigned Idx = 0; 5150 while (Idx != Worklist.size()) { 5151 Instruction *Dst = Worklist[Idx++]; 5152 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5153 continue; 5154 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5155 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5156 auto *J = cast<Instruction>(U); 5157 return !TheLoop->contains(J) || Worklist.count(J) || 5158 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5159 isScalarUse(J, Src)); 5160 })) { 5161 Worklist.insert(Src); 5162 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5163 } 5164 } 5165 5166 // An induction variable will remain scalar if all users of the induction 5167 // variable and induction variable update remain scalar. 5168 for (auto &Induction : Legal->getInductionVars()) { 5169 auto *Ind = Induction.first; 5170 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5171 5172 // If tail-folding is applied, the primary induction variable will be used 5173 // to feed a vector compare. 5174 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5175 continue; 5176 5177 // Determine if all users of the induction variable are scalar after 5178 // vectorization. 5179 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5180 auto *I = cast<Instruction>(U); 5181 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5182 }); 5183 if (!ScalarInd) 5184 continue; 5185 5186 // Determine if all users of the induction variable update instruction are 5187 // scalar after vectorization. 5188 auto ScalarIndUpdate = 5189 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5190 auto *I = cast<Instruction>(U); 5191 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5192 }); 5193 if (!ScalarIndUpdate) 5194 continue; 5195 5196 // The induction variable and its update instruction will remain scalar. 
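    // e.g. (sketch): for "i = phi [0, ph], [i.next, latch]; i.next = i + 1",
    // if every in-loop user of i and i.next is already known to be scalar
    // (typically scalarized address computations), both remain scalar here.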
5197 Worklist.insert(Ind); 5198 Worklist.insert(IndUpdate); 5199 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5200 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5201 << "\n"); 5202 } 5203 5204 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5205 } 5206 5207 bool LoopVectorizationCostModel::isScalarWithPredication( 5208 Instruction *I, ElementCount VF) const { 5209 if (!blockNeedsPredication(I->getParent())) 5210 return false; 5211 switch(I->getOpcode()) { 5212 default: 5213 break; 5214 case Instruction::Load: 5215 case Instruction::Store: { 5216 if (!Legal->isMaskRequired(I)) 5217 return false; 5218 auto *Ptr = getLoadStorePointerOperand(I); 5219 auto *Ty = getMemInstValueType(I); 5220 // We have already decided how to vectorize this instruction, get that 5221 // result. 5222 if (VF.isVector()) { 5223 InstWidening WideningDecision = getWideningDecision(I, VF); 5224 assert(WideningDecision != CM_Unknown && 5225 "Widening decision should be ready at this moment"); 5226 return WideningDecision == CM_Scalarize; 5227 } 5228 const Align Alignment = getLoadStoreAlignment(I); 5229 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5230 isLegalMaskedGather(Ty, Alignment)) 5231 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5232 isLegalMaskedScatter(Ty, Alignment)); 5233 } 5234 case Instruction::UDiv: 5235 case Instruction::SDiv: 5236 case Instruction::SRem: 5237 case Instruction::URem: 5238 return mayDivideByZero(*I); 5239 } 5240 return false; 5241 } 5242 5243 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5244 Instruction *I, ElementCount VF) { 5245 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5246 assert(getWideningDecision(I, VF) == CM_Unknown && 5247 "Decision should not be set yet."); 5248 auto *Group = getInterleavedAccessGroup(I); 5249 assert(Group && "Must have a group."); 5250 5251 // If the instruction's allocated size doesn't equal it's type size, it 5252 // requires padding and will be scalarized. 5253 auto &DL = I->getModule()->getDataLayout(); 5254 auto *ScalarTy = getMemInstValueType(I); 5255 if (hasIrregularType(ScalarTy, DL)) 5256 return false; 5257 5258 // Check if masking is required. 5259 // A Group may need masking for one of two reasons: it resides in a block that 5260 // needs predication, or it was decided to use masking to deal with gaps. 5261 bool PredicatedAccessRequiresMasking = 5262 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5263 bool AccessWithGapsRequiresMasking = 5264 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5265 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5266 return true; 5267 5268 // If masked interleaving is required, we expect that the user/target had 5269 // enabled it, because otherwise it either wouldn't have been created or 5270 // it should have been invalidated by the CostModel. 5271 assert(useMaskedInterleavedAccesses(TTI) && 5272 "Masked interleave-groups for predicated accesses are not enabled."); 5273 5274 auto *Ty = getMemInstValueType(I); 5275 const Align Alignment = getLoadStoreAlignment(I); 5276 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5277 : TTI.isLegalMaskedStore(Ty, Alignment); 5278 } 5279 5280 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5281 Instruction *I, ElementCount VF) { 5282 // Get and ensure we have a valid memory instruction. 
5283 LoadInst *LI = dyn_cast<LoadInst>(I); 5284 StoreInst *SI = dyn_cast<StoreInst>(I); 5285 assert((LI || SI) && "Invalid memory instruction"); 5286 5287 auto *Ptr = getLoadStorePointerOperand(I); 5288 5289 // In order to be widened, the pointer should be consecutive, first of all. 5290 if (!Legal->isConsecutivePtr(Ptr)) 5291 return false; 5292 5293 // If the instruction is a store located in a predicated block, it will be 5294 // scalarized. 5295 if (isScalarWithPredication(I)) 5296 return false; 5297 5298 // If the instruction's allocated size doesn't equal it's type size, it 5299 // requires padding and will be scalarized. 5300 auto &DL = I->getModule()->getDataLayout(); 5301 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5302 if (hasIrregularType(ScalarTy, DL)) 5303 return false; 5304 5305 return true; 5306 } 5307 5308 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5309 // We should not collect Uniforms more than once per VF. Right now, 5310 // this function is called from collectUniformsAndScalars(), which 5311 // already does this check. Collecting Uniforms for VF=1 does not make any 5312 // sense. 5313 5314 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5315 "This function should not be visited twice for the same VF"); 5316 5317 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5318 // not analyze again. Uniforms.count(VF) will return 1. 5319 Uniforms[VF].clear(); 5320 5321 // We now know that the loop is vectorizable! 5322 // Collect instructions inside the loop that will remain uniform after 5323 // vectorization. 5324 5325 // Global values, params and instructions outside of current loop are out of 5326 // scope. 5327 auto isOutOfScope = [&](Value *V) -> bool { 5328 Instruction *I = dyn_cast<Instruction>(V); 5329 return (!I || !TheLoop->contains(I)); 5330 }; 5331 5332 SetVector<Instruction *> Worklist; 5333 BasicBlock *Latch = TheLoop->getLoopLatch(); 5334 5335 // Instructions that are scalar with predication must not be considered 5336 // uniform after vectorization, because that would create an erroneous 5337 // replicating region where only a single instance out of VF should be formed. 5338 // TODO: optimize such seldom cases if found important, see PR40816. 5339 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5340 if (isOutOfScope(I)) { 5341 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5342 << *I << "\n"); 5343 return; 5344 } 5345 if (isScalarWithPredication(I, VF)) { 5346 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5347 << *I << "\n"); 5348 return; 5349 } 5350 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5351 Worklist.insert(I); 5352 }; 5353 5354 // Start with the conditional branch. If the branch condition is an 5355 // instruction contained in the loop that is only used by the branch, it is 5356 // uniform. 5357 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5358 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5359 addToWorklistIfAllowed(Cmp); 5360 5361 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5362 InstWidening WideningDecision = getWideningDecision(I, VF); 5363 assert(WideningDecision != CM_Unknown && 5364 "Widening decision should be ready at this moment"); 5365 5366 // A uniform memory op is itself uniform. We exclude uniform stores 5367 // here as they demand the last lane, not the first one. 
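    // e.g. (sketch): a load from a loop-invariant address, such as "x = *p"
    // inside the loop, is a uniform memory op; every lane would read the same
    // value, so only lane 0 is actually needed.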
5368 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5369 assert(WideningDecision == CM_Scalarize); 5370 return true; 5371 } 5372 5373 return (WideningDecision == CM_Widen || 5374 WideningDecision == CM_Widen_Reverse || 5375 WideningDecision == CM_Interleave); 5376 }; 5377 5378 5379 // Returns true if Ptr is the pointer operand of a memory access instruction 5380 // I, and I is known to not require scalarization. 5381 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5382 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5383 }; 5384 5385 // Holds a list of values which are known to have at least one uniform use. 5386 // Note that there may be other uses which aren't uniform. A "uniform use" 5387 // here is something which only demands lane 0 of the unrolled iterations; 5388 // it does not imply that all lanes produce the same value (e.g. this is not 5389 // the usual meaning of uniform) 5390 SmallPtrSet<Value *, 8> HasUniformUse; 5391 5392 // Scan the loop for instructions which are either a) known to have only 5393 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5394 for (auto *BB : TheLoop->blocks()) 5395 for (auto &I : *BB) { 5396 // If there's no pointer operand, there's nothing to do. 5397 auto *Ptr = getLoadStorePointerOperand(&I); 5398 if (!Ptr) 5399 continue; 5400 5401 // A uniform memory op is itself uniform. We exclude uniform stores 5402 // here as they demand the last lane, not the first one. 5403 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5404 addToWorklistIfAllowed(&I); 5405 5406 if (isUniformDecision(&I, VF)) { 5407 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5408 HasUniformUse.insert(Ptr); 5409 } 5410 } 5411 5412 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5413 // demanding) users. Since loops are assumed to be in LCSSA form, this 5414 // disallows uses outside the loop as well. 5415 for (auto *V : HasUniformUse) { 5416 if (isOutOfScope(V)) 5417 continue; 5418 auto *I = cast<Instruction>(V); 5419 auto UsersAreMemAccesses = 5420 llvm::all_of(I->users(), [&](User *U) -> bool { 5421 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5422 }); 5423 if (UsersAreMemAccesses) 5424 addToWorklistIfAllowed(I); 5425 } 5426 5427 // Expand Worklist in topological order: whenever a new instruction 5428 // is added , its users should be already inside Worklist. It ensures 5429 // a uniform instruction will only be used by uniform instructions. 5430 unsigned idx = 0; 5431 while (idx != Worklist.size()) { 5432 Instruction *I = Worklist[idx++]; 5433 5434 for (auto OV : I->operand_values()) { 5435 // isOutOfScope operands cannot be uniform instructions. 5436 if (isOutOfScope(OV)) 5437 continue; 5438 // First order recurrence Phi's should typically be considered 5439 // non-uniform. 5440 auto *OP = dyn_cast<PHINode>(OV); 5441 if (OP && Legal->isFirstOrderRecurrence(OP)) 5442 continue; 5443 // If all the users of the operand are uniform, then add the 5444 // operand into the uniform worklist. 5445 auto *OI = cast<Instruction>(OV); 5446 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5447 auto *J = cast<Instruction>(U); 5448 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5449 })) 5450 addToWorklistIfAllowed(OI); 5451 } 5452 } 5453 5454 // For an instruction to be added into Worklist above, all its users inside 5455 // the loop should also be in Worklist. 
However, this condition cannot be 5456 // true for phi nodes that form a cyclic dependence. We must process phi 5457 // nodes separately. An induction variable will remain uniform if all users 5458 // of the induction variable and induction variable update remain uniform. 5459 // The code below handles both pointer and non-pointer induction variables. 5460 for (auto &Induction : Legal->getInductionVars()) { 5461 auto *Ind = Induction.first; 5462 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5463 5464 // Determine if all users of the induction variable are uniform after 5465 // vectorization. 5466 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5467 auto *I = cast<Instruction>(U); 5468 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5469 isVectorizedMemAccessUse(I, Ind); 5470 }); 5471 if (!UniformInd) 5472 continue; 5473 5474 // Determine if all users of the induction variable update instruction are 5475 // uniform after vectorization. 5476 auto UniformIndUpdate = 5477 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5478 auto *I = cast<Instruction>(U); 5479 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5480 isVectorizedMemAccessUse(I, IndUpdate); 5481 }); 5482 if (!UniformIndUpdate) 5483 continue; 5484 5485 // The induction variable and its update instruction will remain uniform. 5486 addToWorklistIfAllowed(Ind); 5487 addToWorklistIfAllowed(IndUpdate); 5488 } 5489 5490 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5491 } 5492 5493 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5494 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5495 5496 if (Legal->getRuntimePointerChecking()->Need) { 5497 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5498 "runtime pointer checks needed. Enable vectorization of this " 5499 "loop with '#pragma clang loop vectorize(enable)' when " 5500 "compiling with -Os/-Oz", 5501 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5502 return true; 5503 } 5504 5505 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5506 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5507 "runtime SCEV checks needed. Enable vectorization of this " 5508 "loop with '#pragma clang loop vectorize(enable)' when " 5509 "compiling with -Os/-Oz", 5510 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5511 return true; 5512 } 5513 5514 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5515 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5516 reportVectorizationFailure("Runtime stride check for small trip count", 5517 "runtime stride == 1 checks needed. Enable vectorization of " 5518 "this loop without such check by compiling with -Os/-Oz", 5519 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5520 return true; 5521 } 5522 5523 return false; 5524 } 5525 5526 Optional<ElementCount> 5527 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5528 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5529 // TODO: It may by useful to do since it's still likely to be dynamically 5530 // uniform if the target can skip. 5531 reportVectorizationFailure( 5532 "Not inserting runtime ptr check for divergent target", 5533 "runtime pointer checks needed. 
Not enabled for divergent target", 5534 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5535 return None; 5536 } 5537 5538 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5539 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5540 if (TC == 1) { 5541 reportVectorizationFailure("Single iteration (non) loop", 5542 "loop trip count is one, irrelevant for vectorization", 5543 "SingleIterationLoop", ORE, TheLoop); 5544 return None; 5545 } 5546 5547 switch (ScalarEpilogueStatus) { 5548 case CM_ScalarEpilogueAllowed: 5549 return computeFeasibleMaxVF(TC, UserVF); 5550 case CM_ScalarEpilogueNotAllowedUsePredicate: 5551 LLVM_FALLTHROUGH; 5552 case CM_ScalarEpilogueNotNeededUsePredicate: 5553 LLVM_DEBUG( 5554 dbgs() << "LV: vector predicate hint/switch found.\n" 5555 << "LV: Not allowing scalar epilogue, creating predicated " 5556 << "vector loop.\n"); 5557 break; 5558 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5559 // fallthrough as a special case of OptForSize 5560 case CM_ScalarEpilogueNotAllowedOptSize: 5561 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5562 LLVM_DEBUG( 5563 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5564 else 5565 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5566 << "count.\n"); 5567 5568 // Bail if runtime checks are required, which are not good when optimising 5569 // for size. 5570 if (runtimeChecksRequired()) 5571 return None; 5572 5573 break; 5574 } 5575 5576 // The only loops we can vectorize without a scalar epilogue, are loops with 5577 // a bottom-test and a single exiting block. We'd have to handle the fact 5578 // that not every instruction executes on the last iteration. This will 5579 // require a lane mask which varies through the vector loop body. (TODO) 5580 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5581 // If there was a tail-folding hint/switch, but we can't fold the tail by 5582 // masking, fallback to a vectorization with a scalar epilogue. 5583 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5584 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5585 "scalar epilogue instead.\n"); 5586 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5587 return computeFeasibleMaxVF(TC, UserVF); 5588 } 5589 return None; 5590 } 5591 5592 // Now try the tail folding 5593 5594 // Invalidate interleave groups that require an epilogue if we can't mask 5595 // the interleave-group. 5596 if (!useMaskedInterleavedAccesses(TTI)) { 5597 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5598 "No decisions should have been taken at this point"); 5599 // Note: There is no need to invalidate any cost modeling decisions here, as 5600 // non where taken so far. 5601 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5602 } 5603 5604 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); 5605 assert(!MaxVF.isScalable() && 5606 "Scalable vectors do not yet support tail folding"); 5607 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && 5608 "MaxVF must be a power of 2"); 5609 unsigned MaxVFtimesIC = 5610 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); 5611 // Avoid tail folding if the trip count is known to be a multiple of any VF we 5612 // chose. 
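  // Illustrative example (numbers are hypothetical): with MaxVF = 8 and
  // UserIC = 2, MaxVFtimesIC is 16; if the exit count is known to be 64,
  // then 64 urem 16 == 0, so no tail remains and MaxVF is accepted below
  // without folding.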
5613 ScalarEvolution *SE = PSE.getSE(); 5614 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5615 const SCEV *ExitCount = SE->getAddExpr( 5616 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5617 const SCEV *Rem = SE->getURemExpr( 5618 SE->applyLoopGuards(ExitCount, TheLoop), 5619 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5620 if (Rem->isZero()) { 5621 // Accept MaxVF if we do not have a tail. 5622 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5623 return MaxVF; 5624 } 5625 5626 // If we don't know the precise trip count, or if the trip count that we 5627 // found modulo the vectorization factor is not zero, try to fold the tail 5628 // by masking. 5629 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5630 if (Legal->prepareToFoldTailByMasking()) { 5631 FoldTailByMasking = true; 5632 return MaxVF; 5633 } 5634 5635 // If there was a tail-folding hint/switch, but we can't fold the tail by 5636 // masking, fallback to a vectorization with a scalar epilogue. 5637 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5638 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5639 "scalar epilogue instead.\n"); 5640 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5641 return MaxVF; 5642 } 5643 5644 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5645 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5646 return None; 5647 } 5648 5649 if (TC == 0) { 5650 reportVectorizationFailure( 5651 "Unable to calculate the loop count due to complex control flow", 5652 "unable to calculate the loop count due to complex control flow", 5653 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5654 return None; 5655 } 5656 5657 reportVectorizationFailure( 5658 "Cannot optimize for size and vectorize at the same time.", 5659 "cannot optimize for size and vectorize at the same time. " 5660 "Enable vectorization of this loop with '#pragma clang loop " 5661 "vectorize(enable)' when compiling with -Os/-Oz", 5662 "NoTailLoopWithOptForSize", ORE, TheLoop); 5663 return None; 5664 } 5665 5666 ElementCount 5667 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5668 ElementCount UserVF) { 5669 bool IgnoreScalableUserVF = UserVF.isScalable() && 5670 !TTI.supportsScalableVectors() && 5671 !ForceTargetSupportsScalableVectors; 5672 if (IgnoreScalableUserVF) { 5673 LLVM_DEBUG( 5674 dbgs() << "LV: Ignoring VF=" << UserVF 5675 << " because target does not support scalable vectors.\n"); 5676 ORE->emit([&]() { 5677 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5678 TheLoop->getStartLoc(), 5679 TheLoop->getHeader()) 5680 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5681 << " because target does not support scalable vectors."; 5682 }); 5683 } 5684 5685 // Beyond this point two scenarios are handled. If UserVF isn't specified 5686 // then a suitable VF is chosen. If UserVF is specified and there are 5687 // dependencies, check if it's legal. However, if a UserVF is specified and 5688 // there are no dependencies, then there's nothing to do. 5689 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5690 if (!canVectorizeReductions(UserVF)) { 5691 reportVectorizationFailure( 5692 "LV: Scalable vectorization not supported for the reduction " 5693 "operations found in this loop. 
Using fixed-width " 5694 "vectorization instead.", 5695 "Scalable vectorization not supported for the reduction operations " 5696 "found in this loop. Using fixed-width vectorization instead.", 5697 "ScalableVFUnfeasible", ORE, TheLoop); 5698 return computeFeasibleMaxVF( 5699 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5700 } 5701 5702 if (Legal->isSafeForAnyVectorWidth()) 5703 return UserVF; 5704 } 5705 5706 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5707 unsigned SmallestType, WidestType; 5708 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5709 unsigned WidestRegister = 5710 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 5711 .getFixedSize(); 5712 5713 // Get the maximum safe dependence distance in bits computed by LAA. 5714 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5715 // the memory accesses that is most restrictive (involved in the smallest 5716 // dependence distance). 5717 unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); 5718 5719 // If the user vectorization factor is legally unsafe, clamp it to a safe 5720 // value. Otherwise, return as is. 5721 if (UserVF.isNonZero() && !IgnoreScalableUserVF) { 5722 unsigned MaxSafeElements = 5723 PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); 5724 ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements); 5725 5726 if (UserVF.isScalable()) { 5727 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5728 5729 // Scale VF by vscale before checking if it's safe. 5730 MaxSafeVF = ElementCount::getScalable( 5731 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5732 5733 if (MaxSafeVF.isZero()) { 5734 // The dependence distance is too small to use scalable vectors, 5735 // fallback on fixed. 5736 LLVM_DEBUG( 5737 dbgs() 5738 << "LV: Max legal vector width too small, scalable vectorization " 5739 "unfeasible. Using fixed-width vectorization instead.\n"); 5740 ORE->emit([&]() { 5741 return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible", 5742 TheLoop->getStartLoc(), 5743 TheLoop->getHeader()) 5744 << "Max legal vector width too small, scalable vectorization " 5745 << "unfeasible. Using fixed-width vectorization instead."; 5746 }); 5747 return computeFeasibleMaxVF( 5748 ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue())); 5749 } 5750 } 5751 5752 LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n"); 5753 5754 if (ElementCount::isKnownLE(UserVF, MaxSafeVF)) 5755 return UserVF; 5756 5757 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5758 << " is unsafe, clamping to max safe VF=" << MaxSafeVF 5759 << ".\n"); 5760 ORE->emit([&]() { 5761 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5762 TheLoop->getStartLoc(), 5763 TheLoop->getHeader()) 5764 << "User-specified vectorization factor " 5765 << ore::NV("UserVectorizationFactor", UserVF) 5766 << " is unsafe, clamping to maximum safe vectorization factor " 5767 << ore::NV("VectorizationFactor", MaxSafeVF); 5768 }); 5769 return MaxSafeVF; 5770 } 5771 5772 WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits); 5773 5774 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5775 // Note that both WidestRegister and WidestType may not be a powers of 2. 
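  // Illustrative example (numbers are hypothetical): WidestRegister = 256
  // bits and WidestType = 24 bits give 256 / 24 = 10 elements, and
  // PowerOf2Floor(10) yields a MaxVectorSize of 8 lanes.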
5776 auto MaxVectorSize = 5777 ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType)); 5778 5779 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5780 << " / " << WidestType << " bits.\n"); 5781 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5782 << WidestRegister << " bits.\n"); 5783 5784 assert(MaxVectorSize.getFixedValue() <= WidestRegister && 5785 "Did not expect to pack so many elements" 5786 " into one vector!"); 5787 if (MaxVectorSize.getFixedValue() == 0) { 5788 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5789 return ElementCount::getFixed(1); 5790 } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() && 5791 isPowerOf2_32(ConstTripCount)) { 5792 // We need to clamp the VF to be the ConstTripCount. There is no point in 5793 // choosing a higher viable VF as done in the loop below. 5794 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5795 << ConstTripCount << "\n"); 5796 return ElementCount::getFixed(ConstTripCount); 5797 } 5798 5799 ElementCount MaxVF = MaxVectorSize; 5800 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5801 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5802 // Collect all viable vectorization factors larger than the default MaxVF 5803 // (i.e. MaxVectorSize). 5804 SmallVector<ElementCount, 8> VFs; 5805 auto MaxVectorSizeMaxBW = 5806 ElementCount::getFixed(WidestRegister / SmallestType); 5807 for (ElementCount VS = MaxVectorSize * 2; 5808 ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2) 5809 VFs.push_back(VS); 5810 5811 // For each VF calculate its register usage. 5812 auto RUs = calculateRegisterUsage(VFs); 5813 5814 // Select the largest VF which doesn't require more registers than existing 5815 // ones. 5816 for (int i = RUs.size() - 1; i >= 0; --i) { 5817 bool Selected = true; 5818 for (auto &pair : RUs[i].MaxLocalUsers) { 5819 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5820 if (pair.second > TargetNumRegisters) 5821 Selected = false; 5822 } 5823 if (Selected) { 5824 MaxVF = VFs[i]; 5825 break; 5826 } 5827 } 5828 if (ElementCount MinVF = 5829 TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) { 5830 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5831 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5832 << ") with target's minimum: " << MinVF << '\n'); 5833 MaxVF = MinVF; 5834 } 5835 } 5836 } 5837 return MaxVF; 5838 } 5839 5840 VectorizationFactor 5841 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5842 // FIXME: This can be fixed for scalable vectors later, because at this stage 5843 // the LoopVectorizer will only consider vectorizing a loop with scalable 5844 // vectors when the loop has a hint to enable vectorization for a given VF. 5845 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5846 5847 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5848 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5849 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5850 5851 auto Width = ElementCount::getFixed(1); 5852 const float ScalarCost = *ExpectedCost.getValue(); 5853 float Cost = ScalarCost; 5854 5855 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5856 if (ForceVectorization && MaxVF.isVector()) { 5857 // Ignore scalar width, because the user explicitly wants vectorization. 
5858 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5859 // evaluation. 5860 Cost = std::numeric_limits<float>::max(); 5861 } 5862 5863 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 5864 i *= 2) { 5865 // Notice that the vector loop needs to be executed less times, so 5866 // we need to divide the cost of the vector loops by the width of 5867 // the vector elements. 5868 VectorizationCostTy C = expectedCost(i); 5869 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5870 float VectorCost = *C.first.getValue() / (float)i.getFixedValue(); 5871 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5872 << " costs: " << (int)VectorCost << ".\n"); 5873 if (!C.second && !ForceVectorization) { 5874 LLVM_DEBUG( 5875 dbgs() << "LV: Not considering vector loop of width " << i 5876 << " because it will not generate any vector instructions.\n"); 5877 continue; 5878 } 5879 5880 // If profitable add it to ProfitableVF list. 5881 if (VectorCost < ScalarCost) { 5882 ProfitableVFs.push_back(VectorizationFactor( 5883 {i, (unsigned)VectorCost})); 5884 } 5885 5886 if (VectorCost < Cost) { 5887 Cost = VectorCost; 5888 Width = i; 5889 } 5890 } 5891 5892 if (!EnableCondStoresVectorization && NumPredStores) { 5893 reportVectorizationFailure("There are conditional stores.", 5894 "store that is conditionally executed prevents vectorization", 5895 "ConditionalStore", ORE, TheLoop); 5896 Width = ElementCount::getFixed(1); 5897 Cost = ScalarCost; 5898 } 5899 5900 LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs() 5901 << "LV: Vectorization seems to be not beneficial, " 5902 << "but was forced by a user.\n"); 5903 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5904 VectorizationFactor Factor = {Width, 5905 (unsigned)(Width.getKnownMinValue() * Cost)}; 5906 return Factor; 5907 } 5908 5909 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5910 const Loop &L, ElementCount VF) const { 5911 // Cross iteration phis such as reductions need special handling and are 5912 // currently unsupported. 5913 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5914 return Legal->isFirstOrderRecurrence(&Phi) || 5915 Legal->isReductionVariable(&Phi); 5916 })) 5917 return false; 5918 5919 // Phis with uses outside of the loop require special handling and are 5920 // currently unsupported. 5921 for (auto &Entry : Legal->getInductionVars()) { 5922 // Look for uses of the value of the induction at the last iteration. 5923 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5924 for (User *U : PostInc->users()) 5925 if (!L.contains(cast<Instruction>(U))) 5926 return false; 5927 // Look for uses of penultimate value of the induction. 5928 for (User *U : Entry.first->users()) 5929 if (!L.contains(cast<Instruction>(U))) 5930 return false; 5931 } 5932 5933 // Induction variables that are widened require special handling that is 5934 // currently not supported. 
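  // Concretely, the check below only accepts the loop if every induction
  // variable is either scalar after vectorization or profitable to
  // scalarize at the given VF.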
5935 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5936 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5937 this->isProfitableToScalarize(Entry.first, VF)); 5938 })) 5939 return false; 5940 5941 return true; 5942 } 5943 5944 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5945 const ElementCount VF) const { 5946 // FIXME: We need a much better cost-model to take different parameters such 5947 // as register pressure, code size increase and cost of extra branches into 5948 // account. For now we apply a very crude heuristic and only consider loops 5949 // with vectorization factors larger than a certain value. 5950 // We also consider epilogue vectorization unprofitable for targets that don't 5951 // consider interleaving beneficial (eg. MVE). 5952 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5953 return false; 5954 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5955 return true; 5956 return false; 5957 } 5958 5959 VectorizationFactor 5960 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5961 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5962 VectorizationFactor Result = VectorizationFactor::Disabled(); 5963 if (!EnableEpilogueVectorization) { 5964 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5965 return Result; 5966 } 5967 5968 if (!isScalarEpilogueAllowed()) { 5969 LLVM_DEBUG( 5970 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5971 "allowed.\n";); 5972 return Result; 5973 } 5974 5975 // FIXME: This can be fixed for scalable vectors later, because at this stage 5976 // the LoopVectorizer will only consider vectorizing a loop with scalable 5977 // vectors when the loop has a hint to enable vectorization for a given VF. 5978 if (MainLoopVF.isScalable()) { 5979 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5980 "yet supported.\n"); 5981 return Result; 5982 } 5983 5984 // Not really a cost consideration, but check for unsupported cases here to 5985 // simplify the logic. 
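  // The checks below run in order: reject unsupported candidates, honour a
  // forced epilogue VF if a plan for it exists, skip functions optimized for
  // size, and apply the profitability heuristic; only then is the
  // lowest-cost profitable VF smaller than MainLoopVF chosen from
  // ProfitableVFs (provided a plan with both VFs exists).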
5986 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5987 LLVM_DEBUG( 5988 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5989 "not a supported candidate.\n";); 5990 return Result; 5991 } 5992 5993 if (EpilogueVectorizationForceVF > 1) { 5994 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5995 if (LVP.hasPlanWithVFs( 5996 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5997 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5998 else { 5999 LLVM_DEBUG( 6000 dbgs() 6001 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6002 return Result; 6003 } 6004 } 6005 6006 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6007 TheLoop->getHeader()->getParent()->hasMinSize()) { 6008 LLVM_DEBUG( 6009 dbgs() 6010 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6011 return Result; 6012 } 6013 6014 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6015 return Result; 6016 6017 for (auto &NextVF : ProfitableVFs) 6018 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6019 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 6020 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6021 Result = NextVF; 6022 6023 if (Result != VectorizationFactor::Disabled()) 6024 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6025 << Result.Width.getFixedValue() << "\n";); 6026 return Result; 6027 } 6028 6029 std::pair<unsigned, unsigned> 6030 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6031 unsigned MinWidth = -1U; 6032 unsigned MaxWidth = 8; 6033 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6034 6035 // For each block. 6036 for (BasicBlock *BB : TheLoop->blocks()) { 6037 // For each instruction in the loop. 6038 for (Instruction &I : BB->instructionsWithoutDebug()) { 6039 Type *T = I.getType(); 6040 6041 // Skip ignored values. 6042 if (ValuesToIgnore.count(&I)) 6043 continue; 6044 6045 // Only examine Loads, Stores and PHINodes. 6046 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6047 continue; 6048 6049 // Examine PHI nodes that are reduction variables. Update the type to 6050 // account for the recurrence type. 6051 if (auto *PN = dyn_cast<PHINode>(&I)) { 6052 if (!Legal->isReductionVariable(PN)) 6053 continue; 6054 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6055 if (PreferInLoopReductions || 6056 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6057 RdxDesc.getRecurrenceType(), 6058 TargetTransformInfo::ReductionFlags())) 6059 continue; 6060 T = RdxDesc.getRecurrenceType(); 6061 } 6062 6063 // Examine the stored values. 6064 if (auto *ST = dyn_cast<StoreInst>(&I)) 6065 T = ST->getValueOperand()->getType(); 6066 6067 // Ignore loaded pointer types and stored pointer types that are not 6068 // vectorizable. 6069 // 6070 // FIXME: The check here attempts to predict whether a load or store will 6071 // be vectorized. We only know this for certain after a VF has 6072 // been selected. Here, we assume that if an access can be 6073 // vectorized, it will be. We should also look at extending this 6074 // optimization to non-pointer types. 
6075 // 6076 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6077 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6078 continue; 6079 6080 MinWidth = std::min(MinWidth, 6081 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6082 MaxWidth = std::max(MaxWidth, 6083 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6084 } 6085 } 6086 6087 return {MinWidth, MaxWidth}; 6088 } 6089 6090 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6091 unsigned LoopCost) { 6092 // -- The interleave heuristics -- 6093 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6094 // There are many micro-architectural considerations that we can't predict 6095 // at this level. For example, frontend pressure (on decode or fetch) due to 6096 // code size, or the number and capabilities of the execution ports. 6097 // 6098 // We use the following heuristics to select the interleave count: 6099 // 1. If the code has reductions, then we interleave to break the cross 6100 // iteration dependency. 6101 // 2. If the loop is really small, then we interleave to reduce the loop 6102 // overhead. 6103 // 3. We don't interleave if we think that we will spill registers to memory 6104 // due to the increased register pressure. 6105 6106 if (!isScalarEpilogueAllowed()) 6107 return 1; 6108 6109 // We used the distance for the interleave count. 6110 if (Legal->getMaxSafeDepDistBytes() != -1U) 6111 return 1; 6112 6113 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6114 const bool HasReductions = !Legal->getReductionVars().empty(); 6115 // Do not interleave loops with a relatively small known or estimated trip 6116 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6117 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6118 // because with the above conditions interleaving can expose ILP and break 6119 // cross iteration dependences for reductions. 6120 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6121 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6122 return 1; 6123 6124 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6125 // We divide by these constants so assume that we have at least one 6126 // instruction that uses at least one register. 6127 for (auto& pair : R.MaxLocalUsers) { 6128 pair.second = std::max(pair.second, 1U); 6129 } 6130 6131 // We calculate the interleave count using the following formula. 6132 // Subtract the number of loop invariants from the number of available 6133 // registers. These registers are used by all of the interleaved instances. 6134 // Next, divide the remaining registers by the number of registers that is 6135 // required by the loop, in order to estimate how many parallel instances 6136 // fit without causing spills. All of this is rounded down if necessary to be 6137 // a power of two. We want power of two interleave count to simplify any 6138 // addressing operations or alignment considerations. 6139 // We also want power of two interleave counts to ensure that the induction 6140 // variable of the vector loop wraps to zero, when tail is folded by masking; 6141 // this currently happens when OptForSize, in which case IC is set to 1 above. 
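  // Illustrative example (numbers are hypothetical): with 16 registers in a
  // class, 2 of them taken by loop-invariant values and at most 2 values
  // live at once, the plain estimate is
  // PowerOf2Floor((16 - 2) / 2) = PowerOf2Floor(7) = 4, while the
  // EnableIndVarRegisterHeur variant below gives
  // PowerOf2Floor((16 - 2 - 1) / max(1, 2 - 1)) = PowerOf2Floor(13) = 8.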
6142 unsigned IC = UINT_MAX; 6143 6144 for (auto& pair : R.MaxLocalUsers) { 6145 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6146 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6147 << " registers of " 6148 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6149 if (VF.isScalar()) { 6150 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6151 TargetNumRegisters = ForceTargetNumScalarRegs; 6152 } else { 6153 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6154 TargetNumRegisters = ForceTargetNumVectorRegs; 6155 } 6156 unsigned MaxLocalUsers = pair.second; 6157 unsigned LoopInvariantRegs = 0; 6158 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6159 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6160 6161 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6162 // Don't count the induction variable as interleaved. 6163 if (EnableIndVarRegisterHeur) { 6164 TmpIC = 6165 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6166 std::max(1U, (MaxLocalUsers - 1))); 6167 } 6168 6169 IC = std::min(IC, TmpIC); 6170 } 6171 6172 // Clamp the interleave ranges to reasonable counts. 6173 unsigned MaxInterleaveCount = 6174 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6175 6176 // Check if the user has overridden the max. 6177 if (VF.isScalar()) { 6178 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6179 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6180 } else { 6181 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6182 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6183 } 6184 6185 // If trip count is known or estimated compile time constant, limit the 6186 // interleave count to be less than the trip count divided by VF, provided it 6187 // is at least 1. 6188 // 6189 // For scalable vectors we can't know if interleaving is beneficial. It may 6190 // not be beneficial for small loops if none of the lanes in the second vector 6191 // iterations is enabled. However, for larger loops, there is likely to be a 6192 // similar benefit as for fixed-width vectors. For now, we choose to leave 6193 // the InterleaveCount as if vscale is '1', although if some information about 6194 // the vector is known (e.g. min vector size), we can make a better decision. 6195 if (BestKnownTC) { 6196 MaxInterleaveCount = 6197 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6198 // Make sure MaxInterleaveCount is greater than 0. 6199 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6200 } 6201 6202 assert(MaxInterleaveCount > 0 && 6203 "Maximum interleave count must be greater than 0"); 6204 6205 // Clamp the calculated IC to be between the 1 and the max interleave count 6206 // that the target and trip count allows. 6207 if (IC > MaxInterleaveCount) 6208 IC = MaxInterleaveCount; 6209 else 6210 // Make sure IC is greater than 0. 6211 IC = std::max(1u, IC); 6212 6213 assert(IC > 0 && "Interleave count must be greater than 0."); 6214 6215 // If we did not calculate the cost for VF (because the user selected the VF) 6216 // then we calculate the cost of VF here. 6217 if (LoopCost == 0) { 6218 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6219 LoopCost = *expectedCost(VF).first.getValue(); 6220 } 6221 6222 assert(LoopCost && "Non-zero loop cost expected"); 6223 6224 // Interleave if we vectorized this loop and there is a reduction that could 6225 // benefit from interleaving. 
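  // With a vector VF and any reduction present, the register-pressure based
  // IC computed above is returned directly; the small-loop and load/store
  // port-saturation heuristics further below only apply to the remaining
  // cases.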
6226 if (VF.isVector() && HasReductions) { 6227 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6228 return IC; 6229 } 6230 6231 // Note that if we've already vectorized the loop we will have done the 6232 // runtime check and so interleaving won't require further checks. 6233 bool InterleavingRequiresRuntimePointerCheck = 6234 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6235 6236 // We want to interleave small loops in order to reduce the loop overhead and 6237 // potentially expose ILP opportunities. 6238 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6239 << "LV: IC is " << IC << '\n' 6240 << "LV: VF is " << VF << '\n'); 6241 const bool AggressivelyInterleaveReductions = 6242 TTI.enableAggressiveInterleaving(HasReductions); 6243 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6244 // We assume that the cost overhead is 1 and we use the cost model 6245 // to estimate the cost of the loop and interleave until the cost of the 6246 // loop overhead is about 5% of the cost of the loop. 6247 unsigned SmallIC = 6248 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6249 6250 // Interleave until store/load ports (estimated by max interleave count) are 6251 // saturated. 6252 unsigned NumStores = Legal->getNumStores(); 6253 unsigned NumLoads = Legal->getNumLoads(); 6254 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6255 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6256 6257 // If we have a scalar reduction (vector reductions are already dealt with 6258 // by this point), we can increase the critical path length if the loop 6259 // we're interleaving is inside another loop. Limit, by default to 2, so the 6260 // critical path only gets increased by one reduction operation. 6261 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6262 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6263 SmallIC = std::min(SmallIC, F); 6264 StoresIC = std::min(StoresIC, F); 6265 LoadsIC = std::min(LoadsIC, F); 6266 } 6267 6268 if (EnableLoadStoreRuntimeInterleave && 6269 std::max(StoresIC, LoadsIC) > SmallIC) { 6270 LLVM_DEBUG( 6271 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6272 return std::max(StoresIC, LoadsIC); 6273 } 6274 6275 // If there are scalar reductions and TTI has enabled aggressive 6276 // interleaving for reductions, we will interleave to expose ILP. 6277 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6278 AggressivelyInterleaveReductions) { 6279 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6280 // Interleave no less than SmallIC but not as aggressive as the normal IC 6281 // to satisfy the rare situation when resources are too limited. 6282 return std::max(IC / 2, SmallIC); 6283 } else { 6284 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6285 return SmallIC; 6286 } 6287 } 6288 6289 // Interleave if this is a large loop (small loops are already dealt with by 6290 // this point) that could benefit from interleaving. 6291 if (AggressivelyInterleaveReductions) { 6292 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6293 return IC; 6294 } 6295 6296 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6297 return 1; 6298 } 6299 6300 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6301 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6302 // This function calculates the register usage by measuring the highest number 6303 // of values that are alive at a single location. 
Obviously, this is a very 6304 // rough estimation. We scan the loop in a topological order in order and 6305 // assign a number to each instruction. We use RPO to ensure that defs are 6306 // met before their users. We assume that each instruction that has in-loop 6307 // users starts an interval. We record every time that an in-loop value is 6308 // used, so we have a list of the first and last occurrences of each 6309 // instruction. Next, we transpose this data structure into a multi map that 6310 // holds the list of intervals that *end* at a specific location. This multi 6311 // map allows us to perform a linear search. We scan the instructions linearly 6312 // and record each time that a new interval starts, by placing it in a set. 6313 // If we find this value in the multi-map then we remove it from the set. 6314 // The max register usage is the maximum size of the set. 6315 // We also search for instructions that are defined outside the loop, but are 6316 // used inside the loop. We need this number separately from the max-interval 6317 // usage number because when we unroll, loop-invariant values do not take 6318 // more register. 6319 LoopBlocksDFS DFS(TheLoop); 6320 DFS.perform(LI); 6321 6322 RegisterUsage RU; 6323 6324 // Each 'key' in the map opens a new interval. The values 6325 // of the map are the index of the 'last seen' usage of the 6326 // instruction that is the key. 6327 using IntervalMap = DenseMap<Instruction *, unsigned>; 6328 6329 // Maps instruction to its index. 6330 SmallVector<Instruction *, 64> IdxToInstr; 6331 // Marks the end of each interval. 6332 IntervalMap EndPoint; 6333 // Saves the list of instruction indices that are used in the loop. 6334 SmallPtrSet<Instruction *, 8> Ends; 6335 // Saves the list of values that are used in the loop but are 6336 // defined outside the loop, such as arguments and constants. 6337 SmallPtrSet<Value *, 8> LoopInvariants; 6338 6339 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6340 for (Instruction &I : BB->instructionsWithoutDebug()) { 6341 IdxToInstr.push_back(&I); 6342 6343 // Save the end location of each USE. 6344 for (Value *U : I.operands()) { 6345 auto *Instr = dyn_cast<Instruction>(U); 6346 6347 // Ignore non-instruction values such as arguments, constants, etc. 6348 if (!Instr) 6349 continue; 6350 6351 // If this instruction is outside the loop then record it and continue. 6352 if (!TheLoop->contains(Instr)) { 6353 LoopInvariants.insert(Instr); 6354 continue; 6355 } 6356 6357 // Overwrite previous end points. 6358 EndPoint[Instr] = IdxToInstr.size(); 6359 Ends.insert(Instr); 6360 } 6361 } 6362 } 6363 6364 // Saves the list of intervals that end with the index in 'key'. 6365 using InstrList = SmallVector<Instruction *, 2>; 6366 DenseMap<unsigned, InstrList> TransposeEnds; 6367 6368 // Transpose the EndPoints to a list of values that end at each index. 6369 for (auto &Interval : EndPoint) 6370 TransposeEnds[Interval.second].push_back(Interval.first); 6371 6372 SmallPtrSet<Instruction *, 8> OpenIntervals; 6373 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6374 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6375 6376 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6377 6378 // A lambda that gets the register usage for the given type and VF. 
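  // For example (hypothetical type and VF), an i32 element at VF = 8 is
  // queried as the register usage of <8 x i32>; token types and types that
  // are not valid vector element types report a usage of 0.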
6379 const auto &TTICapture = TTI; 6380 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6381 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6382 return 0U; 6383 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6384 }; 6385 6386 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6387 Instruction *I = IdxToInstr[i]; 6388 6389 // Remove all of the instructions that end at this location. 6390 InstrList &List = TransposeEnds[i]; 6391 for (Instruction *ToRemove : List) 6392 OpenIntervals.erase(ToRemove); 6393 6394 // Ignore instructions that are never used within the loop. 6395 if (!Ends.count(I)) 6396 continue; 6397 6398 // Skip ignored values. 6399 if (ValuesToIgnore.count(I)) 6400 continue; 6401 6402 // For each VF find the maximum usage of registers. 6403 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6404 // Count the number of live intervals. 6405 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6406 6407 if (VFs[j].isScalar()) { 6408 for (auto Inst : OpenIntervals) { 6409 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6410 if (RegUsage.find(ClassID) == RegUsage.end()) 6411 RegUsage[ClassID] = 1; 6412 else 6413 RegUsage[ClassID] += 1; 6414 } 6415 } else { 6416 collectUniformsAndScalars(VFs[j]); 6417 for (auto Inst : OpenIntervals) { 6418 // Skip ignored values for VF > 1. 6419 if (VecValuesToIgnore.count(Inst)) 6420 continue; 6421 if (isScalarAfterVectorization(Inst, VFs[j])) { 6422 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6423 if (RegUsage.find(ClassID) == RegUsage.end()) 6424 RegUsage[ClassID] = 1; 6425 else 6426 RegUsage[ClassID] += 1; 6427 } else { 6428 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6429 if (RegUsage.find(ClassID) == RegUsage.end()) 6430 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6431 else 6432 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6433 } 6434 } 6435 } 6436 6437 for (auto& pair : RegUsage) { 6438 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6439 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6440 else 6441 MaxUsages[j][pair.first] = pair.second; 6442 } 6443 } 6444 6445 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6446 << OpenIntervals.size() << '\n'); 6447 6448 // Add the current instruction to the list of open intervals. 6449 OpenIntervals.insert(I); 6450 } 6451 6452 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6453 SmallMapVector<unsigned, unsigned, 4> Invariant; 6454 6455 for (auto Inst : LoopInvariants) { 6456 unsigned Usage = 6457 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6458 unsigned ClassID = 6459 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6460 if (Invariant.find(ClassID) == Invariant.end()) 6461 Invariant[ClassID] = Usage; 6462 else 6463 Invariant[ClassID] += Usage; 6464 } 6465 6466 LLVM_DEBUG({ 6467 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6468 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6469 << " item\n"; 6470 for (const auto &pair : MaxUsages[i]) { 6471 dbgs() << "LV(REG): RegisterClass: " 6472 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6473 << " registers\n"; 6474 } 6475 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6476 << " item\n"; 6477 for (const auto &pair : Invariant) { 6478 dbgs() << "LV(REG): RegisterClass: " 6479 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6480 << " registers\n"; 6481 } 6482 }); 6483 6484 RU.LoopInvariantRegs = Invariant; 6485 RU.MaxLocalUsers = MaxUsages[i]; 6486 RUs[i] = RU; 6487 } 6488 6489 return RUs; 6490 } 6491 6492 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6493 // TODO: Cost model for emulated masked load/store is completely 6494 // broken. This hack guides the cost model to use an artificially 6495 // high enough value to practically disable vectorization with such 6496 // operations, except where previously deployed legality hack allowed 6497 // using very low cost values. This is to avoid regressions coming simply 6498 // from moving "masked load/store" check from legality to cost model. 6499 // Masked Load/Gather emulation was previously never allowed. 6500 // Limited number of Masked Store/Scatter emulation was allowed. 6501 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6502 return isa<LoadInst>(I) || 6503 (isa<StoreInst>(I) && 6504 NumPredStores > NumberOfStoresToPredicate); 6505 } 6506 6507 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6508 // If we aren't vectorizing the loop, or if we've already collected the 6509 // instructions to scalarize, there's nothing to do. Collection may already 6510 // have occurred if we have a user-selected VF and are now computing the 6511 // expected cost for interleaving. 6512 if (VF.isScalar() || VF.isZero() || 6513 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6514 return; 6515 6516 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6517 // not profitable to scalarize any instructions, the presence of VF in the 6518 // map will indicate that we've analyzed it already. 6519 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6520 6521 // Find all the instructions that are scalar with predication in the loop and 6522 // determine if it would be better to not if-convert the blocks they are in. 6523 // If so, we also record the instructions to scalarize. 6524 for (BasicBlock *BB : TheLoop->blocks()) { 6525 if (!blockNeedsPredication(BB)) 6526 continue; 6527 for (Instruction &I : *BB) 6528 if (isScalarWithPredication(&I)) { 6529 ScalarCostsTy ScalarCosts; 6530 // Do not apply discount logic if hacked cost is needed 6531 // for emulated masked memrefs. 6532 if (!useEmulatedMaskMemRefHack(&I) && 6533 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6534 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6535 // Remember that BB will remain after vectorization. 
6536 PredicatedBBsAfterVectorization.insert(BB); 6537 } 6538 } 6539 } 6540 6541 int LoopVectorizationCostModel::computePredInstDiscount( 6542 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6543 assert(!isUniformAfterVectorization(PredInst, VF) && 6544 "Instruction marked uniform-after-vectorization will be predicated"); 6545 6546 // Initialize the discount to zero, meaning that the scalar version and the 6547 // vector version cost the same. 6548 InstructionCost Discount = 0; 6549 6550 // Holds instructions to analyze. The instructions we visit are mapped in 6551 // ScalarCosts. Those instructions are the ones that would be scalarized if 6552 // we find that the scalar version costs less. 6553 SmallVector<Instruction *, 8> Worklist; 6554 6555 // Returns true if the given instruction can be scalarized. 6556 auto canBeScalarized = [&](Instruction *I) -> bool { 6557 // We only attempt to scalarize instructions forming a single-use chain 6558 // from the original predicated block that would otherwise be vectorized. 6559 // Although not strictly necessary, we give up on instructions we know will 6560 // already be scalar to avoid traversing chains that are unlikely to be 6561 // beneficial. 6562 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6563 isScalarAfterVectorization(I, VF)) 6564 return false; 6565 6566 // If the instruction is scalar with predication, it will be analyzed 6567 // separately. We ignore it within the context of PredInst. 6568 if (isScalarWithPredication(I)) 6569 return false; 6570 6571 // If any of the instruction's operands are uniform after vectorization, 6572 // the instruction cannot be scalarized. This prevents, for example, a 6573 // masked load from being scalarized. 6574 // 6575 // We assume we will only emit a value for lane zero of an instruction 6576 // marked uniform after vectorization, rather than VF identical values. 6577 // Thus, if we scalarize an instruction that uses a uniform, we would 6578 // create uses of values corresponding to the lanes we aren't emitting code 6579 // for. This behavior can be changed by allowing getScalarValue to clone 6580 // the lane zero values for uniforms rather than asserting. 6581 for (Use &U : I->operands()) 6582 if (auto *J = dyn_cast<Instruction>(U.get())) 6583 if (isUniformAfterVectorization(J, VF)) 6584 return false; 6585 6586 // Otherwise, we can scalarize the instruction. 6587 return true; 6588 }; 6589 6590 // Compute the expected cost discount from scalarizing the entire expression 6591 // feeding the predicated instruction. We currently only consider expressions 6592 // that are single-use instruction chains. 6593 Worklist.push_back(PredInst); 6594 while (!Worklist.empty()) { 6595 Instruction *I = Worklist.pop_back_val(); 6596 6597 // If we've already analyzed the instruction, there's nothing to do. 6598 if (ScalarCosts.find(I) != ScalarCosts.end()) 6599 continue; 6600 6601 // Compute the cost of the vector instruction. Note that this cost already 6602 // includes the scalarization overhead of the predicated instruction. 6603 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6604 6605 // Compute the cost of the scalarized instruction. This cost is the cost of 6606 // the instruction as if it wasn't if-converted and instead remained in the 6607 // predicated block. We will scale this cost by block probability after 6608 // computing the scalarization overhead. 
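    // Illustrative example (costs are hypothetical): with VF = 4 and a
    // per-lane scalar cost of 2, ScalarCost starts at 4 * 2 = 8 before the
    // scalarization overhead and the division by the block probability
    // below are applied.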
6609 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6610 InstructionCost ScalarCost = 6611 VF.getKnownMinValue() * 6612 getInstructionCost(I, ElementCount::getFixed(1)).first; 6613 6614 // Compute the scalarization overhead of needed insertelement instructions 6615 // and phi nodes. 6616 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6617 ScalarCost += TTI.getScalarizationOverhead( 6618 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6619 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6620 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6621 ScalarCost += 6622 VF.getKnownMinValue() * 6623 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6624 } 6625 6626 // Compute the scalarization overhead of needed extractelement 6627 // instructions. For each of the instruction's operands, if the operand can 6628 // be scalarized, add it to the worklist; otherwise, account for the 6629 // overhead. 6630 for (Use &U : I->operands()) 6631 if (auto *J = dyn_cast<Instruction>(U.get())) { 6632 assert(VectorType::isValidElementType(J->getType()) && 6633 "Instruction has non-scalar type"); 6634 if (canBeScalarized(J)) 6635 Worklist.push_back(J); 6636 else if (needsExtract(J, VF)) { 6637 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6638 ScalarCost += TTI.getScalarizationOverhead( 6639 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6640 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6641 } 6642 } 6643 6644 // Scale the total scalar cost by block probability. 6645 ScalarCost /= getReciprocalPredBlockProb(); 6646 6647 // Compute the discount. A non-negative discount means the vector version 6648 // of the instruction costs more, and scalarizing would be beneficial. 6649 Discount += VectorCost - ScalarCost; 6650 ScalarCosts[I] = ScalarCost; 6651 } 6652 6653 return *Discount.getValue(); 6654 } 6655 6656 LoopVectorizationCostModel::VectorizationCostTy 6657 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6658 VectorizationCostTy Cost; 6659 6660 // For each block. 6661 for (BasicBlock *BB : TheLoop->blocks()) { 6662 VectorizationCostTy BlockCost; 6663 6664 // For each instruction in the old loop. 6665 for (Instruction &I : BB->instructionsWithoutDebug()) { 6666 // Skip ignored values. 6667 if (ValuesToIgnore.count(&I) || 6668 (VF.isVector() && VecValuesToIgnore.count(&I))) 6669 continue; 6670 6671 VectorizationCostTy C = getInstructionCost(&I, VF); 6672 6673 // Check if we should override the cost. 6674 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6675 C.first = InstructionCost(ForceTargetInstructionCost); 6676 6677 BlockCost.first += C.first; 6678 BlockCost.second |= C.second; 6679 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6680 << " for VF " << VF << " For instruction: " << I 6681 << '\n'); 6682 } 6683 6684 // If we are vectorizing a predicated block, it will have been 6685 // if-converted. This means that the block's instructions (aside from 6686 // stores and instructions that may divide by zero) will now be 6687 // unconditionally executed. For the scalar case, we may not always execute 6688 // the predicated block, if it is an if-else block. Thus, scale the block's 6689 // cost by the probability of executing it. blockNeedsPredication from 6690 // Legal is used so as to not include all blocks in tail folded loops. 
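    // For example, if getReciprocalPredBlockProb() is 2 (i.e. the block is
    // assumed to execute on every other iteration), the scalar block cost
    // is halved here.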
6691 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6692 BlockCost.first /= getReciprocalPredBlockProb(); 6693 6694 Cost.first += BlockCost.first; 6695 Cost.second |= BlockCost.second; 6696 } 6697 6698 return Cost; 6699 } 6700 6701 /// Gets Address Access SCEV after verifying that the access pattern 6702 /// is loop invariant except the induction variable dependence. 6703 /// 6704 /// This SCEV can be sent to the Target in order to estimate the address 6705 /// calculation cost. 6706 static const SCEV *getAddressAccessSCEV( 6707 Value *Ptr, 6708 LoopVectorizationLegality *Legal, 6709 PredicatedScalarEvolution &PSE, 6710 const Loop *TheLoop) { 6711 6712 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6713 if (!Gep) 6714 return nullptr; 6715 6716 // We are looking for a gep with all loop invariant indices except for one 6717 // which should be an induction variable. 6718 auto SE = PSE.getSE(); 6719 unsigned NumOperands = Gep->getNumOperands(); 6720 for (unsigned i = 1; i < NumOperands; ++i) { 6721 Value *Opd = Gep->getOperand(i); 6722 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6723 !Legal->isInductionVariable(Opd)) 6724 return nullptr; 6725 } 6726 6727 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6728 return PSE.getSCEV(Ptr); 6729 } 6730 6731 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6732 return Legal->hasStride(I->getOperand(0)) || 6733 Legal->hasStride(I->getOperand(1)); 6734 } 6735 6736 InstructionCost 6737 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6738 ElementCount VF) { 6739 assert(VF.isVector() && 6740 "Scalarization cost of instruction implies vectorization."); 6741 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6742 Type *ValTy = getMemInstValueType(I); 6743 auto SE = PSE.getSE(); 6744 6745 unsigned AS = getLoadStoreAddressSpace(I); 6746 Value *Ptr = getLoadStorePointerOperand(I); 6747 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6748 6749 // Figure out whether the access is strided and get the stride value 6750 // if it's known in compile time 6751 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6752 6753 // Get the cost of the scalar memory instruction and address computation. 6754 InstructionCost Cost = 6755 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6756 6757 // Don't pass *I here, since it is scalar but will actually be part of a 6758 // vectorized loop where the user of it is a vectorized instruction. 6759 const Align Alignment = getLoadStoreAlignment(I); 6760 Cost += VF.getKnownMinValue() * 6761 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6762 AS, TTI::TCK_RecipThroughput); 6763 6764 // Get the overhead of the extractelement and insertelement instructions 6765 // we might create due to scalarization. 6766 Cost += getScalarizationOverhead(I, VF); 6767 6768 // If we have a predicated load/store, it will need extra i1 extracts and 6769 // conditional branches, but may not be executed for each vector lane. Scale 6770 // the cost by the probability of executing the predicated block. 
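  // After that scaling, the overhead of extracting the i1 mask elements and
  // a branch cost are added back, and useEmulatedMaskMemRefHack may replace
  // the result with a prohibitively large constant to effectively disable
  // such accesses.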
6771 if (isPredicatedInst(I)) { 6772 Cost /= getReciprocalPredBlockProb(); 6773 6774 // Add the cost of an i1 extract and a branch 6775 auto *Vec_i1Ty = 6776 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6777 Cost += TTI.getScalarizationOverhead( 6778 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6779 /*Insert=*/false, /*Extract=*/true); 6780 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6781 6782 if (useEmulatedMaskMemRefHack(I)) 6783 // Artificially setting to a high enough value to practically disable 6784 // vectorization with such operations. 6785 Cost = 3000000; 6786 } 6787 6788 return Cost; 6789 } 6790 6791 InstructionCost 6792 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6793 ElementCount VF) { 6794 Type *ValTy = getMemInstValueType(I); 6795 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6796 Value *Ptr = getLoadStorePointerOperand(I); 6797 unsigned AS = getLoadStoreAddressSpace(I); 6798 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6799 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6800 6801 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6802 "Stride should be 1 or -1 for consecutive memory access"); 6803 const Align Alignment = getLoadStoreAlignment(I); 6804 InstructionCost Cost = 0; 6805 if (Legal->isMaskRequired(I)) 6806 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6807 CostKind); 6808 else 6809 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6810 CostKind, I); 6811 6812 bool Reverse = ConsecutiveStride < 0; 6813 if (Reverse) 6814 Cost += 6815 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6816 return Cost; 6817 } 6818 6819 InstructionCost 6820 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6821 ElementCount VF) { 6822 assert(Legal->isUniformMemOp(*I)); 6823 6824 Type *ValTy = getMemInstValueType(I); 6825 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6826 const Align Alignment = getLoadStoreAlignment(I); 6827 unsigned AS = getLoadStoreAddressSpace(I); 6828 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6829 if (isa<LoadInst>(I)) { 6830 return TTI.getAddressComputationCost(ValTy) + 6831 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6832 CostKind) + 6833 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6834 } 6835 StoreInst *SI = cast<StoreInst>(I); 6836 6837 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6838 return TTI.getAddressComputationCost(ValTy) + 6839 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6840 CostKind) + 6841 (isLoopInvariantStoreValue 6842 ? 
0
6843 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6844 VF.getKnownMinValue() - 1));
6845 }
6846
6847 InstructionCost
6848 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6849 ElementCount VF) {
6850 Type *ValTy = getMemInstValueType(I);
6851 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6852 const Align Alignment = getLoadStoreAlignment(I);
6853 const Value *Ptr = getLoadStorePointerOperand(I);
6854
6855 return TTI.getAddressComputationCost(VectorTy) +
6856 TTI.getGatherScatterOpCost(
6857 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6858 TargetTransformInfo::TCK_RecipThroughput, I);
6859 }
6860
6861 InstructionCost
6862 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6863 ElementCount VF) {
6864 // TODO: Once we have support for interleaving with scalable vectors
6865 // we can calculate the cost properly here.
6866 if (VF.isScalable())
6867 return InstructionCost::getInvalid();
6868
6869 Type *ValTy = getMemInstValueType(I);
6870 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6871 unsigned AS = getLoadStoreAddressSpace(I);
6872
6873 auto Group = getInterleavedAccessGroup(I);
6874 assert(Group && "Fail to get an interleaved access group.");
6875
6876 unsigned InterleaveFactor = Group->getFactor();
6877 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6878
6879 // Holds the indices of existing members in an interleaved load group.
6880 // An interleaved store group doesn't need this as it doesn't allow gaps.
6881 SmallVector<unsigned, 4> Indices;
6882 if (isa<LoadInst>(I)) {
6883 for (unsigned i = 0; i < InterleaveFactor; i++)
6884 if (Group->getMember(i))
6885 Indices.push_back(i);
6886 }
6887
6888 // Calculate the cost of the whole interleaved group.
6889 bool UseMaskForGaps =
6890 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6891 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6892 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6893 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6894
6895 if (Group->isReverse()) {
6896 // TODO: Add support for reversed masked interleaved access.
6897 assert(!Legal->isMaskRequired(I) &&
6898 "Reverse masked interleaved access not supported.");
6899 Cost +=
6900 Group->getNumMembers() *
6901 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6902 }
6903 return Cost;
6904 }
6905
6906 InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
6907 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6908 // Early exit for no inloop reductions
6909 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6910 return InstructionCost::getInvalid();
6911 auto *VectorTy = cast<VectorType>(Ty);
6912
6913 // We are looking for one of the following patterns, taking the one with the
6914 // minimal acceptable cost:
6915 //   reduce(mul(ext(A), ext(B))) or
6916 //   reduce(mul(A, B)) or
6917 //   reduce(ext(A)) or reduce(A).
6918 // The basic idea is that we walk down the tree to do that, finding the root
6919 // reduction instruction in InLoopReductionImmediateChains. From there we find
6920 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6921 // of the components. If the reduction cost is lower, then we return it for the
6922 // reduction instruction and 0 for the other instructions in the pattern. If
6923 // it is not, we return an invalid cost specifying that the original cost
6924 // method should be used.
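// For illustration (names hypothetical), a loop body matching the
// reduce(mul(ext(A), ext(B))) case above might look like:
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul   = mul i32 %a.ext, %b.ext
//   %rdx   = add i32 %rdx.phi, %mul   ; root of the in-loop reduction
// Starting from the extend or the mul, the walk below reaches %rdx, which is
// looked up in InLoopReductionImmediateChains; the whole pattern is then
// costed against an extended multiply-accumulate reduction.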
6925 Instruction *RetI = I; 6926 if ((RetI->getOpcode() == Instruction::SExt || 6927 RetI->getOpcode() == Instruction::ZExt)) { 6928 if (!RetI->hasOneUser()) 6929 return InstructionCost::getInvalid(); 6930 RetI = RetI->user_back(); 6931 } 6932 if (RetI->getOpcode() == Instruction::Mul && 6933 RetI->user_back()->getOpcode() == Instruction::Add) { 6934 if (!RetI->hasOneUser()) 6935 return InstructionCost::getInvalid(); 6936 RetI = RetI->user_back(); 6937 } 6938 6939 // Test if the found instruction is a reduction, and if not return an invalid 6940 // cost specifying the parent to use the original cost modelling. 6941 if (!InLoopReductionImmediateChains.count(RetI)) 6942 return InstructionCost::getInvalid(); 6943 6944 // Find the reduction this chain is a part of and calculate the basic cost of 6945 // the reduction on its own. 6946 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6947 Instruction *ReductionPhi = LastChain; 6948 while (!isa<PHINode>(ReductionPhi)) 6949 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6950 6951 RecurrenceDescriptor RdxDesc = 6952 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6953 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6954 VectorTy, false, CostKind); 6955 6956 // Get the operand that was not the reduction chain and match it to one of the 6957 // patterns, returning the better cost if it is found. 6958 Instruction *RedOp = RetI->getOperand(1) == LastChain 6959 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6960 : dyn_cast<Instruction>(RetI->getOperand(1)); 6961 6962 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6963 6964 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6965 !TheLoop->isLoopInvariant(RedOp)) { 6966 bool IsUnsigned = isa<ZExtInst>(RedOp); 6967 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6968 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6969 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6970 CostKind); 6971 6972 unsigned ExtCost = 6973 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6974 TTI::CastContextHint::None, CostKind, RedOp); 6975 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6976 return I == RetI ? *RedCost.getValue() : 0; 6977 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 6978 Instruction *Mul = RedOp; 6979 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 6980 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 6981 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 6982 Op0->getOpcode() == Op1->getOpcode() && 6983 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6984 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6985 bool IsUnsigned = isa<ZExtInst>(Op0); 6986 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6987 // reduce(mul(ext, ext)) 6988 unsigned ExtCost = 6989 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 6990 TTI::CastContextHint::None, CostKind, Op0); 6991 InstructionCost MulCost = 6992 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6993 6994 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6995 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6996 CostKind); 6997 6998 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 6999 return I == RetI ? 
*RedCost.getValue() : 0; 7000 } else { 7001 InstructionCost MulCost = 7002 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7003 7004 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7005 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7006 CostKind); 7007 7008 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7009 return I == RetI ? *RedCost.getValue() : 0; 7010 } 7011 } 7012 7013 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7014 } 7015 7016 InstructionCost 7017 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7018 ElementCount VF) { 7019 // Calculate scalar cost only. Vectorization cost should be ready at this 7020 // moment. 7021 if (VF.isScalar()) { 7022 Type *ValTy = getMemInstValueType(I); 7023 const Align Alignment = getLoadStoreAlignment(I); 7024 unsigned AS = getLoadStoreAddressSpace(I); 7025 7026 return TTI.getAddressComputationCost(ValTy) + 7027 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7028 TTI::TCK_RecipThroughput, I); 7029 } 7030 return getWideningCost(I, VF); 7031 } 7032 7033 LoopVectorizationCostModel::VectorizationCostTy 7034 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7035 ElementCount VF) { 7036 // If we know that this instruction will remain uniform, check the cost of 7037 // the scalar version. 7038 if (isUniformAfterVectorization(I, VF)) 7039 VF = ElementCount::getFixed(1); 7040 7041 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7042 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7043 7044 // Forced scalars do not have any scalarization overhead. 7045 auto ForcedScalar = ForcedScalars.find(VF); 7046 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7047 auto InstSet = ForcedScalar->second; 7048 if (InstSet.count(I)) 7049 return VectorizationCostTy( 7050 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7051 VF.getKnownMinValue()), 7052 false); 7053 } 7054 7055 Type *VectorTy; 7056 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7057 7058 bool TypeNotScalarized = 7059 VF.isVector() && VectorTy->isVectorTy() && 7060 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7061 return VectorizationCostTy(C, TypeNotScalarized); 7062 } 7063 7064 InstructionCost 7065 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7066 ElementCount VF) const { 7067 7068 if (VF.isScalable()) 7069 return InstructionCost::getInvalid(); 7070 7071 if (VF.isScalar()) 7072 return 0; 7073 7074 InstructionCost Cost = 0; 7075 Type *RetTy = ToVectorTy(I->getType(), VF); 7076 if (!RetTy->isVoidTy() && 7077 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7078 Cost += TTI.getScalarizationOverhead( 7079 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7080 true, false); 7081 7082 // Some targets keep addresses scalar. 7083 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7084 return Cost; 7085 7086 // Some targets support efficient element stores. 7087 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7088 return Cost; 7089 7090 // Collect operands to consider. 7091 CallInst *CI = dyn_cast<CallInst>(I); 7092 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7093 7094 // Skip operands that do not require extraction/scalarization and do not incur 7095 // any overhead. 
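// Rough illustration (names hypothetical): for a scalarized call
//   %r = call float @foo(float %x, float %y)
// with VF = 4, the overhead collected here is the inserts needed to rebuild
// the <4 x float> result plus the extracts needed to feed each scalar call
// with the lane values of %x and %y, unless an operand is filtered out above
// because it will not have a vector form to extract from anyway.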
7096 SmallVector<Type *> Tys; 7097 for (auto *V : filterExtractingOperands(Ops, VF)) 7098 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7099 return Cost + TTI.getOperandsScalarizationOverhead( 7100 filterExtractingOperands(Ops, VF), Tys); 7101 } 7102 7103 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7104 if (VF.isScalar()) 7105 return; 7106 NumPredStores = 0; 7107 for (BasicBlock *BB : TheLoop->blocks()) { 7108 // For each instruction in the old loop. 7109 for (Instruction &I : *BB) { 7110 Value *Ptr = getLoadStorePointerOperand(&I); 7111 if (!Ptr) 7112 continue; 7113 7114 // TODO: We should generate better code and update the cost model for 7115 // predicated uniform stores. Today they are treated as any other 7116 // predicated store (see added test cases in 7117 // invariant-store-vectorization.ll). 7118 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7119 NumPredStores++; 7120 7121 if (Legal->isUniformMemOp(I)) { 7122 // TODO: Avoid replicating loads and stores instead of 7123 // relying on instcombine to remove them. 7124 // Load: Scalar load + broadcast 7125 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7126 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7127 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7128 continue; 7129 } 7130 7131 // We assume that widening is the best solution when possible. 7132 if (memoryInstructionCanBeWidened(&I, VF)) { 7133 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7134 int ConsecutiveStride = 7135 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7136 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7137 "Expected consecutive stride."); 7138 InstWidening Decision = 7139 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7140 setWideningDecision(&I, VF, Decision, Cost); 7141 continue; 7142 } 7143 7144 // Choose between Interleaving, Gather/Scatter or Scalarization. 7145 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7146 unsigned NumAccesses = 1; 7147 if (isAccessInterleaved(&I)) { 7148 auto Group = getInterleavedAccessGroup(&I); 7149 assert(Group && "Fail to get an interleaved access group."); 7150 7151 // Make one decision for the whole group. 7152 if (getWideningDecision(&I, VF) != CM_Unknown) 7153 continue; 7154 7155 NumAccesses = Group->getNumMembers(); 7156 if (interleavedAccessCanBeWidened(&I, VF)) 7157 InterleaveCost = getInterleaveGroupCost(&I, VF); 7158 } 7159 7160 InstructionCost GatherScatterCost = 7161 isLegalGatherOrScatter(&I) 7162 ? getGatherScatterCost(&I, VF) * NumAccesses 7163 : InstructionCost::getInvalid(); 7164 7165 InstructionCost ScalarizationCost = 7166 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses 7167 : InstructionCost::getInvalid(); 7168 7169 // Choose better solution for the current VF, 7170 // write down this decision and use it during vectorization. 7171 InstructionCost Cost; 7172 InstWidening Decision; 7173 if (InterleaveCost <= GatherScatterCost && 7174 InterleaveCost < ScalarizationCost) { 7175 Decision = CM_Interleave; 7176 Cost = InterleaveCost; 7177 } else if (GatherScatterCost < ScalarizationCost) { 7178 Decision = CM_GatherScatter; 7179 Cost = GatherScatterCost; 7180 } else { 7181 assert(!VF.isScalable() && 7182 "We cannot yet scalarise for scalable vectors"); 7183 Decision = CM_Scalarize; 7184 Cost = ScalarizationCost; 7185 } 7186 // If the instructions belongs to an interleave group, the whole group 7187 // receives the same decision. 
The whole group receives the cost, but 7188 // the cost will actually be assigned to one instruction. 7189 if (auto Group = getInterleavedAccessGroup(&I)) 7190 setWideningDecision(Group, VF, Decision, Cost); 7191 else 7192 setWideningDecision(&I, VF, Decision, Cost); 7193 } 7194 } 7195 7196 // Make sure that any load of address and any other address computation 7197 // remains scalar unless there is gather/scatter support. This avoids 7198 // inevitable extracts into address registers, and also has the benefit of 7199 // activating LSR more, since that pass can't optimize vectorized 7200 // addresses. 7201 if (TTI.prefersVectorizedAddressing()) 7202 return; 7203 7204 // Start with all scalar pointer uses. 7205 SmallPtrSet<Instruction *, 8> AddrDefs; 7206 for (BasicBlock *BB : TheLoop->blocks()) 7207 for (Instruction &I : *BB) { 7208 Instruction *PtrDef = 7209 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7210 if (PtrDef && TheLoop->contains(PtrDef) && 7211 getWideningDecision(&I, VF) != CM_GatherScatter) 7212 AddrDefs.insert(PtrDef); 7213 } 7214 7215 // Add all instructions used to generate the addresses. 7216 SmallVector<Instruction *, 4> Worklist; 7217 append_range(Worklist, AddrDefs); 7218 while (!Worklist.empty()) { 7219 Instruction *I = Worklist.pop_back_val(); 7220 for (auto &Op : I->operands()) 7221 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7222 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7223 AddrDefs.insert(InstOp).second) 7224 Worklist.push_back(InstOp); 7225 } 7226 7227 for (auto *I : AddrDefs) { 7228 if (isa<LoadInst>(I)) { 7229 // Setting the desired widening decision should ideally be handled in 7230 // by cost functions, but since this involves the task of finding out 7231 // if the loaded register is involved in an address computation, it is 7232 // instead changed here when we know this is the case. 7233 InstWidening Decision = getWideningDecision(I, VF); 7234 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7235 // Scalarize a widened load of address. 7236 setWideningDecision( 7237 I, VF, CM_Scalarize, 7238 (VF.getKnownMinValue() * 7239 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7240 else if (auto Group = getInterleavedAccessGroup(I)) { 7241 // Scalarize an interleave group of address loads. 7242 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7243 if (Instruction *Member = Group->getMember(I)) 7244 setWideningDecision( 7245 Member, VF, CM_Scalarize, 7246 (VF.getKnownMinValue() * 7247 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7248 } 7249 } 7250 } else 7251 // Make sure I gets scalarized and a cost estimate without 7252 // scalarization overhead. 7253 ForcedScalars[VF].insert(I); 7254 } 7255 } 7256 7257 InstructionCost 7258 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7259 Type *&VectorTy) { 7260 Type *RetTy = I->getType(); 7261 if (canTruncateToMinimalBitwidth(I, VF)) 7262 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7263 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7264 auto SE = PSE.getSE(); 7265 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7266 7267 // TODO: We need to estimate the cost of intrinsic calls. 7268 switch (I->getOpcode()) { 7269 case Instruction::GetElementPtr: 7270 // We mark this instruction as zero-cost because the cost of GEPs in 7271 // vectorized code depends on whether the corresponding memory instruction 7272 // is scalarized or not. 
Therefore, we handle GEPs with the memory 7273 // instruction cost. 7274 return 0; 7275 case Instruction::Br: { 7276 // In cases of scalarized and predicated instructions, there will be VF 7277 // predicated blocks in the vectorized loop. Each branch around these 7278 // blocks requires also an extract of its vector compare i1 element. 7279 bool ScalarPredicatedBB = false; 7280 BranchInst *BI = cast<BranchInst>(I); 7281 if (VF.isVector() && BI->isConditional() && 7282 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7283 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7284 ScalarPredicatedBB = true; 7285 7286 if (ScalarPredicatedBB) { 7287 // Return cost for branches around scalarized and predicated blocks. 7288 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7289 auto *Vec_i1Ty = 7290 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7291 return (TTI.getScalarizationOverhead( 7292 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7293 false, true) + 7294 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7295 VF.getKnownMinValue())); 7296 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7297 // The back-edge branch will remain, as will all scalar branches. 7298 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7299 else 7300 // This branch will be eliminated by if-conversion. 7301 return 0; 7302 // Note: We currently assume zero cost for an unconditional branch inside 7303 // a predicated block since it will become a fall-through, although we 7304 // may decide in the future to call TTI for all branches. 7305 } 7306 case Instruction::PHI: { 7307 auto *Phi = cast<PHINode>(I); 7308 7309 // First-order recurrences are replaced by vector shuffles inside the loop. 7310 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7311 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7312 return TTI.getShuffleCost( 7313 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7314 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7315 7316 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7317 // converted into select instructions. We require N - 1 selects per phi 7318 // node, where N is the number of incoming values. 7319 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7320 return (Phi->getNumIncomingValues() - 1) * 7321 TTI.getCmpSelInstrCost( 7322 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7323 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7324 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7325 7326 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7327 } 7328 case Instruction::UDiv: 7329 case Instruction::SDiv: 7330 case Instruction::URem: 7331 case Instruction::SRem: 7332 // If we have a predicated instruction, it may not be executed for each 7333 // vector lane. Get the scalarization cost and scale this amount by the 7334 // probability of executing the predicated block. If the instruction is not 7335 // predicated, we fall through to the next case. 7336 if (VF.isVector() && isScalarWithPredication(I)) { 7337 InstructionCost Cost = 0; 7338 7339 // These instructions have a non-void type, so account for the phi nodes 7340 // that we will create. This cost is likely to be zero. The phi node 7341 // cost, if any, should be scaled by the block probability because it 7342 // models a copy at the end of each predicated block. 
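// Worked example with hypothetical TTI numbers: for VF = 4, a predicated
// SDiv with scalar cost 20, PHI cost 0 and scalarization overhead 8 is
// costed below as
//   (4*0 + 4*20 + 8) / 2 = 44
// assuming the usual 50% block-probability heuristic (a reciprocal of 2).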
7343 Cost += VF.getKnownMinValue() * 7344 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7345 7346 // The cost of the non-predicated instruction. 7347 Cost += VF.getKnownMinValue() * 7348 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7349 7350 // The cost of insertelement and extractelement instructions needed for 7351 // scalarization. 7352 Cost += getScalarizationOverhead(I, VF); 7353 7354 // Scale the cost by the probability of executing the predicated blocks. 7355 // This assumes the predicated block for each vector lane is equally 7356 // likely. 7357 return Cost / getReciprocalPredBlockProb(); 7358 } 7359 LLVM_FALLTHROUGH; 7360 case Instruction::Add: 7361 case Instruction::FAdd: 7362 case Instruction::Sub: 7363 case Instruction::FSub: 7364 case Instruction::Mul: 7365 case Instruction::FMul: 7366 case Instruction::FDiv: 7367 case Instruction::FRem: 7368 case Instruction::Shl: 7369 case Instruction::LShr: 7370 case Instruction::AShr: 7371 case Instruction::And: 7372 case Instruction::Or: 7373 case Instruction::Xor: { 7374 // Since we will replace the stride by 1 the multiplication should go away. 7375 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7376 return 0; 7377 7378 // Detect reduction patterns 7379 InstructionCost RedCost; 7380 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7381 .isValid()) 7382 return RedCost; 7383 7384 // Certain instructions can be cheaper to vectorize if they have a constant 7385 // second vector operand. One example of this are shifts on x86. 7386 Value *Op2 = I->getOperand(1); 7387 TargetTransformInfo::OperandValueProperties Op2VP; 7388 TargetTransformInfo::OperandValueKind Op2VK = 7389 TTI.getOperandInfo(Op2, Op2VP); 7390 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7391 Op2VK = TargetTransformInfo::OK_UniformValue; 7392 7393 SmallVector<const Value *, 4> Operands(I->operand_values()); 7394 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7395 return N * TTI.getArithmeticInstrCost( 7396 I->getOpcode(), VectorTy, CostKind, 7397 TargetTransformInfo::OK_AnyValue, 7398 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7399 } 7400 case Instruction::FNeg: { 7401 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7402 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7403 return N * TTI.getArithmeticInstrCost( 7404 I->getOpcode(), VectorTy, CostKind, 7405 TargetTransformInfo::OK_AnyValue, 7406 TargetTransformInfo::OK_AnyValue, 7407 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7408 I->getOperand(0), I); 7409 } 7410 case Instruction::Select: { 7411 SelectInst *SI = cast<SelectInst>(I); 7412 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7413 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7414 Type *CondTy = SI->getCondition()->getType(); 7415 if (!ScalarCond) 7416 CondTy = VectorType::get(CondTy, VF); 7417 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7418 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7419 } 7420 case Instruction::ICmp: 7421 case Instruction::FCmp: { 7422 Type *ValTy = I->getOperand(0)->getType(); 7423 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7424 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7425 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7426 VectorTy = ToVectorTy(ValTy, VF); 7427 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7428 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7429 } 7430 case Instruction::Store: 7431 case Instruction::Load: { 7432 ElementCount Width = VF; 7433 if (Width.isVector()) { 7434 InstWidening Decision = getWideningDecision(I, Width); 7435 assert(Decision != CM_Unknown && 7436 "CM decision should be taken at this point"); 7437 if (Decision == CM_Scalarize) 7438 Width = ElementCount::getFixed(1); 7439 } 7440 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7441 return getMemoryInstructionCost(I, VF); 7442 } 7443 case Instruction::ZExt: 7444 case Instruction::SExt: 7445 case Instruction::FPToUI: 7446 case Instruction::FPToSI: 7447 case Instruction::FPExt: 7448 case Instruction::PtrToInt: 7449 case Instruction::IntToPtr: 7450 case Instruction::SIToFP: 7451 case Instruction::UIToFP: 7452 case Instruction::Trunc: 7453 case Instruction::FPTrunc: 7454 case Instruction::BitCast: { 7455 // Computes the CastContextHint from a Load/Store instruction. 7456 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7457 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7458 "Expected a load or a store!"); 7459 7460 if (VF.isScalar() || !TheLoop->contains(I)) 7461 return TTI::CastContextHint::Normal; 7462 7463 switch (getWideningDecision(I, VF)) { 7464 case LoopVectorizationCostModel::CM_GatherScatter: 7465 return TTI::CastContextHint::GatherScatter; 7466 case LoopVectorizationCostModel::CM_Interleave: 7467 return TTI::CastContextHint::Interleave; 7468 case LoopVectorizationCostModel::CM_Scalarize: 7469 case LoopVectorizationCostModel::CM_Widen: 7470 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7471 : TTI::CastContextHint::Normal; 7472 case LoopVectorizationCostModel::CM_Widen_Reverse: 7473 return TTI::CastContextHint::Reversed; 7474 case LoopVectorizationCostModel::CM_Unknown: 7475 llvm_unreachable("Instr did not go through cost modelling?"); 7476 } 7477 7478 llvm_unreachable("Unhandled case!"); 7479 }; 7480 7481 unsigned Opcode = I->getOpcode(); 7482 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7483 // For Trunc, the context is the only user, which must be a StoreInst. 
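// For illustration (names hypothetical): given
//   %w   = load i32, i32* %p
//   %ext = sext i32 %w to i64
// the hint for %ext is derived from the widening decision of the load,
// e.g. Reversed for a CM_Widen_Reverse load. Conversely, for
//   %t = trunc i64 %v to i32
//   store i32 %t, i32* %q
// the hint is derived from the store that is the truncate's only user.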
7484 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7485 if (I->hasOneUse()) 7486 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7487 CCH = ComputeCCH(Store); 7488 } 7489 // For Z/Sext, the context is the operand, which must be a LoadInst. 7490 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7491 Opcode == Instruction::FPExt) { 7492 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7493 CCH = ComputeCCH(Load); 7494 } 7495 7496 // We optimize the truncation of induction variables having constant 7497 // integer steps. The cost of these truncations is the same as the scalar 7498 // operation. 7499 if (isOptimizableIVTruncate(I, VF)) { 7500 auto *Trunc = cast<TruncInst>(I); 7501 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7502 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7503 } 7504 7505 // Detect reduction patterns 7506 InstructionCost RedCost; 7507 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7508 .isValid()) 7509 return RedCost; 7510 7511 Type *SrcScalarTy = I->getOperand(0)->getType(); 7512 Type *SrcVecTy = 7513 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7514 if (canTruncateToMinimalBitwidth(I, VF)) { 7515 // This cast is going to be shrunk. This may remove the cast or it might 7516 // turn it into slightly different cast. For example, if MinBW == 16, 7517 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7518 // 7519 // Calculate the modified src and dest types. 7520 Type *MinVecTy = VectorTy; 7521 if (Opcode == Instruction::Trunc) { 7522 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7523 VectorTy = 7524 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7525 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7526 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7527 VectorTy = 7528 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7529 } 7530 } 7531 7532 unsigned N; 7533 if (isScalarAfterVectorization(I, VF)) { 7534 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7535 N = VF.getKnownMinValue(); 7536 } else 7537 N = 1; 7538 return N * 7539 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7540 } 7541 case Instruction::Call: { 7542 bool NeedToScalarize; 7543 CallInst *CI = cast<CallInst>(I); 7544 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7545 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7546 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7547 return std::min(CallCost, IntrinsicCost); 7548 } 7549 return CallCost; 7550 } 7551 case Instruction::ExtractValue: 7552 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7553 default: 7554 // The cost of executing VF copies of the scalar instruction. This opcode 7555 // is unknown. Assume that it is the same as 'mul'. 7556 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7557 Instruction::Mul, VectorTy, CostKind) + 7558 getScalarizationOverhead(I, VF); 7559 } // end of switch. 
7560 } 7561 7562 char LoopVectorize::ID = 0; 7563 7564 static const char lv_name[] = "Loop Vectorization"; 7565 7566 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7567 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7568 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7569 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7570 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7571 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7572 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7573 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7574 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7575 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7576 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7577 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7578 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7579 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7580 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7581 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7582 7583 namespace llvm { 7584 7585 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7586 7587 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7588 bool VectorizeOnlyWhenForced) { 7589 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7590 } 7591 7592 } // end namespace llvm 7593 7594 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7595 // Check if the pointer operand of a load or store instruction is 7596 // consecutive. 7597 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7598 return Legal->isConsecutivePtr(Ptr); 7599 return false; 7600 } 7601 7602 void LoopVectorizationCostModel::collectValuesToIgnore() { 7603 // Ignore ephemeral values. 7604 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7605 7606 // Ignore type-promoting instructions we identified during reduction 7607 // detection. 7608 for (auto &Reduction : Legal->getReductionVars()) { 7609 RecurrenceDescriptor &RedDes = Reduction.second; 7610 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7611 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7612 } 7613 // Ignore type-casting instructions we identified during induction 7614 // detection. 7615 for (auto &Induction : Legal->getInductionVars()) { 7616 InductionDescriptor &IndDes = Induction.second; 7617 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7618 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7619 } 7620 } 7621 7622 void LoopVectorizationCostModel::collectInLoopReductions() { 7623 for (auto &Reduction : Legal->getReductionVars()) { 7624 PHINode *Phi = Reduction.first; 7625 RecurrenceDescriptor &RdxDesc = Reduction.second; 7626 7627 // We don't collect reductions that are type promoted (yet). 7628 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7629 continue; 7630 7631 // If the target would prefer this reduction to happen "in-loop", then we 7632 // want to record it as such. 7633 unsigned Opcode = RdxDesc.getOpcode(); 7634 if (!PreferInLoopReductions && 7635 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7636 TargetTransformInfo::ReductionFlags())) 7637 continue; 7638 7639 // Check that we can correctly put the reductions into the loop, by 7640 // finding the chain of operations that leads from the phi to the loop 7641 // exit value. 
7642 SmallVector<Instruction *, 4> ReductionOperations = 7643 RdxDesc.getReductionOpChain(Phi, TheLoop); 7644 bool InLoop = !ReductionOperations.empty(); 7645 if (InLoop) { 7646 InLoopReductionChains[Phi] = ReductionOperations; 7647 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7648 Instruction *LastChain = Phi; 7649 for (auto *I : ReductionOperations) { 7650 InLoopReductionImmediateChains[I] = LastChain; 7651 LastChain = I; 7652 } 7653 } 7654 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7655 << " reduction for phi: " << *Phi << "\n"); 7656 } 7657 } 7658 7659 // TODO: we could return a pair of values that specify the max VF and 7660 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7661 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7662 // doesn't have a cost model that can choose which plan to execute if 7663 // more than one is generated. 7664 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7665 LoopVectorizationCostModel &CM) { 7666 unsigned WidestType; 7667 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7668 return WidestVectorRegBits / WidestType; 7669 } 7670 7671 VectorizationFactor 7672 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7673 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7674 ElementCount VF = UserVF; 7675 // Outer loop handling: They may require CFG and instruction level 7676 // transformations before even evaluating whether vectorization is profitable. 7677 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7678 // the vectorization pipeline. 7679 if (!OrigLoop->isInnermost()) { 7680 // If the user doesn't provide a vectorization factor, determine a 7681 // reasonable one. 7682 if (UserVF.isZero()) { 7683 VF = ElementCount::getFixed(determineVPlanVF( 7684 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7685 .getFixedSize(), 7686 CM)); 7687 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7688 7689 // Make sure we have a VF > 1 for stress testing. 7690 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7691 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7692 << "overriding computed VF.\n"); 7693 VF = ElementCount::getFixed(4); 7694 } 7695 } 7696 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7697 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7698 "VF needs to be a power of two"); 7699 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7700 << "VF " << VF << " to build VPlans.\n"); 7701 buildVPlans(VF, VF); 7702 7703 // For VPlan build stress testing, we bail out after VPlan construction. 7704 if (VPlanBuildStressTest) 7705 return VectorizationFactor::Disabled(); 7706 7707 return {VF, 0 /*Cost*/}; 7708 } 7709 7710 LLVM_DEBUG( 7711 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7712 "VPlan-native path.\n"); 7713 return VectorizationFactor::Disabled(); 7714 } 7715 7716 Optional<VectorizationFactor> 7717 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7718 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7719 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7720 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7721 return None; 7722 7723 // Invalidate interleave groups if all blocks of loop will be predicated. 
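// As an example of the consequence: a pair of accesses such as A[2*i] and
// A[2*i+1] that would otherwise be emitted as one wide load via an interleave
// group may instead be re-decided as gather/scatter or scalarized accesses
// once the tail is folded by masking on a target without masked-interleaved
// support.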
7724 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7725 !useMaskedInterleavedAccesses(*TTI)) { 7726 LLVM_DEBUG( 7727 dbgs() 7728 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7729 "which requires masked-interleaved support.\n"); 7730 if (CM.InterleaveInfo.invalidateGroups()) 7731 // Invalidating interleave groups also requires invalidating all decisions 7732 // based on them, which includes widening decisions and uniform and scalar 7733 // values. 7734 CM.invalidateCostModelingDecisions(); 7735 } 7736 7737 ElementCount MaxVF = MaybeMaxVF.getValue(); 7738 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7739 7740 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7741 if (!UserVF.isZero() && 7742 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7743 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7744 // VFs here, this should be reverted to only use legal UserVFs once the 7745 // loop below supports scalable VFs. 7746 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7747 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7748 << " VF " << VF << ".\n"); 7749 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7750 "VF needs to be a power of two"); 7751 // Collect the instructions (and their associated costs) that will be more 7752 // profitable to scalarize. 7753 CM.selectUserVectorizationFactor(VF); 7754 CM.collectInLoopReductions(); 7755 buildVPlansWithVPRecipes(VF, VF); 7756 LLVM_DEBUG(printPlans(dbgs())); 7757 return {{VF, 0}}; 7758 } 7759 7760 assert(!MaxVF.isScalable() && 7761 "Scalable vectors not yet supported beyond this point"); 7762 7763 for (ElementCount VF = ElementCount::getFixed(1); 7764 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7765 // Collect Uniform and Scalar instructions after vectorization with VF. 7766 CM.collectUniformsAndScalars(VF); 7767 7768 // Collect the instructions (and their associated costs) that will be more 7769 // profitable to scalarize. 7770 if (VF.isVector()) 7771 CM.collectInstsToScalarize(VF); 7772 } 7773 7774 CM.collectInLoopReductions(); 7775 7776 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7777 LLVM_DEBUG(printPlans(dbgs())); 7778 if (MaxVF.isScalar()) 7779 return VectorizationFactor::Disabled(); 7780 7781 // Select the optimal vectorization factor. 7782 auto SelectedVF = CM.selectVectorizationFactor(MaxVF); 7783 7784 // Check if it is profitable to vectorize with runtime checks. 
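// Example of the gating below (thresholds are flag/target dependent): if the
// chosen VF would need, say, 40 runtime pointer-overlap checks and that
// exceeds RuntimeMemoryCheckThreshold, vectorization is abandoned unless
// reordering was explicitly allowed via hints; exceeding the (separate)
// pragma threshold disables it unconditionally.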
7785 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7786 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7787 bool PragmaThresholdReached = 7788 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7789 bool ThresholdReached = 7790 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7791 if ((ThresholdReached && !Hints.allowReordering()) || 7792 PragmaThresholdReached) { 7793 ORE->emit([&]() { 7794 return OptimizationRemarkAnalysisAliasing( 7795 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7796 OrigLoop->getHeader()) 7797 << "loop not vectorized: cannot prove it is safe to reorder " 7798 "memory operations"; 7799 }); 7800 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7801 Hints.emitRemarkWithHints(); 7802 return VectorizationFactor::Disabled(); 7803 } 7804 } 7805 return SelectedVF; 7806 } 7807 7808 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7809 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7810 << '\n'); 7811 BestVF = VF; 7812 BestUF = UF; 7813 7814 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7815 return !Plan->hasVF(VF); 7816 }); 7817 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7818 } 7819 7820 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7821 DominatorTree *DT) { 7822 // Perform the actual loop transformation. 7823 7824 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7825 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7826 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7827 7828 VPTransformState State{ 7829 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 7830 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7831 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7832 State.CanonicalIV = ILV.Induction; 7833 7834 ILV.printDebugTracesAtStart(); 7835 7836 //===------------------------------------------------===// 7837 // 7838 // Notice: any optimization or new instruction that go 7839 // into the code below should also be implemented in 7840 // the cost-model. 7841 // 7842 //===------------------------------------------------===// 7843 7844 // 2. Copy and widen instructions from the old loop into the new loop. 7845 VPlans.front()->execute(&State); 7846 7847 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7848 // predication, updating analyses. 
7849 ILV.fixVectorizedLoop(State);
7850
7851 ILV.printDebugTracesAtEnd();
7852 }
7853
7854 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7855 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7856 for (const auto &Plan : VPlans)
7857 if (PrintVPlansInDotFormat)
7858 Plan->printDOT(O);
7859 else
7860 Plan->print(O);
7861 }
7862 #endif
7863
7864 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7865 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7866
7867 // We create new control-flow for the vectorized loop, so the original exit
7868 // conditions will be dead after vectorization if they are only used by the
7869 // terminator.
7870 SmallVector<BasicBlock*> ExitingBlocks;
7871 OrigLoop->getExitingBlocks(ExitingBlocks);
7872 for (auto *BB : ExitingBlocks) {
7873 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7874 if (!Cmp || !Cmp->hasOneUse())
7875 continue;
7876
7877 // TODO: we should introduce a getUniqueExitingBlocks on Loop
7878 if (!DeadInstructions.insert(Cmp).second)
7879 continue;
7880
7881 // An operand of the icmp is often a dead trunc, used by IndUpdate.
7882 // TODO: can recurse through operands in general
7883 for (Value *Op : Cmp->operands()) {
7884 if (isa<TruncInst>(Op) && Op->hasOneUse())
7885 DeadInstructions.insert(cast<Instruction>(Op));
7886 }
7887 }
7888
7889 // We create new "steps" for induction variable updates to which the original
7890 // induction variables map. An original update instruction will be dead if
7891 // all its users except the induction variable are dead.
7892 auto *Latch = OrigLoop->getLoopLatch();
7893 for (auto &Induction : Legal->getInductionVars()) {
7894 PHINode *Ind = Induction.first;
7895 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7896
7897 // If the tail is to be folded by masking, the primary induction variable,
7898 // if it exists, isn't dead: it will be used for masking. Don't kill it.
7899 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7900 continue;
7901
7902 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7903 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7904 }))
7905 DeadInstructions.insert(IndUpdate);
7906
7907 // We record as "Dead" also the type-casting instructions we had identified
7908 // during induction analysis. We don't need any handling for them in the
7909 // vectorized loop because we have proven that, under a proper runtime
7910 // test guarding the vectorized loop, the value of the phi, and the casted
7911 // value of the phi, are the same. The last instruction in this casting chain
7912 // will get its scalar/vector/widened def from the scalar/vector/widened def
7913 // of the respective phi node. Any other casts in the induction def-use chain
7914 // have no other uses outside the phi update chain, and will be ignored.
7915 InductionDescriptor &IndDes = Induction.second;
7916 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7917 DeadInstructions.insert(Casts.begin(), Casts.end());
7918 }
7919 }
7920
7921 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7922
7923 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7924
7925 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7926 Instruction::BinaryOps BinOp) {
7927 // When unrolling and the VF is 1, we only need to add a simple scalar.
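// Worked example (hypothetical values): for an integer IV %iv with step %s
// and StartIdx = 2 (the third unrolled copy), the code below emits an add of
// %iv and (2 * %s), named "induction"; the multiply folds to a constant when
// the step is constant, so each unrolled copy simply advances the scalar IV
// by StartIdx * Step rather than building a vector step.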
7928 Type *Ty = Val->getType(); 7929 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7930 7931 if (Ty->isFloatingPointTy()) { 7932 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7933 7934 // Floating-point operations inherit FMF via the builder's flags. 7935 Value *MulOp = Builder.CreateFMul(C, Step); 7936 return Builder.CreateBinOp(BinOp, Val, MulOp); 7937 } 7938 Constant *C = ConstantInt::get(Ty, StartIdx); 7939 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7940 } 7941 7942 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7943 SmallVector<Metadata *, 4> MDs; 7944 // Reserve first location for self reference to the LoopID metadata node. 7945 MDs.push_back(nullptr); 7946 bool IsUnrollMetadata = false; 7947 MDNode *LoopID = L->getLoopID(); 7948 if (LoopID) { 7949 // First find existing loop unrolling disable metadata. 7950 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7951 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7952 if (MD) { 7953 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7954 IsUnrollMetadata = 7955 S && S->getString().startswith("llvm.loop.unroll.disable"); 7956 } 7957 MDs.push_back(LoopID->getOperand(i)); 7958 } 7959 } 7960 7961 if (!IsUnrollMetadata) { 7962 // Add runtime unroll disable metadata. 7963 LLVMContext &Context = L->getHeader()->getContext(); 7964 SmallVector<Metadata *, 1> DisableOperands; 7965 DisableOperands.push_back( 7966 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7967 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7968 MDs.push_back(DisableNode); 7969 MDNode *NewLoopID = MDNode::get(Context, MDs); 7970 // Set operand 0 to refer to the loop id itself. 7971 NewLoopID->replaceOperandWith(0, NewLoopID); 7972 L->setLoopID(NewLoopID); 7973 } 7974 } 7975 7976 //===--------------------------------------------------------------------===// 7977 // EpilogueVectorizerMainLoop 7978 //===--------------------------------------------------------------------===// 7979 7980 /// This function is partially responsible for generating the control flow 7981 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7982 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7983 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7984 Loop *Lp = createVectorLoopSkeleton(""); 7985 7986 // Generate the code to check the minimum iteration count of the vector 7987 // epilogue (see below). 7988 EPI.EpilogueIterationCountCheck = 7989 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 7990 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7991 7992 // Generate the code to check any assumptions that we've made for SCEV 7993 // expressions. 7994 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 7995 7996 // Generate the code that checks at runtime if arrays overlap. We put the 7997 // checks into a separate block to make the more common case of few elements 7998 // faster. 7999 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8000 8001 // Generate the iteration count check for the main loop, *after* the check 8002 // for the epilogue loop, so that the path-length is shorter for the case 8003 // that goes directly through the vector epilogue. The longer-path length for 8004 // the main loop is compensated for, by the gain from vectorizing the larger 8005 // trip count. Note: the branch will get updated later on when we vectorize 8006 // the epilogue. 
8007 EPI.MainLoopIterationCountCheck = 8008 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8009 8010 // Generate the induction variable. 8011 OldInduction = Legal->getPrimaryInduction(); 8012 Type *IdxTy = Legal->getWidestInductionType(); 8013 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8014 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8015 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8016 EPI.VectorTripCount = CountRoundDown; 8017 Induction = 8018 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8019 getDebugLocFromInstOrOperands(OldInduction)); 8020 8021 // Skip induction resume value creation here because they will be created in 8022 // the second pass. If we created them here, they wouldn't be used anyway, 8023 // because the vplan in the second pass still contains the inductions from the 8024 // original loop. 8025 8026 return completeLoopSkeleton(Lp, OrigLoopID); 8027 } 8028 8029 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8030 LLVM_DEBUG({ 8031 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8032 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8033 << ", Main Loop UF:" << EPI.MainLoopUF 8034 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8035 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8036 }); 8037 } 8038 8039 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8040 DEBUG_WITH_TYPE(VerboseDebug, { 8041 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8042 }); 8043 } 8044 8045 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8046 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8047 assert(L && "Expected valid Loop."); 8048 assert(Bypass && "Expected valid bypass basic block."); 8049 unsigned VFactor = 8050 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8051 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8052 Value *Count = getOrCreateTripCount(L); 8053 // Reuse existing vector loop preheader for TC checks. 8054 // Note that new preheader block is generated for vector loop. 8055 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8056 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8057 8058 // Generate code to check if the loop's trip count is less than VF * UF of the 8059 // main vector loop. 8060 auto P = 8061 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8062 8063 Value *CheckMinIters = Builder.CreateICmp( 8064 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8065 "min.iters.check"); 8066 8067 if (!ForEpilogue) 8068 TCCheckBlock->setName("vector.main.loop.iter.check"); 8069 8070 // Create new preheader for vector loop. 8071 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8072 DT, LI, nullptr, "vector.ph"); 8073 8074 if (ForEpilogue) { 8075 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8076 DT->getNode(Bypass)->getIDom()) && 8077 "TC check is expected to dominate Bypass"); 8078 8079 // Update dominator for Bypass & LoopExit. 8080 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8081 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8082 8083 LoopBypassBlocks.push_back(TCCheckBlock); 8084 8085 // Save the trip count so we don't have to regenerate it in the 8086 // vec.epilog.iter.check. This is safe to do because the trip count 8087 // generated here dominates the vector epilog iter check. 
8088 EPI.TripCount = Count; 8089 } 8090 8091 ReplaceInstWithInst( 8092 TCCheckBlock->getTerminator(), 8093 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8094 8095 return TCCheckBlock; 8096 } 8097 8098 //===--------------------------------------------------------------------===// 8099 // EpilogueVectorizerEpilogueLoop 8100 //===--------------------------------------------------------------------===// 8101 8102 /// This function is partially responsible for generating the control flow 8103 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8104 BasicBlock * 8105 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8106 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8107 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8108 8109 // Now, compare the remaining count and if there aren't enough iterations to 8110 // execute the vectorized epilogue skip to the scalar part. 8111 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8112 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8113 LoopVectorPreHeader = 8114 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8115 LI, nullptr, "vec.epilog.ph"); 8116 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8117 VecEpilogueIterationCountCheck); 8118 8119 // Adjust the control flow taking the state info from the main loop 8120 // vectorization into account. 8121 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8122 "expected this to be saved from the previous pass."); 8123 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8124 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8125 8126 DT->changeImmediateDominator(LoopVectorPreHeader, 8127 EPI.MainLoopIterationCountCheck); 8128 8129 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8130 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8131 8132 if (EPI.SCEVSafetyCheck) 8133 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8134 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8135 if (EPI.MemSafetyCheck) 8136 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8137 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8138 8139 DT->changeImmediateDominator( 8140 VecEpilogueIterationCountCheck, 8141 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8142 8143 DT->changeImmediateDominator(LoopScalarPreHeader, 8144 EPI.EpilogueIterationCountCheck); 8145 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8146 8147 // Keep track of bypass blocks, as they feed start values to the induction 8148 // phis in the scalar loop preheader. 8149 if (EPI.SCEVSafetyCheck) 8150 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8151 if (EPI.MemSafetyCheck) 8152 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8153 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8154 8155 // Generate a resume induction for the vector epilogue and put it in the 8156 // vector epilogue preheader 8157 Type *IdxTy = Legal->getWidestInductionType(); 8158 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8159 LoopVectorPreHeader->getFirstNonPHI()); 8160 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8161 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8162 EPI.MainLoopIterationCountCheck); 8163 8164 // Generate the induction variable. 
8165 OldInduction = Legal->getPrimaryInduction();
8166 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8167 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8168 Value *StartIdx = EPResumeVal;
8169 Induction =
8170 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8171 getDebugLocFromInstOrOperands(OldInduction));
8172
8173 // Generate induction resume values. These variables save the new starting
8174 // indexes for the scalar loop. They are used to test if there are any tail
8175 // iterations left once the vector loop has completed.
8176 // Note that when the vectorized epilogue is skipped due to the iteration
8177 // count check, the resume value for the induction variable comes from
8178 // the trip count of the main vector loop, hence passing the AdditionalBypass
8179 // argument.
8180 createInductionResumeValues(Lp, CountRoundDown,
8181 {VecEpilogueIterationCountCheck,
8182 EPI.VectorTripCount} /* AdditionalBypass */);
8183
8184 AddRuntimeUnrollDisableMetaData(Lp);
8185 return completeLoopSkeleton(Lp, OrigLoopID);
8186 }
8187
8188 BasicBlock *
8189 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8190 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8191
8192 assert(EPI.TripCount &&
8193 "Expected trip count to have been saved in the first pass.");
8194 assert(
8195 (!isa<Instruction>(EPI.TripCount) ||
8196 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8197 "saved trip count does not dominate insertion point.");
8198 Value *TC = EPI.TripCount;
8199 IRBuilder<> Builder(Insert->getTerminator());
8200 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8201
8202 // Generate code to check if the loop's trip count is less than VF * UF of the
8203 // vector epilogue loop.
8204 auto P =
8205 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8206 8207 Value *CheckMinIters = Builder.CreateICmp( 8208 P, Count, 8209 ConstantInt::get(Count->getType(), 8210 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8211 "min.epilog.iters.check"); 8212 8213 ReplaceInstWithInst( 8214 Insert->getTerminator(), 8215 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8216 8217 LoopBypassBlocks.push_back(Insert); 8218 return Insert; 8219 } 8220 8221 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8222 LLVM_DEBUG({ 8223 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8224 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8225 << ", Main Loop UF:" << EPI.MainLoopUF 8226 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8227 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8228 }); 8229 } 8230 8231 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8232 DEBUG_WITH_TYPE(VerboseDebug, { 8233 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8234 }); 8235 } 8236 8237 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8238 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8239 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8240 bool PredicateAtRangeStart = Predicate(Range.Start); 8241 8242 for (ElementCount TmpVF = Range.Start * 2; 8243 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8244 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8245 Range.End = TmpVF; 8246 break; 8247 } 8248 8249 return PredicateAtRangeStart; 8250 } 8251 8252 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8253 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8254 /// of VF's starting at a given VF and extending it as much as possible. Each 8255 /// vectorization decision can potentially shorten this sub-range during 8256 /// buildVPlan(). 8257 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8258 ElementCount MaxVF) { 8259 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8260 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8261 VFRange SubRange = {VF, MaxVFPlusOne}; 8262 VPlans.push_back(buildVPlan(SubRange)); 8263 VF = SubRange.End; 8264 } 8265 } 8266 8267 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8268 VPlanPtr &Plan) { 8269 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8270 8271 // Look for cached value. 8272 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8273 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8274 if (ECEntryIt != EdgeMaskCache.end()) 8275 return ECEntryIt->second; 8276 8277 VPValue *SrcMask = createBlockInMask(Src, Plan); 8278 8279 // The terminator has to be a branch inst! 8280 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8281 assert(BI && "Unexpected terminator found"); 8282 8283 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8284 return EdgeMaskCache[Edge] = SrcMask; 8285 8286 // If source is an exiting block, we know the exit edge is dynamically dead 8287 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8288 // adding uses of an otherwise potentially dead instruction. 
8289 if (OrigLoop->isLoopExiting(Src)) 8290 return EdgeMaskCache[Edge] = SrcMask; 8291 8292 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8293 assert(EdgeMask && "No Edge Mask found for condition"); 8294 8295 if (BI->getSuccessor(0) != Dst) 8296 EdgeMask = Builder.createNot(EdgeMask); 8297 8298 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8299 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8300 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8301 // The select version does not introduce new UB if SrcMask is false and 8302 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8303 VPValue *False = Plan->getOrAddVPValue( 8304 ConstantInt::getFalse(BI->getCondition()->getType())); 8305 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8306 } 8307 8308 return EdgeMaskCache[Edge] = EdgeMask; 8309 } 8310 8311 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8312 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8313 8314 // Look for cached value. 8315 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8316 if (BCEntryIt != BlockMaskCache.end()) 8317 return BCEntryIt->second; 8318 8319 // All-one mask is modelled as no-mask following the convention for masked 8320 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8321 VPValue *BlockMask = nullptr; 8322 8323 if (OrigLoop->getHeader() == BB) { 8324 if (!CM.blockNeedsPredication(BB)) 8325 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8326 8327 // Create the block in mask as the first non-phi instruction in the block. 8328 VPBuilder::InsertPointGuard Guard(Builder); 8329 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8330 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8331 8332 // Introduce the early-exit compare IV <= BTC to form header block mask. 8333 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8334 // Start by constructing the desired canonical IV. 8335 VPValue *IV = nullptr; 8336 if (Legal->getPrimaryInduction()) 8337 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8338 else { 8339 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8340 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8341 IV = IVRecipe->getVPValue(); 8342 } 8343 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8344 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8345 8346 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8347 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8348 // as a second argument, we only pass the IV here and extract the 8349 // tripcount from the transform state where codegen of the VP instructions 8350 // happen. 8351 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8352 } else { 8353 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8354 } 8355 return BlockMaskCache[BB] = BlockMask; 8356 } 8357 8358 // This is the block mask. We OR all incoming edges. 8359 for (auto *Predecessor : predecessors(BB)) { 8360 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8361 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8362 return BlockMaskCache[BB] = EdgeMask; 8363 8364 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8365 BlockMask = EdgeMask; 8366 continue; 8367 } 8368 8369 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8370 } 8371 8372 return BlockMaskCache[BB] = BlockMask; 8373 } 8374 8375 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8376 VPlanPtr &Plan) { 8377 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8378 "Must be called with either a load or store"); 8379 8380 auto willWiden = [&](ElementCount VF) -> bool { 8381 if (VF.isScalar()) 8382 return false; 8383 LoopVectorizationCostModel::InstWidening Decision = 8384 CM.getWideningDecision(I, VF); 8385 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8386 "CM decision should be taken at this point."); 8387 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8388 return true; 8389 if (CM.isScalarAfterVectorization(I, VF) || 8390 CM.isProfitableToScalarize(I, VF)) 8391 return false; 8392 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8393 }; 8394 8395 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8396 return nullptr; 8397 8398 VPValue *Mask = nullptr; 8399 if (Legal->isMaskRequired(I)) 8400 Mask = createBlockInMask(I->getParent(), Plan); 8401 8402 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8403 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8404 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8405 8406 StoreInst *Store = cast<StoreInst>(I); 8407 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8408 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8409 } 8410 8411 VPWidenIntOrFpInductionRecipe * 8412 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8413 // Check if this is an integer or fp induction. If so, build the recipe that 8414 // produces its scalar and vector values. 8415 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8416 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8417 II.getKind() == InductionDescriptor::IK_FpInduction) { 8418 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8419 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8420 return new VPWidenIntOrFpInductionRecipe( 8421 Phi, Start, Casts.empty() ? nullptr : Casts.front()); 8422 } 8423 8424 return nullptr; 8425 } 8426 8427 VPWidenIntOrFpInductionRecipe * 8428 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8429 VPlan &Plan) const { 8430 // Optimize the special case where the source is a constant integer 8431 // induction variable. Notice that we can only optimize the 'trunc' case 8432 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8433 // (c) other casts depend on pointer size. 8434 8435 // Determine whether \p K is a truncation based on an induction variable that 8436 // can be optimized. 
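// (Illustrative example, hypothetical names: given a primary i64 induction
// %iv, a cast such as "%t = trunc i64 %iv to i32" can be widened directly as
// an i32 induction, avoiding a wide i64 induction followed by a per-element
// truncate.)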
8437 auto isOptimizableIVTruncate = 8438 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8439 return [=](ElementCount VF) -> bool { 8440 return CM.isOptimizableIVTruncate(K, VF); 8441 }; 8442 }; 8443 8444 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8445 isOptimizableIVTruncate(I), Range)) { 8446 8447 InductionDescriptor II = 8448 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8449 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8450 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8451 Start, nullptr, I); 8452 } 8453 return nullptr; 8454 } 8455 8456 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8457 // If all incoming values are equal, the incoming VPValue can be used directly 8458 // instead of creating a new VPBlendRecipe. 8459 Value *FirstIncoming = Phi->getIncomingValue(0); 8460 if (all_of(Phi->incoming_values(), [FirstIncoming](const Value *Inc) { 8461 return FirstIncoming == Inc; 8462 })) { 8463 return Plan->getOrAddVPValue(Phi->getIncomingValue(0)); 8464 } 8465 8466 // We know that all PHIs in non-header blocks are converted into selects, so 8467 // we don't have to worry about the insertion order and we can just use the 8468 // builder. At this point we generate the predication tree. There may be 8469 // duplications since this is a simple recursive scan, but future 8470 // optimizations will clean it up. 8471 SmallVector<VPValue *, 2> Operands; 8472 unsigned NumIncoming = Phi->getNumIncomingValues(); 8473 8474 for (unsigned In = 0; In < NumIncoming; In++) { 8475 VPValue *EdgeMask = 8476 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8477 assert((EdgeMask || NumIncoming == 1) && 8478 "Multiple predecessors with one having a full mask"); 8479 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8480 if (EdgeMask) 8481 Operands.push_back(EdgeMask); 8482 } 8483 return toVPRecipeResult(new VPBlendRecipe(Phi, Operands)); 8484 } 8485 8486 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8487 VPlan &Plan) const { 8488 8489 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8490 [this, CI](ElementCount VF) { 8491 return CM.isScalarWithPredication(CI, VF); 8492 }, 8493 Range); 8494 8495 if (IsPredicated) 8496 return nullptr; 8497 8498 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8499 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8500 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8501 ID == Intrinsic::pseudoprobe || 8502 ID == Intrinsic::experimental_noalias_scope_decl)) 8503 return nullptr; 8504 8505 auto willWiden = [&](ElementCount VF) -> bool { 8506 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8507 // The following case may be scalarized depending on the VF. 8508 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8509 // version of the instruction. 8510 // Is it beneficial to perform intrinsic call compared to lib call? 8511 bool NeedToScalarize = false; 8512 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8513 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8514 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8515 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8516 "Cannot have invalid costs while widening"); 8517 return UseVectorIntrinsic || !NeedToScalarize; 8518 }; 8519 8520 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8521 return nullptr; 8522 8523 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8524 } 8525 8526 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8527 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8528 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8529 // Instruction should be widened, unless it is scalar after vectorization, 8530 // scalarization is profitable or it is predicated. 8531 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8532 return CM.isScalarAfterVectorization(I, VF) || 8533 CM.isProfitableToScalarize(I, VF) || 8534 CM.isScalarWithPredication(I, VF); 8535 }; 8536 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8537 Range); 8538 } 8539 8540 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8541 auto IsVectorizableOpcode = [](unsigned Opcode) { 8542 switch (Opcode) { 8543 case Instruction::Add: 8544 case Instruction::And: 8545 case Instruction::AShr: 8546 case Instruction::BitCast: 8547 case Instruction::FAdd: 8548 case Instruction::FCmp: 8549 case Instruction::FDiv: 8550 case Instruction::FMul: 8551 case Instruction::FNeg: 8552 case Instruction::FPExt: 8553 case Instruction::FPToSI: 8554 case Instruction::FPToUI: 8555 case Instruction::FPTrunc: 8556 case Instruction::FRem: 8557 case Instruction::FSub: 8558 case Instruction::ICmp: 8559 case Instruction::IntToPtr: 8560 case Instruction::LShr: 8561 case Instruction::Mul: 8562 case Instruction::Or: 8563 case Instruction::PtrToInt: 8564 case Instruction::SDiv: 8565 case Instruction::Select: 8566 case Instruction::SExt: 8567 case Instruction::Shl: 8568 case Instruction::SIToFP: 8569 case Instruction::SRem: 8570 case Instruction::Sub: 8571 case Instruction::Trunc: 8572 case Instruction::UDiv: 8573 case Instruction::UIToFP: 8574 case Instruction::URem: 8575 case Instruction::Xor: 8576 case Instruction::ZExt: 8577 return true; 8578 } 8579 return false; 8580 }; 8581 8582 if (!IsVectorizableOpcode(I->getOpcode())) 8583 return nullptr; 8584 8585 // Success: widen this instruction. 8586 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8587 } 8588 8589 VPBasicBlock *VPRecipeBuilder::handleReplication( 8590 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8591 VPlanPtr &Plan) { 8592 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8593 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8594 Range); 8595 8596 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8597 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8598 Range); 8599 8600 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8601 IsUniform, IsPredicated); 8602 setRecipe(I, Recipe); 8603 Plan->addVPValue(I, Recipe); 8604 8605 // Find if I uses a predicated instruction. If so, it will use its scalar 8606 // value. Avoid hoisting the insert-element which packs the scalar value into 8607 // a vector value, as that happens iff all users use the vector value. 
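// (Concretely, this is what the setAlsoPack(false) call below does: the
// predicated replicate recipe feeding such an operand keeps producing scalar
// values only and does not pack them into a vector.)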
8608 for (VPValue *Op : Recipe->operands()) { 8609 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8610 if (!PredR) 8611 continue; 8612 auto *RepR = 8613 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8614 assert(RepR->isPredicated() && 8615 "expected Replicate recipe to be predicated"); 8616 RepR->setAlsoPack(false); 8617 } 8618 8619 // Finalize the recipe for Instr, first if it is not predicated. 8620 if (!IsPredicated) { 8621 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8622 VPBB->appendRecipe(Recipe); 8623 return VPBB; 8624 } 8625 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8626 assert(VPBB->getSuccessors().empty() && 8627 "VPBB has successors when handling predicated replication."); 8628 // Record predicated instructions for above packing optimizations. 8629 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8630 VPBlockUtils::insertBlockAfter(Region, VPBB); 8631 auto *RegSucc = new VPBasicBlock(); 8632 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8633 return RegSucc; 8634 } 8635 8636 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8637 VPRecipeBase *PredRecipe, 8638 VPlanPtr &Plan) { 8639 // Instructions marked for predication are replicated and placed under an 8640 // if-then construct to prevent side-effects. 8641 8642 // Generate recipes to compute the block mask for this region. 8643 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8644 8645 // Build the triangular if-then region. 8646 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8647 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8648 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8649 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8650 auto *PHIRecipe = Instr->getType()->isVoidTy() 8651 ? nullptr 8652 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8653 if (PHIRecipe) { 8654 Plan->removeVPValueFor(Instr); 8655 Plan->addVPValue(Instr, PHIRecipe); 8656 } 8657 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8658 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8659 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8660 8661 // Note: first set Entry as region entry and then connect successors starting 8662 // from it in order, to propagate the "parent" of each VPBasicBlock. 8663 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8664 VPBlockUtils::connectBlocks(Pred, Exit); 8665 8666 return Region; 8667 } 8668 8669 VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8670 VFRange &Range, 8671 VPlanPtr &Plan) { 8672 // First, check for specific widening recipes that deal with calls, memory 8673 // operations, inductions and Phi nodes. 
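// (The order below is roughly: calls, then loads/stores, then phis (blends
// for non-header phis, induction/reduction recipes for header phis), then
// truncates of inductions; anything else is either widened via tryToWiden()
// or, when no recipe is returned, left for the caller to replicate.)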
8674 if (auto *CI = dyn_cast<CallInst>(Instr)) 8675 return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan)); 8676 8677 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8678 return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan)); 8679 8680 VPRecipeBase *Recipe; 8681 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8682 if (Phi->getParent() != OrigLoop->getHeader()) 8683 return tryToBlend(Phi, Plan); 8684 if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan))) 8685 return toVPRecipeResult(Recipe); 8686 8687 if (Legal->isReductionVariable(Phi)) { 8688 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8689 VPValue *StartV = 8690 Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue()); 8691 return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV)); 8692 } 8693 8694 return toVPRecipeResult(new VPWidenPHIRecipe(Phi)); 8695 } 8696 8697 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( 8698 cast<TruncInst>(Instr), Range, *Plan))) 8699 return toVPRecipeResult(Recipe); 8700 8701 if (!shouldWiden(Instr, Range)) 8702 return nullptr; 8703 8704 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8705 return toVPRecipeResult(new VPWidenGEPRecipe( 8706 GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop)); 8707 8708 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8709 bool InvariantCond = 8710 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8711 return toVPRecipeResult(new VPWidenSelectRecipe( 8712 *SI, Plan->mapToVPValues(SI->operands()), InvariantCond)); 8713 } 8714 8715 return toVPRecipeResult(tryToWiden(Instr, *Plan)); 8716 } 8717 8718 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8719 ElementCount MaxVF) { 8720 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8721 8722 // Collect instructions from the original loop that will become trivially dead 8723 // in the vectorized loop. We don't need to vectorize these instructions. For 8724 // example, original induction update instructions can become dead because we 8725 // separately emit induction "steps" when generating code for the new loop. 8726 // Similarly, we create a new latch condition when setting up the structure 8727 // of the new loop, so the old one can become dead. 8728 SmallPtrSet<Instruction *, 4> DeadInstructions; 8729 collectTriviallyDeadInstructions(DeadInstructions); 8730 8731 // Add assume instructions we need to drop to DeadInstructions, to prevent 8732 // them from being added to the VPlan. 8733 // TODO: We only need to drop assumes in blocks that get flattened. If the 8734 // control flow is preserved, we should keep them. 8735 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8736 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8737 8738 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8739 // Dead instructions do not need sinking. Remove them from SinkAfter.
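// (Otherwise a recipe would be recorded below for an instruction that is
// skipped during VPlan construction, and the later getRecipe() lookup when
// applying the sink-after constraints would not find one.)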
8740 for (Instruction *I : DeadInstructions) 8741 SinkAfter.erase(I); 8742 8743 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8744 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8745 VFRange SubRange = {VF, MaxVFPlusOne}; 8746 VPlans.push_back( 8747 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8748 VF = SubRange.End; 8749 } 8750 } 8751 8752 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8753 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8754 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 8755 8756 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8757 8758 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8759 8760 // --------------------------------------------------------------------------- 8761 // Pre-construction: record ingredients whose recipes we'll need to further 8762 // process after constructing the initial VPlan. 8763 // --------------------------------------------------------------------------- 8764 8765 // Mark instructions we'll need to sink later and their targets as 8766 // ingredients whose recipe we'll need to record. 8767 for (auto &Entry : SinkAfter) { 8768 RecipeBuilder.recordRecipeOf(Entry.first); 8769 RecipeBuilder.recordRecipeOf(Entry.second); 8770 } 8771 for (auto &Reduction : CM.getInLoopReductionChains()) { 8772 PHINode *Phi = Reduction.first; 8773 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 8774 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 8775 8776 RecipeBuilder.recordRecipeOf(Phi); 8777 for (auto &R : ReductionOperations) { 8778 RecipeBuilder.recordRecipeOf(R); 8779 // For min/max reductions, where we have a pair of icmp/select, we also 8780 // need to record the ICmp recipe, so it can be removed later. 8781 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 8782 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 8783 } 8784 } 8785 8786 // For each interleave group which is relevant for this (possibly trimmed) 8787 // Range, add it to the set of groups to be later applied to the VPlan and add 8788 // placeholders for its members' Recipes which we'll be replacing with a 8789 // single VPInterleaveRecipe. 8790 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8791 auto applyIG = [IG, this](ElementCount VF) -> bool { 8792 return (VF.isVector() && // Query is illegal for VF == 1 8793 CM.getWideningDecision(IG->getInsertPos(), VF) == 8794 LoopVectorizationCostModel::CM_Interleave); 8795 }; 8796 if (!getDecisionAndClampRange(applyIG, Range)) 8797 continue; 8798 InterleaveGroups.insert(IG); 8799 for (unsigned i = 0; i < IG->getFactor(); i++) 8800 if (Instruction *Member = IG->getMember(i)) 8801 RecipeBuilder.recordRecipeOf(Member); 8802 } 8803 8804 // --------------------------------------------------------------------------- 8805 // Build initial VPlan: Scan the body of the loop in a topological order to 8806 // visit each basic block after having visited its predecessor basic blocks. 8807 // --------------------------------------------------------------------------- 8808 8809 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 8810 auto Plan = std::make_unique<VPlan>(); 8811 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 8812 Plan->setEntry(VPBB); 8813 8814 // Scan the body of the loop in a topological order to visit each basic block 8815 // after having visited its predecessor basic blocks.
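// (LoopBlocksDFS below provides this as a reverse post-order over the loop
// body, so every block is reached after its non-backedge predecessors.)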
8816 LoopBlocksDFS DFS(OrigLoop); 8817 DFS.perform(LI); 8818 8819 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8820 // Relevant instructions from basic block BB will be grouped into VPRecipe 8821 // ingredients and fill a new VPBasicBlock. 8822 unsigned VPBBsForBB = 0; 8823 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 8824 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 8825 VPBB = FirstVPBBForBB; 8826 Builder.setInsertPoint(VPBB); 8827 8828 // Introduce each ingredient into VPlan. 8829 // TODO: Model and preserve debug intrinsics in VPlan. 8830 for (Instruction &I : BB->instructionsWithoutDebug()) { 8831 Instruction *Instr = &I; 8832 8833 // First filter out irrelevant instructions, to ensure no recipes are 8834 // built for them. 8835 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 8836 continue; 8837 8838 if (auto RecipeOrValue = 8839 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) { 8840 // If Instr can be simplified to an existing VPValue, use it. 8841 if (RecipeOrValue.is<VPValue *>()) { 8842 Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>()); 8843 continue; 8844 } 8845 // Otherwise, add the new recipe. 8846 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 8847 for (auto *Def : Recipe->definedValues()) { 8848 auto *UV = Def->getUnderlyingValue(); 8849 Plan->addVPValue(UV, Def); 8850 } 8851 8852 RecipeBuilder.setRecipe(Instr, Recipe); 8853 VPBB->appendRecipe(Recipe); 8854 continue; 8855 } 8856 8857 // Otherwise, if all widening options failed, Instruction is to be 8858 // replicated. This may create a successor for VPBB. 8859 VPBasicBlock *NextVPBB = 8860 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 8861 if (NextVPBB != VPBB) { 8862 VPBB = NextVPBB; 8863 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 8864 : ""); 8865 } 8866 } 8867 } 8868 8869 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 8870 // may also be empty, such as the last one VPBB, reflecting original 8871 // basic-blocks with no recipes. 8872 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 8873 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 8874 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 8875 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 8876 delete PreEntry; 8877 8878 // --------------------------------------------------------------------------- 8879 // Transform initial VPlan: Apply previously taken decisions, in order, to 8880 // bring the VPlan to its final state. 8881 // --------------------------------------------------------------------------- 8882 8883 // Apply Sink-After legal constraints. 8884 for (auto &Entry : SinkAfter) { 8885 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 8886 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 8887 // If the target is in a replication region, make sure to move Sink to the 8888 // block after it, not into the replication region itself.
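// (Placing it inside the region would subject the sunk recipe to the region's
// per-lane predication, which is not what sink-after intends.)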
8889 if (auto *Region = 8890 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8891 if (Region->isReplicator()) { 8892 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8893 VPBasicBlock *NextBlock = 8894 cast<VPBasicBlock>(Region->getSuccessors().front()); 8895 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8896 continue; 8897 } 8898 } 8899 Sink->moveAfter(Target); 8900 } 8901 8902 // Interleave memory: for each Interleave Group we marked earlier as relevant 8903 // for this VPlan, replace the Recipes widening its memory instructions with a 8904 // single VPInterleaveRecipe at its insertion point. 8905 for (auto IG : InterleaveGroups) { 8906 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8907 RecipeBuilder.getRecipe(IG->getInsertPos())); 8908 SmallVector<VPValue *, 4> StoredValues; 8909 for (unsigned i = 0; i < IG->getFactor(); ++i) 8910 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8911 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8912 8913 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8914 Recipe->getMask()); 8915 VPIG->insertBefore(Recipe); 8916 unsigned J = 0; 8917 for (unsigned i = 0; i < IG->getFactor(); ++i) 8918 if (Instruction *Member = IG->getMember(i)) { 8919 if (!Member->getType()->isVoidTy()) { 8920 VPValue *OriginalV = Plan->getVPValue(Member); 8921 Plan->removeVPValueFor(Member); 8922 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8923 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8924 J++; 8925 } 8926 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8927 } 8928 } 8929 8930 // Adjust the recipes for any inloop reductions. 8931 if (Range.Start.isVector()) 8932 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8933 8934 // Finally, if tail is folded by masking, introduce selects between the phi 8935 // and the live-out instruction of each reduction, at the end of the latch. 8936 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8937 Builder.setInsertPoint(VPBB); 8938 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8939 for (auto &Reduction : Legal->getReductionVars()) { 8940 if (CM.isInLoopReduction(Reduction.first)) 8941 continue; 8942 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8943 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8944 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8945 } 8946 } 8947 8948 std::string PlanName; 8949 raw_string_ostream RSO(PlanName); 8950 ElementCount VF = Range.Start; 8951 Plan->addVF(VF); 8952 RSO << "Initial VPlan for VF={" << VF; 8953 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8954 Plan->addVF(VF); 8955 RSO << "," << VF; 8956 } 8957 RSO << "},UF>=1"; 8958 RSO.flush(); 8959 Plan->setName(PlanName); 8960 8961 return Plan; 8962 } 8963 8964 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8965 // Outer loop handling: They may require CFG and instruction level 8966 // transformations before even evaluating whether vectorization is profitable. 8967 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8968 // the vectorization pipeline. 
8969 assert(!OrigLoop->isInnermost()); 8970 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8971 8972 // Create new empty VPlan 8973 auto Plan = std::make_unique<VPlan>(); 8974 8975 // Build hierarchical CFG 8976 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8977 HCFGBuilder.buildHierarchicalCFG(); 8978 8979 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 8980 VF *= 2) 8981 Plan->addVF(VF); 8982 8983 if (EnableVPlanPredication) { 8984 VPlanPredicator VPP(*Plan); 8985 VPP.predicate(); 8986 8987 // Avoid running transformation to recipes until masked code generation in 8988 // VPlan-native path is in place. 8989 return Plan; 8990 } 8991 8992 SmallPtrSet<Instruction *, 1> DeadInstructions; 8993 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 8994 Legal->getInductionVars(), 8995 DeadInstructions, *PSE.getSE()); 8996 return Plan; 8997 } 8998 8999 // Adjust the recipes for any inloop reductions. The chain of instructions 9000 // leading from the loop exit instr to the phi needs to be converted to 9001 // reductions, with one operand being vector and the other being the scalar 9002 // reduction chain. 9003 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 9004 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 9005 for (auto &Reduction : CM.getInLoopReductionChains()) { 9006 PHINode *Phi = Reduction.first; 9007 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9008 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9009 9010 // ReductionOperations are ordered top-down from the phi's use to the 9011 // LoopExitValue. We keep track of the previous item (the Chain) to tell 9012 // which of the two operands will remain scalar and which will be reduced. 9013 // For minmax the chain will be the select instructions. 9014 Instruction *Chain = Phi; 9015 for (Instruction *R : ReductionOperations) { 9016 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9017 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9018 9019 VPValue *ChainOp = Plan->getVPValue(Chain); 9020 unsigned FirstOpId; 9021 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9022 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9023 "Expected to replace a VPWidenSelectSC"); 9024 FirstOpId = 1; 9025 } else { 9026 assert(isa<VPWidenRecipe>(WidenRecipe) && 9027 "Expected to replace a VPWidenSC"); 9028 FirstOpId = 0; 9029 } 9030 unsigned VecOpId = 9031 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9032 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9033 9034 auto *CondOp = CM.foldTailByMasking() 9035 ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9036 : nullptr; 9037 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9038 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9039 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9040 Plan->removeVPValueFor(R); 9041 Plan->addVPValue(R, RedRecipe); 9042 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9043 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9044 WidenRecipe->eraseFromParent(); 9045 9046 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9047 VPRecipeBase *CompareRecipe = 9048 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9049 assert(isa<VPWidenRecipe>(CompareRecipe) && 9050 "Expected to replace a VPWidenSC"); 9051 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9052 "Expected no remaining users"); 9053 CompareRecipe->eraseFromParent(); 9054 } 9055 Chain = R; 9056 } 9057 } 9058 } 9059 9060 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9061 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9062 VPSlotTracker &SlotTracker) const { 9063 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9064 IG->getInsertPos()->printAsOperand(O, false); 9065 O << ", "; 9066 getAddr()->printAsOperand(O, SlotTracker); 9067 VPValue *Mask = getMask(); 9068 if (Mask) { 9069 O << ", "; 9070 Mask->printAsOperand(O, SlotTracker); 9071 } 9072 for (unsigned i = 0; i < IG->getFactor(); ++i) 9073 if (Instruction *I = IG->getMember(i)) 9074 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9075 } 9076 #endif 9077 9078 void VPWidenCallRecipe::execute(VPTransformState &State) { 9079 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9080 *this, State); 9081 } 9082 9083 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9084 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9085 this, *this, InvariantCond, State); 9086 } 9087 9088 void VPWidenRecipe::execute(VPTransformState &State) { 9089 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9090 } 9091 9092 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9093 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9094 *this, State.UF, State.VF, IsPtrLoopInvariant, 9095 IsIndexLoopInvariant, State); 9096 } 9097 9098 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9099 assert(!State.Instance && "Int or FP induction being replicated."); 9100 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9101 getTruncInst(), getVPValue(0), 9102 getCastValue(), State); 9103 } 9104 9105 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9106 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9107 getStartValue(), this, State); 9108 } 9109 9110 void VPBlendRecipe::execute(VPTransformState &State) { 9111 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9112 // We know that all PHIs in non-header blocks are converted into 9113 // selects, so we don't have to worry about the insertion order and we 9114 // can just use the builder. 9115 // At this point we generate the predication tree. There may be 9116 // duplications since this is a simple recursive scan, but future 9117 // optimizations will clean it up. 
9118 9119 unsigned NumIncoming = getNumIncomingValues(); 9120 9121 // Generate a sequence of selects of the form: 9122 // SELECT(Mask3, In3, 9123 // SELECT(Mask2, In2, 9124 // SELECT(Mask1, In1, 9125 // In0))) 9126 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9127 // are essentially undef are taken from In0. 9128 InnerLoopVectorizer::VectorParts Entry(State.UF); 9129 for (unsigned In = 0; In < NumIncoming; ++In) { 9130 for (unsigned Part = 0; Part < State.UF; ++Part) { 9131 // We might have single edge PHIs (blocks) - use an identity 9132 // 'select' for the first PHI operand. 9133 Value *In0 = State.get(getIncomingValue(In), Part); 9134 if (In == 0) 9135 Entry[Part] = In0; // Initialize with the first incoming value. 9136 else { 9137 // Select between the current value and the previous incoming edge 9138 // based on the incoming mask. 9139 Value *Cond = State.get(getMask(In), Part); 9140 Entry[Part] = 9141 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9142 } 9143 } 9144 } 9145 for (unsigned Part = 0; Part < State.UF; ++Part) 9146 State.set(this, Entry[Part], Part); 9147 } 9148 9149 void VPInterleaveRecipe::execute(VPTransformState &State) { 9150 assert(!State.Instance && "Interleave group being replicated."); 9151 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9152 getStoredValues(), getMask()); 9153 } 9154 9155 void VPReductionRecipe::execute(VPTransformState &State) { 9156 assert(!State.Instance && "Reduction being replicated."); 9157 for (unsigned Part = 0; Part < State.UF; ++Part) { 9158 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9159 Value *NewVecOp = State.get(getVecOp(), Part); 9160 if (VPValue *Cond = getCondOp()) { 9161 Value *NewCond = State.get(Cond, Part); 9162 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9163 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9164 Kind, VecTy->getElementType()); 9165 Constant *IdenVec = 9166 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9167 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9168 NewVecOp = Select; 9169 } 9170 Value *NewRed = 9171 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9172 Value *PrevInChain = State.get(getChainOp(), Part); 9173 Value *NextInChain; 9174 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9175 NextInChain = 9176 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9177 NewRed, PrevInChain); 9178 } else { 9179 NextInChain = State.Builder.CreateBinOp( 9180 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9181 PrevInChain); 9182 } 9183 State.set(this, NextInChain, Part); 9184 } 9185 } 9186 9187 void VPReplicateRecipe::execute(VPTransformState &State) { 9188 if (State.Instance) { // Generate a single instance. 9189 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9190 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9191 *State.Instance, IsPredicated, State); 9192 // Insert scalar instance packing it into a vector. 9193 if (AlsoPack && State.VF.isVector()) { 9194 // If we're constructing lane 0, initialize to start from poison. 
9195 if (State.Instance->Lane.isFirstLane()) { 9196 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9197 Value *Poison = PoisonValue::get( 9198 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9199 State.set(this, Poison, State.Instance->Part); 9200 } 9201 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9202 } 9203 return; 9204 } 9205 9206 // Generate scalar instances for all VF lanes of all UF parts, unless the 9207 // instruction is uniform, in which case generate only the first lane for each 9208 // of the UF parts. 9209 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); 9210 assert((!State.VF.isScalable() || IsUniform) && 9211 "Can't scalarize a scalable vector"); 9212 for (unsigned Part = 0; Part < State.UF; ++Part) 9213 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9214 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9215 VPIteration(Part, Lane), IsPredicated, 9216 State); 9217 } 9218 9219 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9220 assert(State.Instance && "Branch on Mask works only on single instance."); 9221 9222 unsigned Part = State.Instance->Part; 9223 unsigned Lane = State.Instance->Lane.getKnownLane(); 9224 9225 Value *ConditionBit = nullptr; 9226 VPValue *BlockInMask = getMask(); 9227 if (BlockInMask) { 9228 ConditionBit = State.get(BlockInMask, Part); 9229 if (ConditionBit->getType()->isVectorTy()) 9230 ConditionBit = State.Builder.CreateExtractElement( 9231 ConditionBit, State.Builder.getInt32(Lane)); 9232 } else // Block in mask is all-one. 9233 ConditionBit = State.Builder.getTrue(); 9234 9235 // Replace the temporary unreachable terminator with a new conditional branch, 9236 // whose two destinations will be set later when they are created. 9237 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9238 assert(isa<UnreachableInst>(CurrentTerminator) && 9239 "Expected to replace unreachable terminator with conditional branch."); 9240 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9241 CondBr->setSuccessor(0, nullptr); 9242 ReplaceInstWithInst(CurrentTerminator, CondBr); 9243 } 9244 9245 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9246 assert(State.Instance && "Predicated instruction PHI works per instance."); 9247 Instruction *ScalarPredInst = 9248 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9249 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9250 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9251 assert(PredicatingBB && "Predicated block has no single predecessor."); 9252 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9253 "operand must be VPReplicateRecipe"); 9254 9255 // By current pack/unpack logic we need to generate only a single phi node: if 9256 // a vector value for the predicated instruction exists at this point it means 9257 // the instruction has vector users only, and a phi for the vector value is 9258 // needed. In this case the recipe of the predicated instruction is marked to 9259 // also do that packing, thereby "hoisting" the insert-element sequence. 9260 // Otherwise, a phi node for the scalar value is needed.
9261 unsigned Part = State.Instance->Part; 9262 if (State.hasVectorValue(getOperand(0), Part)) { 9263 Value *VectorValue = State.get(getOperand(0), Part); 9264 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9265 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9266 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9267 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9268 if (State.hasVectorValue(this, Part)) 9269 State.reset(this, VPhi, Part); 9270 else 9271 State.set(this, VPhi, Part); 9272 // NOTE: Currently we need to update the value of the operand, so the next 9273 // predicated iteration inserts its generated value in the correct vector. 9274 State.reset(getOperand(0), VPhi, Part); 9275 } else { 9276 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9277 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9278 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9279 PredicatingBB); 9280 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9281 if (State.hasScalarValue(this, *State.Instance)) 9282 State.reset(this, Phi, *State.Instance); 9283 else 9284 State.set(this, Phi, *State.Instance); 9285 // NOTE: Currently we need to update the value of the operand, so the next 9286 // predicated iteration inserts its generated value in the correct vector. 9287 State.reset(getOperand(0), Phi, *State.Instance); 9288 } 9289 } 9290 9291 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9292 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9293 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9294 StoredValue ? nullptr : getVPValue(), 9295 getAddr(), StoredValue, getMask()); 9296 } 9297 9298 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9299 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9300 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9301 // for predication. 9302 static ScalarEpilogueLowering getScalarEpilogueLowering( 9303 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9304 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9305 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9306 LoopVectorizationLegality &LVL) { 9307 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9308 // don't look at hints or options, and don't request a scalar epilogue. 9309 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9310 // LoopAccessInfo (due to code dependency and not being able to reliably get 9311 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9312 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9313 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9314 // back to the old way and vectorize with versioning when forced. See D81345.) 
9315 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9316 PGSOQueryType::IRPass) && 9317 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9318 return CM_ScalarEpilogueNotAllowedOptSize; 9319 9320 // 2) If set, obey the directives 9321 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9322 switch (PreferPredicateOverEpilogue) { 9323 case PreferPredicateTy::ScalarEpilogue: 9324 return CM_ScalarEpilogueAllowed; 9325 case PreferPredicateTy::PredicateElseScalarEpilogue: 9326 return CM_ScalarEpilogueNotNeededUsePredicate; 9327 case PreferPredicateTy::PredicateOrDontVectorize: 9328 return CM_ScalarEpilogueNotAllowedUsePredicate; 9329 }; 9330 } 9331 9332 // 3) If set, obey the hints 9333 switch (Hints.getPredicate()) { 9334 case LoopVectorizeHints::FK_Enabled: 9335 return CM_ScalarEpilogueNotNeededUsePredicate; 9336 case LoopVectorizeHints::FK_Disabled: 9337 return CM_ScalarEpilogueAllowed; 9338 }; 9339 9340 // 4) if the TTI hook indicates this is profitable, request predication. 9341 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9342 LVL.getLAI())) 9343 return CM_ScalarEpilogueNotNeededUsePredicate; 9344 9345 return CM_ScalarEpilogueAllowed; 9346 } 9347 9348 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9349 // If Values have been set for this Def return the one relevant for \p Part. 9350 if (hasVectorValue(Def, Part)) 9351 return Data.PerPartOutput[Def][Part]; 9352 9353 if (!hasScalarValue(Def, {Part, 0})) { 9354 Value *IRV = Def->getLiveInIRValue(); 9355 Value *B = ILV->getBroadcastInstrs(IRV); 9356 set(Def, B, Part); 9357 return B; 9358 } 9359 9360 Value *ScalarValue = get(Def, {Part, 0}); 9361 // If we aren't vectorizing, we can just copy the scalar map values over 9362 // to the vector map. 9363 if (VF.isScalar()) { 9364 set(Def, ScalarValue, Part); 9365 return ScalarValue; 9366 } 9367 9368 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9369 bool IsUniform = RepR && RepR->isUniform(); 9370 9371 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9372 // Check if there is a scalar value for the selected lane. 9373 if (!hasScalarValue(Def, {Part, LastLane})) { 9374 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9375 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9376 "unexpected recipe found to be invariant"); 9377 IsUniform = true; 9378 LastLane = 0; 9379 } 9380 9381 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9382 9383 // Set the insert point after the last scalarized instruction. This 9384 // ensures the insertelement sequence will directly follow the scalar 9385 // definitions. 9386 auto OldIP = Builder.saveIP(); 9387 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9388 Builder.SetInsertPoint(&*NewIP); 9389 9390 // However, if we are vectorizing, we need to construct the vector values. 9391 // If the value is known to be uniform after vectorization, we can just 9392 // broadcast the scalar value corresponding to lane zero for each unroll 9393 // iteration. Otherwise, we construct the vector values using 9394 // insertelement instructions. Since the resulting vectors are stored in 9395 // State, we will only generate the insertelements once. 9396 Value *VectorValue = nullptr; 9397 if (IsUniform) { 9398 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9399 set(Def, VectorValue, Part); 9400 } else { 9401 // Initialize packing with insertelements to start from undef. 
9402 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9403 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9404 set(Def, Undef, Part); 9405 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9406 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9407 VectorValue = get(Def, Part); 9408 } 9409 Builder.restoreIP(OldIP); 9410 return VectorValue; 9411 } 9412 9413 // Process the loop in the VPlan-native vectorization path. This path builds 9414 // VPlan upfront in the vectorization pipeline, which allows to apply 9415 // VPlan-to-VPlan transformations from the very beginning without modifying the 9416 // input LLVM IR. 9417 static bool processLoopInVPlanNativePath( 9418 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9419 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9420 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9421 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9422 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9423 LoopVectorizationRequirements &Requirements) { 9424 9425 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9426 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9427 return false; 9428 } 9429 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9430 Function *F = L->getHeader()->getParent(); 9431 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9432 9433 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9434 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9435 9436 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9437 &Hints, IAI); 9438 // Use the planner for outer loop vectorization. 9439 // TODO: CM is not used at this point inside the planner. Turn CM into an 9440 // optional argument if we don't need it in the future. 9441 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 9442 Requirements, ORE); 9443 9444 // Get user vectorization factor. 9445 ElementCount UserVF = Hints.getWidth(); 9446 9447 // Plan how to best vectorize, return the best VF and its cost. 9448 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9449 9450 // If we are stress testing VPlan builds, do not attempt to generate vector 9451 // code. Masked vector code generation support will follow soon. 9452 // Also, do not attempt to vectorize if no vector code will be produced. 9453 if (VPlanBuildStressTest || EnableVPlanPredication || 9454 VectorizationFactor::Disabled() == VF) 9455 return false; 9456 9457 LVP.setBestPlan(VF.Width, 1); 9458 9459 { 9460 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9461 F->getParent()->getDataLayout()); 9462 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9463 &CM, BFI, PSI, Checks); 9464 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9465 << L->getHeader()->getParent()->getName() << "\"\n"); 9466 LVP.executePlan(LB, DT); 9467 } 9468 9469 // Mark the loop as already vectorized to avoid vectorizing again. 9470 Hints.setAlreadyVectorized(); 9471 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9472 return true; 9473 } 9474 9475 // Emit a remark if there are stores to floats that required a floating point 9476 // extension. If the vectorized loop was generated with floating point there 9477 // will be a performance penalty from the conversion overhead and the change in 9478 // the vector width. 
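// (Illustrative source-level example: with float arrays A and B, a statement
// like "A[i] = B[i] + 1.0;" promotes B[i] to double and truncates the result
// back to float, so the fpext feeding the store triggers this remark.)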
9479 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9480 SmallVector<Instruction *, 4> Worklist; 9481 for (BasicBlock *BB : L->getBlocks()) { 9482 for (Instruction &Inst : *BB) { 9483 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9484 if (S->getValueOperand()->getType()->isFloatTy()) 9485 Worklist.push_back(S); 9486 } 9487 } 9488 } 9489 9490 // Traverse the floating point stores upwards, searching for floating point 9491 // conversions. 9492 SmallPtrSet<const Instruction *, 4> Visited; 9493 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9494 while (!Worklist.empty()) { 9495 auto *I = Worklist.pop_back_val(); 9496 if (!L->contains(I)) 9497 continue; 9498 if (!Visited.insert(I).second) 9499 continue; 9500 9501 // Emit a remark if the floating point store required a floating 9502 // point conversion. 9503 // TODO: More work could be done to identify the root cause such as a 9504 // constant or a function return type and point the user to it. 9505 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9506 ORE->emit([&]() { 9507 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9508 I->getDebugLoc(), L->getHeader()) 9509 << "floating point conversion changes vector width. " 9510 << "Mixed floating point precision requires an up/down " 9511 << "cast that will negatively impact performance."; 9512 }); 9513 9514 for (Use &Op : I->operands()) 9515 if (auto *OpI = dyn_cast<Instruction>(Op)) 9516 Worklist.push_back(OpI); 9517 } 9518 } 9519 9520 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9521 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9522 !EnableLoopInterleaving), 9523 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9524 !EnableLoopVectorization) {} 9525 9526 bool LoopVectorizePass::processLoop(Loop *L) { 9527 assert((EnableVPlanNativePath || L->isInnermost()) && 9528 "VPlan-native path is not enabled. Only process inner loops."); 9529 9530 #ifndef NDEBUG 9531 const std::string DebugLocStr = getDebugLocString(L); 9532 #endif /* NDEBUG */ 9533 9534 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9535 << L->getHeader()->getParent()->getName() << "\" from " 9536 << DebugLocStr << "\n"); 9537 9538 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9539 9540 LLVM_DEBUG( 9541 dbgs() << "LV: Loop hints:" 9542 << " force=" 9543 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9544 ? "disabled" 9545 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9546 ? "enabled" 9547 : "?")) 9548 << " width=" << Hints.getWidth() 9549 << " unroll=" << Hints.getInterleave() << "\n"); 9550 9551 // Function containing loop 9552 Function *F = L->getHeader()->getParent(); 9553 9554 // Looking at the diagnostic output is the only way to determine if a loop 9555 // was vectorized (other than looking at the IR or machine code), so it 9556 // is important to generate an optimization remark for each loop. Most of 9557 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9558 // generated as OptimizationRemark and OptimizationRemarkMissed are 9559 // less verbose reporting vectorized loops and unvectorized loops that may 9560 // benefit from vectorization, respectively. 9561 9562 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9563 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9564 return false; 9565 } 9566 9567 PredicatedScalarEvolution PSE(*SE, *L); 9568 9569 // Check if it is legal to vectorize the loop.

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. The less
  // verbose OptimizationRemark and OptimizationRemarkMissed remarks report
  // vectorized loops and unvectorized loops that may benefit from
  // vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }
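
  // (For instance, with an expected trip count of 3 and a VF of 4, every
  // iteration would execute in the scalar remainder, so vectorization only
  // pays off if the vector body itself can absorb those iterations, e.g. by
  // folding the tail instead of emitting a scalar epilogue.)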

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem like it can be correct -- what if the loop
  // is an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!Requirements.canVectorizeFPMath(Hints)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }
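
  // For example (illustrative), accesses such as
  //
  //   for (i = 0; i < N; ++i) {
  //     Sum += A[2 * i];      // even elements
  //     Sum += A[2 * i + 1];  // odd elements
  //   }
  //
  // are recognized by the analysis above as an interleave group of factor 2,
  // which can be served by one wide load plus shuffles when the target
  // profits from that.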

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get the user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;
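
  // At this point the four (VectorizeLoop, InterleaveLoop) combinations are
  // handled roughly as follows:
  //   (false, false): emit missed-optimization remarks and bail out;
  //   (false, true ): only interleave the scalar loop;
  //   (true,  false): vectorize without interleaving (IC is 1 here);
  //   (true,  true ): vectorize and interleave.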

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that vectorizing the loop is not profitable, interleave
      // it instead.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // Vectorization is both legal and profitable at this point, so do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // The second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks for strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }
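
  // If the original loop carried follow-up metadata (LLVMLoopVectorizeFollowupAll
  // or LLVMLoopVectorizeFollowupEpilogue), transfer it to the remaining scalar
  // loop below; otherwise annotate the remainder with the default hints.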

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}
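
// The new pass manager entry point below collects the function-level analyses
// and forwards them to runImpl(). It is what runs when the pass is requested
// by name, e.g. (illustrative):
//
//   opt -passes='function(loop-vectorize)' input.ll -S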

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the LoopInfo/DominatorTree analyses with
  // outer loop vectorization. Until this is addressed, mark these analyses
  // as preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}