//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include 
"llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/MemorySSA.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include 
"llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 
cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
If tail-folding fails, there are 210 // different fallback strategies depending on these values: 211 namespace PreferPredicateTy { 212 enum Option { 213 ScalarEpilogue = 0, 214 PredicateElseScalarEpilogue, 215 PredicateOrDontVectorize 216 }; 217 } // namespace PreferPredicateTy 218 219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 220 "prefer-predicate-over-epilogue", 221 cl::init(PreferPredicateTy::ScalarEpilogue), 222 cl::Hidden, 223 cl::desc("Tail-folding and predication preferences over creating a scalar " 224 "epilogue loop."), 225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 226 "scalar-epilogue", 227 "Don't tail-predicate loops, create scalar epilogue"), 228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 229 "predicate-else-scalar-epilogue", 230 "prefer tail-folding, create scalar epilogue if tail " 231 "folding fails."), 232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 233 "predicate-dont-vectorize", 234 "prefers tail-folding, don't attempt vectorization if " 235 "tail-folding fails."))); 236 237 static cl::opt<bool> MaximizeBandwidth( 238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 239 cl::desc("Maximize bandwidth when selecting vectorization factor which " 240 "will be determined by the smallest type in loop.")); 241 242 static cl::opt<bool> EnableInterleavedMemAccesses( 243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 244 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 245 246 /// An interleave-group may need masking if it resides in a block that needs 247 /// predication, or in order to mask away gaps. 
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 251 252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 254 cl::desc("We don't interleave loops with a estimated constant trip count " 255 "below this number")); 256 257 static cl::opt<unsigned> ForceTargetNumScalarRegs( 258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 259 cl::desc("A flag that overrides the target's number of scalar registers.")); 260 261 static cl::opt<unsigned> ForceTargetNumVectorRegs( 262 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 263 cl::desc("A flag that overrides the target's number of vector registers.")); 264 265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 267 cl::desc("A flag that overrides the target's max interleave factor for " 268 "scalar loops.")); 269 270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 272 cl::desc("A flag that overrides the target's max interleave factor for " 273 "vectorized loops.")); 274 275 static cl::opt<unsigned> ForceTargetInstructionCost( 276 "force-target-instruction-cost", cl::init(0), cl::Hidden, 277 cl::desc("A flag that overrides the target's expected cost for " 278 "an instruction to a single constant value. Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. 
This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

// NOTE: deliberately has external linkage (no 'static'), unlike most flags in
// this file.
cl::opt<bool> EnableStrictReductions(
    "enable-strict-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

// External linkage: this is the main switch for the VPlan-native path.
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  // A load's result type is the value type; for a store, look through to the
  // operand being stored.
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
390 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 391 // Determine if an array of N elements of type Ty is "bitcast compatible" 392 // with a <N x Ty> vector. 393 // This is only true if there is no padding between the array elements. 394 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 395 } 396 397 /// A helper function that returns the reciprocal of the block probability of 398 /// predicated blocks. If we return X, we are assuming the predicated block 399 /// will execute once for every X iterations of the loop header. 400 /// 401 /// TODO: We should use actual block probability here, if available. Currently, 402 /// we always assume predicated blocks have a 50% chance of executing. 403 static unsigned getReciprocalPredBlockProb() { return 2; } 404 405 /// A helper function that returns an integer or floating-point constant with 406 /// value C. 407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 408 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 409 : ConstantFP::get(Ty, C); 410 } 411 412 /// Returns "best known" trip count for the specified loop \p L as defined by 413 /// the following procedure: 414 /// 1) Returns exact trip count if it is known. 415 /// 2) Returns expected trip count according to profile data if any. 416 /// 3) Returns upper bound estimate if it is known. 417 /// 4) Returns None if all of the above failed. 418 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 419 // Check if exact trip count is known. 420 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 421 return ExpectedTC; 422 423 // Check if there is an expected trip count available from profile data. 424 if (LoopVectorizeWithBlockFrequency) 425 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 426 return EstimatedTC; 427 428 // Check if upper bound estimate is known. 
429 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 430 return ExpectedTC; 431 432 return None; 433 } 434 435 // Forward declare GeneratedRTChecks. 436 class GeneratedRTChecks; 437 438 namespace llvm { 439 440 /// InnerLoopVectorizer vectorizes loops which contain only one basic 441 /// block to a specified vectorization factor (VF). 442 /// This class performs the widening of scalars into vectors, or multiple 443 /// scalars. This class also implements the following features: 444 /// * It inserts an epilogue loop for handling loops that don't have iteration 445 /// counts that are known to be a multiple of the vectorization factor. 446 /// * It handles the code generation for reduction variables. 447 /// * Scalarization (implementation using scalars) of un-vectorizable 448 /// instructions. 449 /// InnerLoopVectorizer does not perform any vectorization-legality 450 /// checks, and relies on the caller to check for the different legality 451 /// aspects. The InnerLoopVectorizer relies on the 452 /// LoopVectorizationLegality class to provide information about the induction 453 /// and reduction variables that were found to a given vectorization factor. 
454 class InnerLoopVectorizer { 455 public: 456 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 457 LoopInfo *LI, DominatorTree *DT, 458 const TargetLibraryInfo *TLI, 459 const TargetTransformInfo *TTI, AssumptionCache *AC, 460 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 461 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 462 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 463 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 464 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 465 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 466 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 467 PSI(PSI), RTChecks(RTChecks) { 468 // Query this against the original loop and save it here because the profile 469 // of the original loop header may change as the transformation happens. 470 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 471 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 472 } 473 474 virtual ~InnerLoopVectorizer() = default; 475 476 /// Create a new empty loop that will contain vectorized instructions later 477 /// on, while the old loop will be used as the scalar remainder. Control flow 478 /// is generated around the vectorized (and scalar epilogue) loops consisting 479 /// of various checks and bypasses. Return the pre-header block of the new 480 /// loop. 481 /// In the case of epilogue vectorization, this function is overriden to 482 /// handle the more complex control flow around the loops. 483 virtual BasicBlock *createVectorizedLoopSkeleton(); 484 485 /// Widen a single instruction within the innermost loop. 486 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, 487 VPTransformState &State); 488 489 /// Widen a single call instruction within the innermost loop. 
490 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, 491 VPTransformState &State); 492 493 /// Widen a single select instruction within the innermost loop. 494 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 495 bool InvariantCond, VPTransformState &State); 496 497 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 498 void fixVectorizedLoop(VPTransformState &State); 499 500 // Return true if any runtime check is added. 501 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 502 503 /// A type for vectorized values in the new loop. Each value from the 504 /// original loop, when vectorized, is represented by UF vector values in the 505 /// new unrolled loop, where UF is the unroll factor. 506 using VectorParts = SmallVector<Value *, 2>; 507 508 /// Vectorize a single GetElementPtrInst based on information gathered and 509 /// decisions taken during planning. 510 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 511 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 512 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 513 514 /// Vectorize a single PHINode in a block. This method handles the induction 515 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 516 /// arbitrary length vectors. 517 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, 518 VPWidenPHIRecipe *PhiR, VPTransformState &State); 519 520 /// A helper function to scalarize a single Instruction in the innermost loop. 521 /// Generates a sequence of scalar instances for each lane between \p MinLane 522 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 523 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 524 /// Instr's operands. 
525 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, 526 const VPIteration &Instance, bool IfPredicateInstr, 527 VPTransformState &State); 528 529 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 530 /// is provided, the integer induction variable will first be truncated to 531 /// the corresponding type. 532 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, 533 VPValue *Def, VPValue *CastDef, 534 VPTransformState &State); 535 536 /// Construct the vector value of a scalarized value \p V one lane at a time. 537 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 538 VPTransformState &State); 539 540 /// Try to vectorize interleaved access group \p Group with the base address 541 /// given in \p Addr, optionally masking the vector operations if \p 542 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 543 /// values in the vectorized loop. 544 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 545 ArrayRef<VPValue *> VPDefs, 546 VPTransformState &State, VPValue *Addr, 547 ArrayRef<VPValue *> StoredValues, 548 VPValue *BlockInMask = nullptr); 549 550 /// Vectorize Load and Store instructions with the base address given in \p 551 /// Addr, optionally masking the vector operations if \p BlockInMask is 552 /// non-null. Use \p State to translate given VPValues to IR values in the 553 /// vectorized loop. 554 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 555 VPValue *Def, VPValue *Addr, 556 VPValue *StoredValue, VPValue *BlockInMask); 557 558 /// Set the debug location in the builder using the debug location in 559 /// the instruction. 560 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 561 562 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 563 void fixNonInductionPHIs(VPTransformState &State); 564 565 /// Create a broadcast instruction. 
This method generates a broadcast 566 /// instruction (shuffle) for loop invariant values and for the induction 567 /// value. If this is the induction variable then we extend it to N, N+1, ... 568 /// this is needed because each iteration in the loop corresponds to a SIMD 569 /// element. 570 virtual Value *getBroadcastInstrs(Value *V); 571 572 protected: 573 friend class LoopVectorizationPlanner; 574 575 /// A small list of PHINodes. 576 using PhiVector = SmallVector<PHINode *, 4>; 577 578 /// A type for scalarized values in the new loop. Each value from the 579 /// original loop, when scalarized, is represented by UF x VF scalar values 580 /// in the new unrolled loop, where UF is the unroll factor and VF is the 581 /// vectorization factor. 582 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 583 584 /// Set up the values of the IVs correctly when exiting the vector loop. 585 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 586 Value *CountRoundDown, Value *EndValue, 587 BasicBlock *MiddleBlock); 588 589 /// Create a new induction variable inside L. 590 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 591 Value *Step, Instruction *DL); 592 593 /// Handle all cross-iteration phis in the header. 594 void fixCrossIterationPHIs(VPTransformState &State); 595 596 /// Fix a first-order recurrence. This is the second phase of vectorizing 597 /// this phi node. 598 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State); 599 600 /// Fix a reduction cross-iteration phi. This is the second phase of 601 /// vectorizing this phi node. 602 void fixReduction(PHINode *Phi, VPTransformState &State); 603 604 /// Clear NSW/NUW flags from reduction instructions if necessary. 605 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 606 VPTransformState &State); 607 608 /// Fixup the LCSSA phi nodes in the unique exit block. 
This simply 609 /// means we need to add the appropriate incoming value from the middle 610 /// block as exiting edges from the scalar epilogue loop (if present) are 611 /// already in place, and we exit the vector loop exclusively to the middle 612 /// block. 613 void fixLCSSAPHIs(VPTransformState &State); 614 615 /// Iteratively sink the scalarized operands of a predicated instruction into 616 /// the block that was created for it. 617 void sinkScalarOperands(Instruction *PredInst); 618 619 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 620 /// represented as. 621 void truncateToMinimalBitwidths(VPTransformState &State); 622 623 /// This function adds 624 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 625 /// to each vector element of Val. The sequence starts at StartIndex. 626 /// \p Opcode is relevant for FP induction variable. 627 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 628 Instruction::BinaryOps Opcode = 629 Instruction::BinaryOpsEnd); 630 631 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 632 /// variable on which to base the steps, \p Step is the size of the step, and 633 /// \p EntryVal is the value from the original loop that maps to the steps. 634 /// Note that \p EntryVal doesn't have to be an induction variable - it 635 /// can also be a truncate instruction. 636 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 637 const InductionDescriptor &ID, VPValue *Def, 638 VPValue *CastDef, VPTransformState &State); 639 640 /// Create a vector induction phi node based on an existing scalar one. \p 641 /// EntryVal is the value from the original loop that maps to the vector phi 642 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 643 /// truncate instruction, instead of widening the original IV, we widen a 644 /// version of the IV truncated to \p EntryVal's type. 
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In
  /// the latter case \p EntryVal is a TruncInst and we must not record anything
  /// for that IV, but it's error-prone to expect callers of this routine to
  /// care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

/// A variant of InnerLoopVectorizer with the vectorization factor pinned to
/// one, i.e. it only unrolls (interleaves) the scalar loop.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  /// Vectorization and unroll factors chosen for the main vector loop.
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  /// Vectorization and unroll factors chosen for the epilogue vector loop.
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  /// Skeleton blocks created during the first (main-loop) pass and reused when
  /// completing the skeleton in the second (epilogue) pass.
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  /// Trip counts computed for the original and the widened loop; shared
  /// between the two passes.
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    // Per the assert below, the epilogue loop is always taken with a single
    // unrolled copy; a higher UF is considered unlikely to be beneficial.
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops. Dispatches to the strategy-specific implementation below.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}

  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (ie the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}

  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  // \p I has no location of its own; fall back to the first operand that
  // carries one, otherwise return \p I unchanged.
  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Set the builder's current debug location from \p Ptr if it is an
/// instruction with one attached; clear it otherwise. When profiling debug
/// info is in use, the location's discriminator is scaled by the duplication
/// factor UF * VF so sample counts remain accurate for the widened loop.
void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        // Keep the original location if a new discriminator could not be
        // encoded.
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    }
    else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  // For scalable VFs the known minimum is further scaled by vscale at runtime.
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

/// Report a vectorization failure: print \p DebugMsg to the debug stream and
/// emit an optimization-remark analysis with \p OREMsg under \p ORETag,
/// located at \p I if given, otherwise at \p TheLoop.
void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

/// Report an informational vectorization message \p Msg both to the debug
/// stream and as an optimization-remark analysis under \p ORETag.
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  // Non-instruction values (e.g. constants) cannot carry metadata; skip them.
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);

  /// \return The vectorization factor to use for the epilogue loop, given the
  /// main loop's \p MaxVF and the planner \p LVP.
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) const {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) const {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      RecurrenceDescriptor RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool
  isScalarWithPredication(Instruction *I,
                          ElementCount VF = ElementCount::getFixed(1)) const;

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I, ElementCount VF) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I, VF);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue() const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if \p BB must be predicated, either because the tail is
  /// folded by masking or because legality analysis requires it.
  bool blockNeedsPredication(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF.
Return the cost of the instruction, including 1604 /// scalarization overhead if it's needed. 1605 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1606 1607 /// Estimate cost of a call instruction CI if it were vectorized with factor 1608 /// VF. Return the cost of the instruction, including scalarization overhead 1609 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1610 /// scalarized - 1611 /// i.e. either vector version isn't available, or is too expensive. 1612 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1613 bool &NeedToScalarize) const; 1614 1615 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1616 /// that of B. 1617 bool isMoreProfitable(const VectorizationFactor &A, 1618 const VectorizationFactor &B) const; 1619 1620 /// Invalidates decisions already taken by the cost model. 1621 void invalidateCostModelingDecisions() { 1622 WideningDecisions.clear(); 1623 Uniforms.clear(); 1624 Scalars.clear(); 1625 } 1626 1627 private: 1628 unsigned NumPredStores = 0; 1629 1630 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1631 /// than zero. One is returned if vectorization should best be avoided due 1632 /// to cost. 1633 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1634 ElementCount UserVF); 1635 1636 /// \return the maximized element count based on the targets vector 1637 /// registers and the loop trip-count, but limited to a maximum safe VF. 1638 /// This is a helper function of computeFeasibleMaxVF. 1639 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1640 unsigned SmallestType, 1641 unsigned WidestType, 1642 ElementCount MaxSafeVF); 1643 1644 /// \return the maximum legal scalable VF, based on the safe max number 1645 /// of elements. 
1646 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1647 1648 /// The vectorization cost is a combination of the cost itself and a boolean 1649 /// indicating whether any of the contributing operations will actually 1650 /// operate on 1651 /// vector values after type legalization in the backend. If this latter value 1652 /// is 1653 /// false, then all operations will be scalarized (i.e. no vectorization has 1654 /// actually taken place). 1655 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1656 1657 /// Returns the expected execution cost. The unit of the cost does 1658 /// not matter because we use the 'cost' units to compare different 1659 /// vector widths. The cost that is returned is *not* normalized by 1660 /// the factor width. 1661 VectorizationCostTy expectedCost(ElementCount VF); 1662 1663 /// Returns the execution time cost of an instruction for a given vector 1664 /// width. Vector width of one means scalar. 1665 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1666 1667 /// The cost-computation logic from getInstructionCost which provides 1668 /// the vector type as an output parameter. 1669 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1670 Type *&VectorTy); 1671 1672 /// Return the cost of instructions in an inloop reduction pattern, if I is 1673 /// part of that pattern. 1674 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, 1675 Type *VectorTy, 1676 TTI::TargetCostKind CostKind); 1677 1678 /// Calculate vectorization cost of memory instruction \p I. 1679 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1680 1681 /// The cost computation for scalarized memory instruction. 1682 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1683 1684 /// The cost computation for interleaving group of memory instructions. 
1685 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1686 1687 /// The cost computation for Gather/Scatter instruction. 1688 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1689 1690 /// The cost computation for widening instruction \p I with consecutive 1691 /// memory access. 1692 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1693 1694 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1695 /// Load: scalar load + broadcast. 1696 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1697 /// element) 1698 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1699 1700 /// Estimate the overhead of scalarizing an instruction. This is a 1701 /// convenience wrapper for the type-based getScalarizationOverhead API. 1702 InstructionCost getScalarizationOverhead(Instruction *I, 1703 ElementCount VF) const; 1704 1705 /// Returns whether the instruction is a load or store and will be a emitted 1706 /// as a vector operation. 1707 bool isConsecutiveLoadOrStore(Instruction *I); 1708 1709 /// Returns true if an artificially high cost for emulated masked memrefs 1710 /// should be used. 1711 bool useEmulatedMaskMemRefHack(Instruction *I); 1712 1713 /// Map of scalar integer values to the smallest bitwidth they can be legally 1714 /// represented as. The vector equivalents of these values should be truncated 1715 /// to this type. 1716 MapVector<Instruction *, uint64_t> MinBWs; 1717 1718 /// A type representing the costs for instructions if they were to be 1719 /// scalarized rather than vectorized. The entries are Instruction-Cost 1720 /// pairs. 1721 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1722 1723 /// A set containing all BasicBlocks that are known to present after 1724 /// vectorization as a predicated block. 
1725 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1726 1727 /// Records whether it is allowed to have the original scalar loop execute at 1728 /// least once. This may be needed as a fallback loop in case runtime 1729 /// aliasing/dependence checks fail, or to handle the tail/remainder 1730 /// iterations when the trip count is unknown or doesn't divide by the VF, 1731 /// or as a peel-loop to handle gaps in interleave-groups. 1732 /// Under optsize and when the trip count is very small we don't allow any 1733 /// iterations to execute in the scalar loop. 1734 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1735 1736 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1737 bool FoldTailByMasking = false; 1738 1739 /// A map holding scalar costs for different vectorization factors. The 1740 /// presence of a cost for an instruction in the mapping indicates that the 1741 /// instruction will be scalarized when vectorizing with the associated 1742 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1743 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1744 1745 /// Holds the instructions known to be uniform after vectorization. 1746 /// The data is collected per VF. 1747 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1748 1749 /// Holds the instructions known to be scalar after vectorization. 1750 /// The data is collected per VF. 1751 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1752 1753 /// Holds the instructions (address computations) that are forced to be 1754 /// scalarized. 
1755 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1756 1757 /// PHINodes of the reductions that should be expanded in-loop along with 1758 /// their associated chains of reduction operations, in program order from top 1759 /// (PHI) to bottom 1760 ReductionChainMap InLoopReductionChains; 1761 1762 /// A Map of inloop reduction operations and their immediate chain operand. 1763 /// FIXME: This can be removed once reductions can be costed correctly in 1764 /// vplan. This was added to allow quick lookup to the inloop operations, 1765 /// without having to loop through InLoopReductionChains. 1766 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1767 1768 /// Returns the expected difference in cost from scalarizing the expression 1769 /// feeding a predicated instruction \p PredInst. The instructions to 1770 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1771 /// non-negative return value implies the expression will be scalarized. 1772 /// Currently, only single-use chains are considered for scalarization. 1773 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1774 ElementCount VF); 1775 1776 /// Collect the instructions that are uniform after vectorization. An 1777 /// instruction is uniform if we represent it with a single scalar value in 1778 /// the vectorized loop corresponding to each vector iteration. Examples of 1779 /// uniform instructions include pointer operands of consecutive or 1780 /// interleaved memory accesses. Note that although uniformity implies an 1781 /// instruction will be scalar, the reverse is not true. In general, a 1782 /// scalarized instruction will be represented by VF scalar values in the 1783 /// vectorized loop, each corresponding to an iteration of the original 1784 /// scalar loop. 1785 void collectLoopUniforms(ElementCount VF); 1786 1787 /// Collect the instructions that are scalar after vectorization. 
An 1788 /// instruction is scalar if it is known to be uniform or will be scalarized 1789 /// during vectorization. Non-uniform scalarized instructions will be 1790 /// represented by VF values in the vectorized loop, each corresponding to an 1791 /// iteration of the original scalar loop. 1792 void collectLoopScalars(ElementCount VF); 1793 1794 /// Keeps cost model vectorization decision and cost for instructions. 1795 /// Right now it is used for memory instructions only. 1796 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1797 std::pair<InstWidening, InstructionCost>>; 1798 1799 DecisionList WideningDecisions; 1800 1801 /// Returns true if \p V is expected to be vectorized and it needs to be 1802 /// extracted. 1803 bool needsExtract(Value *V, ElementCount VF) const { 1804 Instruction *I = dyn_cast<Instruction>(V); 1805 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1806 TheLoop->isLoopInvariant(I)) 1807 return false; 1808 1809 // Assume we can vectorize V (and hence we need extraction) if the 1810 // scalars are not computed yet. This can happen, because it is called 1811 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1812 // the scalars are collected. That should be a safe assumption in most 1813 // cases, because we check if the operands have vectorizable types 1814 // beforehand in LoopVectorizationLegality. 1815 return Scalars.find(VF) == Scalars.end() || 1816 !isScalarAfterVectorization(I, VF); 1817 }; 1818 1819 /// Returns a range containing only operands needing to be extracted. 1820 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1821 ElementCount VF) const { 1822 return SmallVector<Value *, 4>(make_filter_range( 1823 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1824 } 1825 1826 /// Determines if we have the infrastructure to vectorize loop \p L and its 1827 /// epilogue, assuming the main loop is vectorized by \p VF. 
1828 bool isCandidateForEpilogueVectorization(const Loop &L, 1829 const ElementCount VF) const; 1830 1831 /// Returns true if epilogue vectorization is considered profitable, and 1832 /// false otherwise. 1833 /// \p VF is the vectorization factor chosen for the original loop. 1834 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1835 1836 public: 1837 /// The loop that we evaluate. 1838 Loop *TheLoop; 1839 1840 /// Predicated scalar evolution analysis. 1841 PredicatedScalarEvolution &PSE; 1842 1843 /// Loop Info analysis. 1844 LoopInfo *LI; 1845 1846 /// Vectorization legality. 1847 LoopVectorizationLegality *Legal; 1848 1849 /// Vector target information. 1850 const TargetTransformInfo &TTI; 1851 1852 /// Target Library Info. 1853 const TargetLibraryInfo *TLI; 1854 1855 /// Demanded bits analysis. 1856 DemandedBits *DB; 1857 1858 /// Assumption cache. 1859 AssumptionCache *AC; 1860 1861 /// Interface to emit optimization remarks. 1862 OptimizationRemarkEmitter *ORE; 1863 1864 const Function *TheFunction; 1865 1866 /// Loop Vectorize Hint. 1867 const LoopVectorizeHints *Hints; 1868 1869 /// The interleave access information contains groups of interleaved accesses 1870 /// with the same stride and close to each other. 1871 InterleavedAccessInfo &InterleaveInfo; 1872 1873 /// Values to ignore in the cost model. 1874 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1875 1876 /// Values to ignore in the cost model when VF > 1. 1877 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1878 1879 /// Profitable vector factors. 1880 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1881 }; 1882 } // end namespace llvm 1883 1884 /// Helper struct to manage generating runtime checks for vectorization. 1885 /// 1886 /// The runtime checks are created up-front in temporary blocks to allow better 1887 /// estimating the cost and un-linked from the existing IR. After deciding to 1888 /// vectorize, the checks are moved back. 
/// If deciding not to vectorize, the temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Instruction *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  /// Expanders for the SCEV predicate checks and the memory runtime checks,
  /// respectively; kept separate so each set can be cleaned up independently
  /// in the destructor.
  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  // NOTE(review): MemCheckExp reuses the "scev.check" name prefix rather than
  // a distinct one such as "mem.check" — appears intentional but confirm.
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and is added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // If SCEV checks were generated, the mem-check block is split off after
      // them; otherwise directly after the preheader.
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      std::tie(std::ignore, MemRuntimeCheckCond) =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    // Give each unlinked check block a placeholder UnreachableInst terminator
    // (its real terminator is moved back to the preheader), keeping the
    // detached blocks structurally valid until they are re-linked or deleted.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
    // A null cond means the checks were emitted into the final IR (or never
    // created); in that case the expander results must NOT be deleted.
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      // Iterate in reverse so users are erased before their operands.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        SE.eraseValueFromMap(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }

  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader,
                             BasicBlock *LoopExitBlock) {
    if (!SCEVCheckCond)
      return nullptr;
    // A constant-false check can never fail; skip emitting it.
    if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    // Drop the placeholder terminator installed by Create() and re-link the
    // check block in front of the vector preheader.
    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    ReplaceInstWithInst(
        SCEVCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    return SCEVCheckBlock;
  }

  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);

    ReplaceInstWithInst(
        MemCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
    // Propagate a debug location onto the new conditional branch.
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
};

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}

static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  // Recurse into immediate subloops of L.
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

namespace {

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  /// The new-PM implementation this legacy pass delegates to.
  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    // Gather all analyses the vectorizer needs from the legacy pass manager.
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // Loop access info is computed lazily, per loop.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

// Returns a splat of \p V across all vector lanes, hoisted into the vector
// preheader when that is provably safe.
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}

void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Value *Start,
    Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
    VPTransformState &State) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    // Narrow the start value and step to the truncated IV's type.
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate. The runtime VF is materialized
  // as an integer and converted to FP if the step is floating-point.
  Type *StepType = Step->getType();
  if (Step->getType()->isFloatingPointTy())
    StepType = IntegerType::get(StepType->getContext(),
                                StepType->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  // handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    State.set(Def, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
                                          State, Part);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

// Returns true if the cost model decided \p I will remain scalar, either
// because it must be or because scalarizing it is profitable.
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

// Returns true if induction variable \p IV needs a scalar (per-lane) form:
// either IV itself will be scalarized, or some in-loop user of it will be.
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
    unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that casted Phi is equal to the
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
  // re-uses the same InductionDescriptor that original IV uses but we don't
  // have to do any recording in this case - that is done when original IV is
  // processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if exist) have no uses outside the
  // induction update chain itself.
  // Lane == UINT_MAX (the parameter's default) means "record per part only".
  if (Lane < UINT_MAX)
    State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
  else
    State.set(CastDef, VectorLoopVal, Part);
}

// Widen the int/FP induction \p IV (optionally through truncate \p Trunc).
// Depending on the cost model this emits a vector IV phi, per-lane scalar
// steps, a splat of the scalar IV, or a combination of these, recording the
// results under \p Def / \p CastDef in \p State.
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
                                                TruncInst *Trunc, VPValue *Def,
                                                VPValue *CastDef,
                                                VPTransformState &State) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars().find(IV);
  assert(II != Legal->getInductionVars().end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      // Expand the step expression into IR in the vector preheader.
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               LoopVectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  // NOTE: Step is taken by reference — a truncated IV also truncates Step for
  // the caller's subsequent uses.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      Value *EntryPart =
          getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
                        ID.getInductionOpcode());
      State.set(Def, EntryPart, Part);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
                                            State, Part);
    }
  };

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());
  if (VF.isZero() || VF.isScalar()) {
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
                                    State);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
                                    State);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
}

// Produce Val + (<StartIdx, StartIdx+1, ...> * Step), i.e. a vector whose
// lane i holds the induction value for lane i. \p BinOp selects FAdd/FSub
// for floating-point inductions.
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
2499 auto *ValVTy = cast<VectorType>(Val->getType()); 2500 ElementCount VLen = ValVTy->getElementCount(); 2501 2502 Type *STy = Val->getType()->getScalarType(); 2503 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2504 "Induction Step must be an integer or FP"); 2505 assert(Step->getType() == STy && "Step has wrong type"); 2506 2507 SmallVector<Constant *, 8> Indices; 2508 2509 // Create a vector of consecutive numbers from zero to VF. 2510 VectorType *InitVecValVTy = ValVTy; 2511 Type *InitVecValSTy = STy; 2512 if (STy->isFloatingPointTy()) { 2513 InitVecValSTy = 2514 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2515 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2516 } 2517 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2518 2519 // Add on StartIdx 2520 Value *StartIdxSplat = Builder.CreateVectorSplat( 2521 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2522 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2523 2524 if (STy->isIntegerTy()) { 2525 Step = Builder.CreateVectorSplat(VLen, Step); 2526 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2527 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2528 // which can be found from the original scalar operations. 2529 Step = Builder.CreateMul(InitVec, Step); 2530 return Builder.CreateAdd(Val, Step, "induction"); 2531 } 2532 2533 // Floating point induction. 
2534 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2535 "Binary Opcode should be specified for FP induction"); 2536 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2537 Step = Builder.CreateVectorSplat(VLen, Step); 2538 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2539 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2540 } 2541 2542 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2543 Instruction *EntryVal, 2544 const InductionDescriptor &ID, 2545 VPValue *Def, VPValue *CastDef, 2546 VPTransformState &State) { 2547 // We shouldn't have to build scalar steps if we aren't vectorizing. 2548 assert(VF.isVector() && "VF should be greater than one"); 2549 // Get the value type and ensure it and the step have the same integer type. 2550 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2551 assert(ScalarIVTy == Step->getType() && 2552 "Val and Step should have the same type"); 2553 2554 // We build scalar steps for both integer and floating-point induction 2555 // variables. Here, we determine the kind of arithmetic we will perform. 2556 Instruction::BinaryOps AddOp; 2557 Instruction::BinaryOps MulOp; 2558 if (ScalarIVTy->isIntegerTy()) { 2559 AddOp = Instruction::Add; 2560 MulOp = Instruction::Mul; 2561 } else { 2562 AddOp = ID.getInductionOpcode(); 2563 MulOp = Instruction::FMul; 2564 } 2565 2566 // Determine the number of scalars we need to generate for each unroll 2567 // iteration. If EntryVal is uniform, we only need to generate the first 2568 // lane. Otherwise, we generate all VF values. 2569 bool IsUniform = 2570 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2571 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2572 // Compute the scalar steps and save the results in State. 
2573 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2574 ScalarIVTy->getScalarSizeInBits()); 2575 Type *VecIVTy = nullptr; 2576 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2577 if (!IsUniform && VF.isScalable()) { 2578 VecIVTy = VectorType::get(ScalarIVTy, VF); 2579 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2580 SplatStep = Builder.CreateVectorSplat(VF, Step); 2581 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2582 } 2583 2584 for (unsigned Part = 0; Part < UF; ++Part) { 2585 Value *StartIdx0 = 2586 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2587 2588 if (!IsUniform && VF.isScalable()) { 2589 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2590 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2591 if (ScalarIVTy->isFloatingPointTy()) 2592 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2593 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2594 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2595 State.set(Def, Add, Part); 2596 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2597 Part); 2598 // It's useful to record the lane values too for the known minimum number 2599 // of elements so we do those below. This improves the code quality when 2600 // trying to extract the first element, for example. 2601 } 2602 2603 if (ScalarIVTy->isFloatingPointTy()) 2604 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2605 2606 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2607 Value *StartIdx = Builder.CreateBinOp( 2608 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2609 // The step returned by `createStepForVF` is a runtime-evaluated value 2610 // when VF is scalable. Otherwise, it should be folded into a Constant. 
2611 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2612 "Expected StartIdx to be folded to a constant when VF is not " 2613 "scalable"); 2614 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2615 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2616 State.set(Def, Add, VPIteration(Part, Lane)); 2617 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2618 Part, Lane); 2619 } 2620 } 2621 } 2622 2623 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2624 const VPIteration &Instance, 2625 VPTransformState &State) { 2626 Value *ScalarInst = State.get(Def, Instance); 2627 Value *VectorValue = State.get(Def, Instance.Part); 2628 VectorValue = Builder.CreateInsertElement( 2629 VectorValue, ScalarInst, 2630 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2631 State.set(Def, VectorValue, Instance.Part); 2632 } 2633 2634 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2635 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2636 return Builder.CreateVectorReverse(Vec, "reverse"); 2637 } 2638 2639 // Return whether we allow using masked interleave-groups (for dealing with 2640 // strided loads/stores that reside in predicated blocks, or for dealing 2641 // with gaps). 2642 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2643 // If an override option has been passed in for interleaved accesses, use it. 2644 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2645 return EnableMaskedInterleavedMemAccesses; 2646 2647 return TTI.enableMaskedInterleavedAccessVectorization(); 2648 } 2649 2650 // Try to vectorize the interleave group that \p Instr belongs to. 2651 // 2652 // E.g. Translate following interleaved load group (factor = 3): 2653 // for (i = 0; i < N; i+=3) { 2654 // R = Pic[i]; // Member of index 0 2655 // G = Pic[i+1]; // Member of index 1 2656 // B = Pic[i+2]; // Member of index 2 2657 // ... 
//     ... do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>            ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // The wide access covers VF tuples of InterleaveFactor elements each.
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    setDebugLocFromInst(Builder, AddrPart);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    // Preserve inbounds from the original GEP (looking through casts).
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  // MaskForGaps disables the lanes of members missing from the group when the
  // tail is folded (no scalar epilogue to handle them).
  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          // Replicate each block-mask bit across the group's factor, then
          // combine with the gaps mask (if any).
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart,
              createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
                                     GroupMask, PoisonVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");

      Value *StoredVec = State.get(StoredValues[i], Part);

      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Value *IVec = Builder.CreateShuffleVector(
        WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
        "interleaved.vec");

    Instruction *NewStoreInstr;
    if (BlockInMask) {
      Value *BlockInMaskPart = State.get(BlockInMask, Part);
      Value *ShuffledMask = Builder.CreateShuffleVector(
          BlockInMaskPart,
          createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
          "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    }
    else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}

// Widen the memory instruction \p Instr (a load or store) per the cost
// model's widening decision: a consecutive (possibly reversed) wide access,
// or a gather/scatter. Results are recorded in \p State under \p Def.
void InnerLoopVectorizer::vectorizeMemoryInstruction(
    Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
    VPValue *StoredValue, VPValue *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert((Decision == LoopVectorizationCostModel::CM_Widen ||
          Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
          Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
         "CM decision is not to widen the memory instruction");

  Type *ScalarDataTy = getMemInstValueType(Instr);

  auto *DataTy = VectorType::get(ScalarDataTy, VF);
  const Align Alignment = getLoadStoreAlignment(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");
  (void)ConsecutiveStride;

  VectorParts BlockInMaskParts(UF);
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    for (unsigned Part = 0; Part < UF; ++Part)
      BlockInMaskParts[Part] = State.get(BlockInMask, Part);

  // Compute the (bitcast) pointer for unroll part \p Part of a consecutive
  // access starting at \p Ptr. For reversed accesses this also reverses the
  // corresponding mask part in BlockInMaskParts (a side effect).
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF =  VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
      PartPtr =
          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
    } else {
      Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(Addr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(Addr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
            "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }

    State.set(Def, NewLI, Part);
  }
}

// Emit a scalar clone of \p Instr for the single (Part, Lane) iteration
// \p Instance, wiring its operands to the corresponding scalar values in
// \p State and recording the clone under \p Def.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
                                               VPUser &User,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Instr))
    if (!Instance.isFirstIteration())
      return;

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                               Builder.GetInsertPoint());
  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
    auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
    auto InputInstance = Instance;
    // Operands defined outside the loop or uniform-after-vectorization only
    // have a lane-0 value; read that lane regardless of Instance's lane.
    if (!Operand || !OrigLoop->contains(Operand) ||
        (Cost->isUniformAfterVectorization(Operand, State.VF)))
      InputInstance.Lane = VPLane::getFirstLane();
    auto *NewOp = State.get(User.getOperand(op), InputInstance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  State.set(Def, Cloned, Instance);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}

// Create the canonical scalar induction variable for loop \p L: a phi named
// "index" starting at \p Start, incremented by \p Step each iteration, with
// an exit compare against \p End replacing the latch's old terminator.
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
3121 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3122 L->getLoopPreheader()->getTerminator()); 3123 3124 if (TripCount->getType()->isPointerTy()) 3125 TripCount = 3126 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3127 L->getLoopPreheader()->getTerminator()); 3128 3129 return TripCount; 3130 } 3131 3132 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3133 if (VectorTripCount) 3134 return VectorTripCount; 3135 3136 Value *TC = getOrCreateTripCount(L); 3137 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3138 3139 Type *Ty = TC->getType(); 3140 // This is where we can make the step a runtime constant. 3141 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3142 3143 // If the tail is to be folded by masking, round the number of iterations N 3144 // up to a multiple of Step instead of rounding down. This is done by first 3145 // adding Step-1 and then rounding down. Note that it's ok if this addition 3146 // overflows: the vector induction variable will eventually wrap to zero given 3147 // that it starts at zero and its Step is a power of two; the loop will then 3148 // exit, with the last early-exit vector comparison also producing all-true. 3149 if (Cost->foldTailByMasking()) { 3150 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3151 "VF*UF must be a power of 2 when folding tail by masking"); 3152 assert(!VF.isScalable() && 3153 "Tail folding not yet supported for scalable vectors"); 3154 TC = Builder.CreateAdd( 3155 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3156 } 3157 3158 // Now we need to generate the expression for the part of the loop that the 3159 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3160 // iterations are not required for correctness, or N - Step, otherwise. 
Step 3161 // is equal to the vectorization factor (number of SIMD elements) times the 3162 // unroll factor (number of SIMD instructions). 3163 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3164 3165 // There are two cases where we need to ensure (at least) the last iteration 3166 // runs in the scalar remainder loop. Thus, if the step evenly divides 3167 // the trip count, we set the remainder to be equal to the step. If the step 3168 // does not evenly divide the trip count, no adjustment is necessary since 3169 // there will already be scalar iterations. Note that the minimum iterations 3170 // check ensures that N >= Step. The cases are: 3171 // 1) If there is a non-reversed interleaved group that may speculatively 3172 // access memory out-of-bounds. 3173 // 2) If any instruction may follow a conditionally taken exit. That is, if 3174 // the loop contains multiple exiting blocks, or a single exiting block 3175 // which is not the latch. 3176 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3177 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3178 R = Builder.CreateSelect(IsZero, Step, R); 3179 } 3180 3181 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3182 3183 return VectorTripCount; 3184 } 3185 3186 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3187 const DataLayout &DL) { 3188 // Verify that V is a vector type with same number of elements as DstVTy. 3189 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3190 unsigned VF = DstFVTy->getNumElements(); 3191 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3192 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3193 Type *SrcElemTy = SrcVecTy->getElementType(); 3194 Type *DstElemTy = DstFVTy->getElementType(); 3195 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3196 "Vector elements must have same size"); 3197 3198 // Do a direct cast if element types are castable. 
3199 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3200 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3201 } 3202 // V cannot be directly casted to desired vector type. 3203 // May happen when V is a floating point vector but DstVTy is a vector of 3204 // pointers or vice-versa. Handle this using a two-step bitcast using an 3205 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3206 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3207 "Only one type should be a pointer type"); 3208 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3209 "Only one type should be a floating point type"); 3210 Type *IntTy = 3211 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3212 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3213 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3214 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3215 } 3216 3217 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3218 BasicBlock *Bypass) { 3219 Value *Count = getOrCreateTripCount(L); 3220 // Reuse existing vector loop preheader for TC checks. 3221 // Note that new preheader block is generated for vector loop. 3222 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3223 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3224 3225 // Generate code to check if the loop's trip count is less than VF * UF, or 3226 // equal to it in case a scalar epilogue is required; this implies that the 3227 // vector trip count is zero. This check also covers the case where adding one 3228 // to the backedge-taken count overflowed leading to an incorrect trip count 3229 // of zero. In this case we will also jump to the scalar loop. 3230 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3231 : ICmpInst::ICMP_ULT; 3232 3233 // If tail is to be folded, vector loop takes care of all iterations. 
3234 Value *CheckMinIters = Builder.getFalse(); 3235 if (!Cost->foldTailByMasking()) { 3236 Value *Step = 3237 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3238 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3239 } 3240 // Create new preheader for vector loop. 3241 LoopVectorPreHeader = 3242 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3243 "vector.ph"); 3244 3245 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3246 DT->getNode(Bypass)->getIDom()) && 3247 "TC check is expected to dominate Bypass"); 3248 3249 // Update dominator for Bypass & LoopExit. 3250 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3251 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3252 3253 ReplaceInstWithInst( 3254 TCCheckBlock->getTerminator(), 3255 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3256 LoopBypassBlocks.push_back(TCCheckBlock); 3257 } 3258 3259 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3260 3261 BasicBlock *const SCEVCheckBlock = 3262 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3263 if (!SCEVCheckBlock) 3264 return nullptr; 3265 3266 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3267 (OptForSizeBasedOnProfile && 3268 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3269 "Cannot SCEV check stride or overflow when optimizing for size"); 3270 3271 3272 // Update dominator only if this is first RT check. 3273 if (LoopBypassBlocks.empty()) { 3274 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3275 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3276 } 3277 3278 LoopBypassBlocks.push_back(SCEVCheckBlock); 3279 AddedSafetyChecks = true; 3280 return SCEVCheckBlock; 3281 } 3282 3283 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3284 BasicBlock *Bypass) { 3285 // VPlan-native path does not do any analysis for runtime checks currently. 
3286 if (EnableVPlanNativePath) 3287 return nullptr; 3288 3289 BasicBlock *const MemCheckBlock = 3290 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3291 3292 // Check if we generated code that checks in runtime if arrays overlap. We put 3293 // the checks into a separate block to make the more common case of few 3294 // elements faster. 3295 if (!MemCheckBlock) 3296 return nullptr; 3297 3298 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3299 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3300 "Cannot emit memory checks when optimizing for size, unless forced " 3301 "to vectorize."); 3302 ORE->emit([&]() { 3303 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3304 L->getStartLoc(), L->getHeader()) 3305 << "Code-size may be reduced by not forcing " 3306 "vectorization, or by source-code modifications " 3307 "eliminating the need for runtime checks " 3308 "(e.g., adding 'restrict')."; 3309 }); 3310 } 3311 3312 LoopBypassBlocks.push_back(MemCheckBlock); 3313 3314 AddedSafetyChecks = true; 3315 3316 // We currently don't use LoopVersioning for the actual loop cloning but we 3317 // still use it to add the noalias metadata. 3318 LVer = std::make_unique<LoopVersioning>( 3319 *Legal->getLAI(), 3320 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3321 DT, PSE.getSE()); 3322 LVer->prepareNoAliasMetadata(); 3323 return MemCheckBlock; 3324 } 3325 3326 Value *InnerLoopVectorizer::emitTransformedIndex( 3327 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3328 const InductionDescriptor &ID) const { 3329 3330 SCEVExpander Exp(*SE, DL, "induction"); 3331 auto Step = ID.getStep(); 3332 auto StartValue = ID.getStartValue(); 3333 assert(Index->getType() == Step->getType() && 3334 "Index type does not match StepValue type"); 3335 3336 // Note: the IR at this point is broken. 
We cannot use SE to create any new 3337 // SCEV and then expand it, hoping that SCEV's simplification will give us 3338 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3339 // lead to various SCEV crashes. So all we can do is to use builder and rely 3340 // on InstCombine for future simplifications. Here we handle some trivial 3341 // cases only. 3342 auto CreateAdd = [&B](Value *X, Value *Y) { 3343 assert(X->getType() == Y->getType() && "Types don't match!"); 3344 if (auto *CX = dyn_cast<ConstantInt>(X)) 3345 if (CX->isZero()) 3346 return Y; 3347 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3348 if (CY->isZero()) 3349 return X; 3350 return B.CreateAdd(X, Y); 3351 }; 3352 3353 auto CreateMul = [&B](Value *X, Value *Y) { 3354 assert(X->getType() == Y->getType() && "Types don't match!"); 3355 if (auto *CX = dyn_cast<ConstantInt>(X)) 3356 if (CX->isOne()) 3357 return Y; 3358 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3359 if (CY->isOne()) 3360 return X; 3361 return B.CreateMul(X, Y); 3362 }; 3363 3364 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3365 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3366 // the DomTree is not kept up-to-date for additional blocks generated in the 3367 // vector loop. By using the header as insertion point, we guarantee that the 3368 // expanded instructions dominate all their uses. 
3369 auto GetInsertPoint = [this, &B]() { 3370 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3371 if (InsertBB != LoopVectorBody && 3372 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3373 return LoopVectorBody->getTerminator(); 3374 return &*B.GetInsertPoint(); 3375 }; 3376 3377 switch (ID.getKind()) { 3378 case InductionDescriptor::IK_IntInduction: { 3379 assert(Index->getType() == StartValue->getType() && 3380 "Index type does not match StartValue type"); 3381 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3382 return B.CreateSub(StartValue, Index); 3383 auto *Offset = CreateMul( 3384 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3385 return CreateAdd(StartValue, Offset); 3386 } 3387 case InductionDescriptor::IK_PtrInduction: { 3388 assert(isa<SCEVConstant>(Step) && 3389 "Expected constant step for pointer induction"); 3390 return B.CreateGEP( 3391 StartValue->getType()->getPointerElementType(), StartValue, 3392 CreateMul(Index, 3393 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3394 } 3395 case InductionDescriptor::IK_FpInduction: { 3396 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3397 auto InductionBinOp = ID.getInductionBinOp(); 3398 assert(InductionBinOp && 3399 (InductionBinOp->getOpcode() == Instruction::FAdd || 3400 InductionBinOp->getOpcode() == Instruction::FSub) && 3401 "Original bin op should be defined for FP induction"); 3402 3403 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3404 Value *MulExp = B.CreateFMul(StepValue, Index); 3405 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3406 "induction"); 3407 } 3408 case InductionDescriptor::IK_NoInduction: 3409 return nullptr; 3410 } 3411 llvm_unreachable("invalid enum"); 3412 } 3413 3414 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3415 LoopScalarBody = OrigLoop->getHeader(); 3416 LoopVectorPreHeader = 
OrigLoop->getLoopPreheader(); 3417 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3418 assert(LoopExitBlock && "Must have an exit block"); 3419 assert(LoopVectorPreHeader && "Invalid loop structure"); 3420 3421 LoopMiddleBlock = 3422 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3423 LI, nullptr, Twine(Prefix) + "middle.block"); 3424 LoopScalarPreHeader = 3425 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3426 nullptr, Twine(Prefix) + "scalar.ph"); 3427 3428 // Set up branch from middle block to the exit and scalar preheader blocks. 3429 // completeLoopSkeleton will update the condition to use an iteration check, 3430 // if required to decide whether to execute the remainder. 3431 BranchInst *BrInst = 3432 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3433 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3434 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3435 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3436 3437 // We intentionally don't let SplitBlock to update LoopInfo since 3438 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3439 // LoopVectorBody is explicitly added to the correct place few lines later. 3440 LoopVectorBody = 3441 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3442 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3443 3444 // Update dominator for loop exit. 3445 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3446 3447 // Create and register the new vector loop. 3448 Loop *Lp = LI->AllocateLoop(); 3449 Loop *ParentLoop = OrigLoop->getParentLoop(); 3450 3451 // Insert the new loop into the loop nest and register the new basic blocks 3452 // before calling any utilities such as SCEV that require valid LoopInfo. 
3453 if (ParentLoop) { 3454 ParentLoop->addChildLoop(Lp); 3455 } else { 3456 LI->addTopLevelLoop(Lp); 3457 } 3458 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3459 return Lp; 3460 } 3461 3462 void InnerLoopVectorizer::createInductionResumeValues( 3463 Loop *L, Value *VectorTripCount, 3464 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3465 assert(VectorTripCount && L && "Expected valid arguments"); 3466 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3467 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3468 "Inconsistent information about additional bypass."); 3469 // We are going to resume the execution of the scalar loop. 3470 // Go over all of the induction variables that we found and fix the 3471 // PHIs that are left in the scalar version of the loop. 3472 // The starting values of PHI nodes depend on the counter of the last 3473 // iteration in the vectorized loop. 3474 // If we come from a bypass edge then we need to start from the original 3475 // start value. 3476 for (auto &InductionEntry : Legal->getInductionVars()) { 3477 PHINode *OrigPhi = InductionEntry.first; 3478 InductionDescriptor II = InductionEntry.second; 3479 3480 // Create phi nodes to merge from the backedge-taken check block. 3481 PHINode *BCResumeVal = 3482 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3483 LoopScalarPreHeader->getTerminator()); 3484 // Copy original phi DL over to the new one. 3485 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3486 Value *&EndValue = IVEndValues[OrigPhi]; 3487 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3488 if (OrigPhi == OldInduction) { 3489 // We know what the end value is. 3490 EndValue = VectorTripCount; 3491 } else { 3492 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3493 3494 // Fast-math-flags propagate from the original induction instruction. 
3495 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3496 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3497 3498 Type *StepType = II.getStep()->getType(); 3499 Instruction::CastOps CastOp = 3500 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3501 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3502 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3503 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3504 EndValue->setName("ind.end"); 3505 3506 // Compute the end value for the additional bypass (if applicable). 3507 if (AdditionalBypass.first) { 3508 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3509 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3510 StepType, true); 3511 CRD = 3512 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3513 EndValueFromAdditionalBypass = 3514 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3515 EndValueFromAdditionalBypass->setName("ind.end"); 3516 } 3517 } 3518 // The new PHI merges the original incoming value, in case of a bypass, 3519 // or the value at the end of the vectorized loop. 3520 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3521 3522 // Fix the scalar body counter (PHI node). 3523 // The old induction's phi node in the scalar body needs the truncated 3524 // value. 3525 for (BasicBlock *BB : LoopBypassBlocks) 3526 BCResumeVal->addIncoming(II.getStartValue(), BB); 3527 3528 if (AdditionalBypass.first) 3529 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3530 EndValueFromAdditionalBypass); 3531 3532 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3533 } 3534 } 3535 3536 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3537 MDNode *OrigLoopID) { 3538 assert(L && "Expected valid loop."); 3539 3540 // The trip counts should be cached by now. 
3541 Value *Count = getOrCreateTripCount(L); 3542 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3543 3544 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3545 3546 // Add a check in the middle block to see if we have completed 3547 // all of the iterations in the first vector loop. 3548 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3549 // If tail is to be folded, we know we don't need to run the remainder. 3550 if (!Cost->foldTailByMasking()) { 3551 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3552 Count, VectorTripCount, "cmp.n", 3553 LoopMiddleBlock->getTerminator()); 3554 3555 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3556 // of the corresponding compare because they may have ended up with 3557 // different line numbers and we want to avoid awkward line stepping while 3558 // debugging. Eg. if the compare has got a line number inside the loop. 3559 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3560 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3561 } 3562 3563 // Get ready to start creating new instructions into the vectorized body. 3564 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3565 "Inconsistent vector loop preheader"); 3566 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3567 3568 Optional<MDNode *> VectorizedLoopID = 3569 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3570 LLVMLoopVectorizeFollowupVectorized}); 3571 if (VectorizedLoopID.hasValue()) { 3572 L->setLoopID(VectorizedLoopID.getValue()); 3573 3574 // Do not setAlreadyVectorized if loop attributes have been defined 3575 // explicitly. 3576 return LoopVectorPreHeader; 3577 } 3578 3579 // Keep all loop hints from the original loop on the vector loop (we'll 3580 // replace the vectorizer-specific hints below). 
3581 if (MDNode *LID = OrigLoop->getLoopID()) 3582 L->setLoopID(LID); 3583 3584 LoopVectorizeHints Hints(L, true, *ORE); 3585 Hints.setAlreadyVectorized(); 3586 3587 #ifdef EXPENSIVE_CHECKS 3588 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3589 LI->verify(*DT); 3590 #endif 3591 3592 return LoopVectorPreHeader; 3593 } 3594 3595 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3596 /* 3597 In this function we generate a new loop. The new loop will contain 3598 the vectorized instructions while the old loop will continue to run the 3599 scalar remainder. 3600 3601 [ ] <-- loop iteration number check. 3602 / | 3603 / v 3604 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3605 | / | 3606 | / v 3607 || [ ] <-- vector pre header. 3608 |/ | 3609 | v 3610 | [ ] \ 3611 | [ ]_| <-- vector loop. 3612 | | 3613 | v 3614 | -[ ] <--- middle-block. 3615 | / | 3616 | / v 3617 -|- >[ ] <--- new preheader. 3618 | | 3619 | v 3620 | [ ] \ 3621 | [ ]_| <-- old scalar loop to handle remainder. 3622 \ | 3623 \ v 3624 >[ ] <-- exit block. 3625 ... 3626 */ 3627 3628 // Get the metadata of the original loop before it gets modified. 3629 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3630 3631 // Create an empty vector loop, and prepare basic blocks for the runtime 3632 // checks. 3633 Loop *Lp = createVectorLoopSkeleton(""); 3634 3635 // Now, compare the new count to zero. If it is zero skip the vector loop and 3636 // jump to the scalar loop. This check also covers the case where the 3637 // backedge-taken count is uint##_max: adding one to it will overflow leading 3638 // to an incorrect trip count of zero. In this (rare) case we will also jump 3639 // to the scalar loop. 3640 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3641 3642 // Generate the code to check any assumptions that we've made for SCEV 3643 // expressions. 
3644 emitSCEVChecks(Lp, LoopScalarPreHeader); 3645 3646 // Generate the code that checks in runtime if arrays overlap. We put the 3647 // checks into a separate block to make the more common case of few elements 3648 // faster. 3649 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3650 3651 // Some loops have a single integer induction variable, while other loops 3652 // don't. One example is c++ iterators that often have multiple pointer 3653 // induction variables. In the code below we also support a case where we 3654 // don't have a single induction variable. 3655 // 3656 // We try to obtain an induction variable from the original loop as hard 3657 // as possible. However if we don't find one that: 3658 // - is an integer 3659 // - counts from zero, stepping by one 3660 // - is the size of the widest induction variable type 3661 // then we create a new one. 3662 OldInduction = Legal->getPrimaryInduction(); 3663 Type *IdxTy = Legal->getWidestInductionType(); 3664 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3665 // The loop step is equal to the vectorization factor (num of SIMD elements) 3666 // times the unroll factor (num of SIMD instructions). 3667 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3668 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3669 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3670 Induction = 3671 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3672 getDebugLocFromInstOrOperands(OldInduction)); 3673 3674 // Emit phis for the new starting index of the scalar loop. 3675 createInductionResumeValues(Lp, CountRoundDown); 3676 3677 return completeLoopSkeleton(Lp, OrigLoopID); 3678 } 3679 3680 // Fix up external users of the induction variable. At this point, we are 3681 // in LCSSA form, with all external PHIs that use the IV having one input value, 3682 // coming from the remainder loop. 
We need those PHIs to also have a correct 3683 // value for the IV when arriving directly from the middle block. 3684 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3685 const InductionDescriptor &II, 3686 Value *CountRoundDown, Value *EndValue, 3687 BasicBlock *MiddleBlock) { 3688 // There are two kinds of external IV usages - those that use the value 3689 // computed in the last iteration (the PHI) and those that use the penultimate 3690 // value (the value that feeds into the phi from the loop latch). 3691 // We allow both, but they, obviously, have different values. 3692 3693 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3694 3695 DenseMap<Value *, Value *> MissingVals; 3696 3697 // An external user of the last iteration's value should see the value that 3698 // the remainder loop uses to initialize its own IV. 3699 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3700 for (User *U : PostInc->users()) { 3701 Instruction *UI = cast<Instruction>(U); 3702 if (!OrigLoop->contains(UI)) { 3703 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3704 MissingVals[UI] = EndValue; 3705 } 3706 } 3707 3708 // An external user of the penultimate value need to see EndValue - Step. 3709 // The simplest way to get this is to recompute it from the constituent SCEVs, 3710 // that is Start + (Step * (CRD - 1)). 3711 for (User *U : OrigPhi->users()) { 3712 auto *UI = cast<Instruction>(U); 3713 if (!OrigLoop->contains(UI)) { 3714 const DataLayout &DL = 3715 OrigLoop->getHeader()->getModule()->getDataLayout(); 3716 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3717 3718 IRBuilder<> B(MiddleBlock->getTerminator()); 3719 3720 // Fast-math-flags propagate from the original induction instruction. 
      // Fast-math flags propagate from the original induction binop (if any)
      // so the escape-value computation below preserves FP semantics.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      // Compute the value of the IV on the penultimate (trip-count - 1)
      // iteration; this is the value an external user observes when the
      // scalar remainder loop is skipped.
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {

// DenseMap traits that hash/compare instructions structurally (opcode plus
// operand values) so that identical redundant instructions collapse to one
// map entry during the CSE pass below.
struct CSEDenseMapInfo {
  // Only these instruction kinds participate in CSE; they are the redundant
  // address/shuffle computations the vectorizer itself emits.
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys must only compare equal to themselves; isIdenticalTo
    // would dereference them.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform common subexpression elimination of induction variable
/// instructions within the single basic block \p BB.
static void cse(BasicBlock *BB) {
  // Perform simple cse: a single forward pass, replacing each handled
  // instruction with the first structurally-identical one seen before it.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    // Advance the iterator before a possible erase of *I.
    Instruction *In = &*I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}

// Compute the cost of materializing call \p CI at vectorization factor \p VF,
// either by scalarizing it (VF scalar calls plus extract/insert overhead) or
// by calling a vector library function if one is available. On return,
// \p NeedToScalarize is true iff the scalarized form was the cheaper (or only)
// option.
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
                                              bool &NeedToScalarize) const {
  Function *F = CI->getCalledFunction();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->arg_operands())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
  if (VF.isScalar())
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

  InstructionCost Cost =
      ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
  NeedToScalarize = true;
  VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

  if (!TLI || CI->isNoBuiltin() || !VecFunc)
    return Cost;

  // If the corresponding vector cost is cheaper, return its cost.
  InstructionCost VectorCallCost =
      TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }
  return Cost;
}

// Widen \p Elt to a vector of \p VF elements, but only for element types that
// can legally be vectorized (integers, pointers, floats); anything else (and
// scalar VF) is returned unchanged.
static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
    return Elt;
  return VectorType::get(Elt, VF);
}

// Compute the cost of the vector intrinsic that corresponds to call \p CI at
// vectorization factor \p VF, widening the return and parameter types and
// propagating the call's fast-math flags into the cost query.
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");
  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  SmallVector<Type *> ParamTys;
  std::transform(FTy->param_begin(), FTy->param_end(),
                 std::back_inserter(ParamTys),
                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  return TTI.getIntrinsicInstrCost(CostAttrs,
                                   TargetTransformInfo::TCK_RecipThroughput);
}

// Of the two integer vector types, return the one with the narrower element
// type. Both arguments must be vectors of integers.
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

// Of the two integer vector types, return the one with the wider element
// type. Both arguments must be vectors of integers.
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() > I2->getBitWidth() ?
                                                 T1 : T2;
}

void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    VPValue *Def = State.Plan->getVPValue(KV.first);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      // Skip values already erased in an earlier part, dead values, and
      // non-instructions (e.g. constant-folded results).
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      auto *TruncatedTy = FixedVectorType::get(
          ScalarTruncatedTy,
          cast<FixedVectorType>(OriginalTy)->getNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      // Narrow an operand to TruncatedTy, peeking through a zext from that
      // exact type to avoid emitting a redundant trunc(zext(x)) pair.
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // Narrow both shuffle inputs to the truncated element type, keeping
        // each operand's own element count.
        auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
                             ->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
                             ->getNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1),
            FixedVectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
                            ->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
                            ->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      State.reset(Def, Res, Part);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    VPValue *Def = State.Plan->getVPValue(KV.first);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      // A zext whose only use was the erased original instruction is dead;
      // strip it and record its operand as the part's value instead.
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs(State);
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Fix-up external users of the induction variables.
  for (auto &Entry : Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs(State);
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the
  // weight to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi, State);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi, State);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
                                                  VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value: place the scalar init in the last
  // lane of an otherwise-poison vector, matching the shuffle mask built below.
  auto *VectorInit = ScalarInit;
  if (VF.isVector()) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VectorType::get(VectorInit->getType(), VF)),
        VectorInit, Builder.getInt32(VF.getKnownMinValue() - 1),
        "vector.recur.init");
  }

  VPValue *PhiDef = State.Plan->getVPValue(Phi);
  VPValue *PreviousDef = State.Plan->getVPValue(Previous);
  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = State.get(PreviousDef, UF - 1);

  // Find and set the insertion point after the previous value if it is an
  // instruction.
  BasicBlock::iterator InsertPt;
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop.
  // FIXME: Loop invariant values do not form recurrences. We should deal with
  // them earlier.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
    InsertPt = LoopVectorBody->getFirstInsertionPt();
  else {
    Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
    if (isa<PHINode>(PreviousLastPart))
      // If the previous value is a phi node, we should insert after all the
      // phi nodes in the block containing the PHI to avoid breaking basic
      // block verification. Note that the basic block may be different to
      // LoopVectorBody, in case we predicate the loop.
      InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
    else
      InsertPt = ++PreviousInst->getIterator();
  }
  Builder.SetInsertPoint(&*InsertPt);

  // We will construct a vector for the recurrence by combining the values for
  // the current and previous iterations. This is the required shuffle mask:
  // lane 0 takes the last lane of the previous vector, lanes 1..VF-1 take
  // lanes 0..VF-2 of the current vector.
  assert(!VF.isScalable());
  SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
  ShuffleMask[0] = VF.getKnownMinValue() - 1;
  for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
    ShuffleMask[I] = I + VF.getKnownMinValue() - 1;

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = State.get(PreviousDef, Part);
    Value *PhiPart = State.get(PhiDef, Part);
    auto *Shuffle =
        VF.isVector()
            ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
            : Incoming;
    // Replace the temporary phi created in phase one with the shuffle.
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    State.reset(PhiDef, Shuffle, Part);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming,
                      LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF.isVector()) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    ExtractForScalar = Builder.CreateExtractElement(
        ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
        "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector())
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
        "vector.recur.extract.for.phi");
  // When loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
  // of `Incoming`. This is analogous to the vectorized case above: extracting
  // the second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ?
                         ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, the exiting path through middle will be
  // dynamically dead and the value picked for the phi doesn't matter.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis())
    if (any_of(LCSSAPhi.incoming_values(),
               [Phi](Value *V) { return V == Phi; }))
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}

// Return true if this reduction must be executed in (strict) program order,
// i.e. it is an ordered reduction and strict reductions are enabled.
static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
  return EnableStrictReductions && RdxDesc.isOrdered();
}

void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  setDebugLocFromInst(Builder, ReductionStartValue);
  bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);

  VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(RdxDesc, State);

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
  Value *OrigLoopVal = Phi->getIncomingValueForBlock(OrigLatch);
  BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();

  bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
                   useOrderedReductions(RdxDesc);

  for (unsigned Part = 0; Part < UF; ++Part) {
    // An ordered reduction is carried through a single serialized chain, so
    // only part 0's phi needs (and gets) a latch value.
    if (IsOrdered && Part > 0)
      break;
    Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
    Value *Val = State.get(State.Plan->getVPValue(OrigLoopVal), Part);
    if (IsOrdered)
      Val = State.get(State.Plan->getVPValue(OrigLoopVal), UF - 1);
    cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  Type *PhiTy = Phi->getType();
  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former. For an inloop reduction the reduction will already
  // be predicated, and does not need to be handled here.
  if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      // Locate the single select fed by the exit instruction; all its other
      // users must be phis.
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

      // If the target can create a predicated operator for the reduction at no
      // extra cost in the loop (for example a predicated vadd), it can be
      // cheaper for the select to remain in the loop than be sunk out of it,
      // and so use the select value for the phi instead of the old
      // LoopExitValue.
      if (PreferPredicatedReductionSelect ||
          TTI->preferPredicatedReductionSelect(
              RdxDesc.getOpcode(), PhiTy,
              TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi =
            cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part));
        VecRdxPhi->setIncomingValueForBlock(
            LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
      }
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
    assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = State.get(LoopExitInstDef, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      // Rewrite every user except the trunc itself to use the extended value.
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      State.reset(LoopExitInstDef, RdxParts[Part], Part);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
  unsigned Op = RecurrenceDescriptor::getOpcode(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
  if (IsOrdered)
    // Ordered reductions already accumulated serially; the last part holds
    // the final value.
    ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
  else {
    // Floating-point operations should have some FMF to enable the reduction.
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
        ReducedPartRdx = Builder.CreateBinOp(
            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
      } else {
        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }
  }

  // Create the reduction after the loop. Note that inloop reductions create
  // the target reduction in the loop using a Reduction recipe.
  if (VF.isVector() && !IsInLoopReductionPhi) {
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (PhiTy != RdxDesc.getRecurrenceType())
      ReducedPartRdx = RdxDesc.isSigned()
                           ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                           : Builder.CreateZExt(ReducedPartRdx, PhiTy);
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks. See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis())
    if (any_of(LCSSAPhi.incoming_values(),
               [LoopExitInst](Value *V) { return V == LoopExitInst; }))
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}

void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                                                 VPTransformState &State) {
  // Only add/mul reductions carry nsw/nuw flags worth clearing.
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RK != RecurKind::Add && RK != RecurKind::Mul)
    return;

  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
  assert(LoopExitInstr && "null loop exit instruction");
  // Walk the use-graph from the exit instruction, dropping poison-generating
  // flags on every overflowing binary operator reached inside the loop.
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  Worklist.push_back(LoopExitInstr);
  Visited.insert(LoopExitInstr);

  while (!Worklist.empty()) {
    Instruction *Cur = Worklist.pop_back_val();
    if (isa<OverflowingBinaryOperator>(Cur))
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = State.get(State.Plan->getVPValue(Cur), Part);
        cast<Instruction>(V)->dropPoisonGeneratingFlags();
      }

    for (User *U : Cur->users()) {
      Instruction *UI = cast<Instruction>(U);
      if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
          Visited.insert(UI).second)
        Worklist.push_back(UI);
    }
  }
}

void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
      // Some phis were already hand updated by the reduction and recurrence
      // code above, leave them alone.
      continue;

    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
    // Non-instruction incoming values will have only one value.

    // Use the first lane for values that are uniform after vectorization,
    // the last lane otherwise (the last scalar value produced by the loop).
    VPLane Lane = VPLane::getFirstLane();
    if (isa<Instruction>(IncomingValue) &&
        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
                                           VF))
      Lane = VPLane::getLastLaneForVF(VF);

    // Can be a loop invariant incoming value or the last scalar value to be
    // extracted from the vectorized loop.
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    Value *lastIncomingValue =
        OrigLoop->isLoopInvariant(IncomingValue)
            ? IncomingValue
            : State.get(State.Plan->getVPValue(IncomingValue),
                        VPIteration(UF - 1, Lane));
    LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
  }
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}

// Wire up the incoming values of the placeholder vector phis created for the
// VPlan-native path (see widenPHIInstruction), now that all blocks and values
// have been generated.
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    VPWidenPHIRecipe *VPPhi =
        cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
    PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
    // Make sure the builder has a valid insert point.
    Builder.SetInsertPoint(NewPhi);
    for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
      VPValue *Inc = VPPhi->getIncomingValue(i);
      VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
      NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
    }
  }
}

void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
                                   VPUser &Operands, unsigned UF,
                                   ElementCount VF, bool IsPtrLoopInvariant,
                                   SmallBitVector &IsIndexLoopInvariant,
                                   VPTransformState &State) {
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
      State.set(VPDef, EntryPart, Part);
      addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(Operands.getOperand(0), VPIteration(0, 0))
                      : State.get(Operands.getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
        VPValue *Operand = Operands.getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP =
          GEP->isInBounds()
              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
                                          Indices)
              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
      assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(VPDef, NewGEP, Part);
      addMetadata(NewGEP, GEP);
    }
  }
}

// Widen a header phi: reduction/first-order-recurrence phis get empty vector
// phis (filled in later), pointer inductions are expanded here; in the
// VPlan-native path all phis get placeholder vector phis fixed up by
// fixNonInductionPHIs().
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                              RecurrenceDescriptor *RdxDesc,
                                              VPWidenPHIRecipe *PhiR,
                                              VPTransformState &State) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy = (State.VF.isScalar())
                      ? PN->getType()
                      : VectorType::get(PN->getType(), State.VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    State.set(PhiR, VecPhi, 0);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  VPValue *StartVPV = PhiR->getStartValue();
  Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
    Value *Iden = nullptr;
    // In-loop reductions (and VF == 1) keep a scalar phi.
    bool ScalarPHI =
        (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
    Type *VecTy =
        ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);

    if (RdxDesc) {
      assert(Legal->isReductionVariable(P) && StartV &&
             "RdxDesc should only be set for reduction variables; in that case "
             "a StartV is also required");
      RecurKind RK = RdxDesc->getRecurrenceKind();
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
        // MinMax reductions have the start value as their identity.
        if (ScalarPHI) {
          Iden = StartV;
        } else {
          IRBuilderBase::InsertPointGuard IPBuilder(Builder);
          Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
          StartV = Iden =
              Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
        }
      } else {
        // Other reductions start from the identity element with the scalar
        // start value inserted into lane 0.
        Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
            RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
        Iden = IdenC;

        if (!ScalarPHI) {
          Iden = ConstantVector::getSplat(State.VF, IdenC);
          IRBuilderBase::InsertPointGuard IPBuilder(Builder);
          Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
          Constant *Zero = Builder.getInt32(0);
          StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
        }
      }
    }

    bool IsOrdered = State.VF.isVector() &&
                     Cost->isInLoopReduction(cast<PHINode>(PN)) &&
                     useOrderedReductions(*RdxDesc);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      // Ordered reductions use a single phi, so stop after part 0.
      if (Part > 0 && IsOrdered)
        return;
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      State.set(PhiR, EntryPart, Part);
      if (StartV) {
        // Make sure to add the reduction start value only to the
        // first unroll part.
        Value *StartVal = (Part == 0) ? StartV : Iden;
        cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
      }
    }
    return;
  }

  assert(!Legal->isReductionVariable(P) &&
         "reductions should be handled above");

  setDebugLocFromInst(Builder, P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars().count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars().lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");

    if (Cost->isScalarAfterVectorization(P, State.VF)) {
      // This is the normalized GEP that starts counting at zero.
      Value *PtrInd =
          Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
      // Determine the number of scalars we need to generate for each unroll
      // iteration. If the instruction is uniform, we only need to generate the
      // first lane. Otherwise, we generate all VF values.
      bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
      assert((IsUniform || !VF.isScalable()) &&
             "Currently unsupported for scalable vectors");
      unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();

      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *PartStart = createStepForVF(
            Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
        for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
          Value *Idx = Builder.CreateAdd(
              PartStart, ConstantInt::get(PtrInd->getType(), Lane));
          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
          Value *SclrGep =
              emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
          SclrGep->setName("next.gep");
          State.set(PhiR, SclrGep, VPIteration(Part, Lane));
        }
      }
      return;
    }
    assert(isa<SCEVConstant>(II.getStep()) &&
           "Induction step not a SCEV constant!");
    Type *PhiType = II.getStep()->getType();

    // Build a pointer phi
    Value *ScalarStartValue = II.getStartValue();
    Type *ScStValueType = ScalarStartValue->getType();
    PHINode *NewPointerPhi =
        PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
    NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);

    // A pointer induction, performed by using a gep
    BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
    Instruction *InductionLoc = LoopLatch->getTerminator();
    const SCEV *ScalarStep = II.getStep();
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Value *ScalarStepValue =
        Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
    Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
    // The phi advances by step * VF * UF each vector iteration.
    Value *NumUnrolledElems =
        Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
    Value *InductionGEP = GetElementPtrInst::Create(
        ScStValueType->getPointerElementType(), NewPointerPhi,
        Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
        InductionLoc);
    NewPointerPhi->addIncoming(InductionGEP, LoopLatch);

    // Create UF many actual address geps that use the pointer
    // phi as base and a vectorized version of the step value
    // (<step*0, ..., step*N>) as offset.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Type *VecPhiType = VectorType::get(PhiType, State.VF);
      Value *StartOffsetScalar =
          Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
      Value *StartOffset =
          Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
      // Create a vector of consecutive numbers from zero to VF.
      StartOffset =
          Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));

      Value *GEP = Builder.CreateGEP(
          ScStValueType->getPointerElementType(), NewPointerPhi,
          Builder.CreateMul(
              StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
              "vector.gep"));
      State.set(PhiR, GEP, Part);
    }
  }
  }
}

/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
///       Non-zero divisors that are non compile-time constants will not be
///       converted into multiplication, so we will still end up scalarizing
///       the division, but can do so w/o predication.
static bool mayDivideByZero(Instruction &I) {
  assert((I.getOpcode() == Instruction::UDiv ||
          I.getOpcode() == Instruction::SDiv ||
          I.getOpcode() == Instruction::URem ||
          I.getOpcode() == Instruction::SRem) &&
         "Unexpected instruction");
  Value *Divisor = I.getOperand(1);
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
  // Conservatively treat any non-constant divisor (and the constant zero) as
  // possibly dividing by zero.
  return !CInt || CInt->isZero();
}

// Widen a single instruction by creating UF vector (or, for VF == 1, scalar)
// copies of it with vectorized operands taken from State.
void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
                                           VPUser &User,
                                           VPTransformState &State) {
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    setDebugLocFromInst(Builder, &I);

    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : User.operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      // Preserve nsw/nuw/fast-math flags from the scalar instruction.
      if (auto *VecOp = dyn_cast<Instruction>(V))
        VecOp->copyIRFlags(&I);

      // Use this vector value for all users of the original instruction.
      State.set(Def, V, Part);
      addMetadata(V, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    setDebugLocFromInst(Builder, Cmp);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(User.getOperand(0), Part);
      Value *B = State.get(User.getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(Def, C, Part);
      addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    setDebugLocFromInst(Builder, CI);

    /// Vectorize casts.
    Type *DestTy =
        (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(User.getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(Def, Cast, Part);
      addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}

// Widen a call either to a vector intrinsic or to a vector library function,
// whichever the cost model decided was cheaper.
void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
                                               VPUser &ArgOperands,
                                               VPTransformState &State) {
  assert(!isa<DbgInfoIntrinsic>(I) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  setDebugLocFromInst(Builder, &I);

  Module *M = I.getParent()->getParent()->getParent();
  auto *CI = cast<CallInst>(&I);

  // NOTE(review): Tys is built here but does not appear to be read anywhere
  // later in this function - candidate for removal; verify before deleting.
  SmallVector<Type *, 4> Tys;
  for (Value *ArgOperand : CI->arg_operands())
    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // The flag shows whether we use Intrinsic or a usual Call for vectorized
  // version of the instruction.
  // Is it beneficial to perform intrinsic call compared to lib call?
  bool NeedToScalarize = false;
  InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
  InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
  bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
  assert((UseVectorIntrinsic || !NeedToScalarize) &&
         "Instruction should be scalarized elsewhere.");
  assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
         "Either the intrinsic cost or vector call cost must be valid");

  for (unsigned Part = 0; Part < UF; ++Part) {
    SmallVector<Value *, 4> Args;
    // NOTE(review): this loop variable 'I' (an enumerate index/value pair)
    // shadows the CallInst parameter 'I' - consider renaming.
    for (auto &I : enumerate(ArgOperands.operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
        Arg = State.get(I.value(), Part);
      else
        Arg = State.get(I.value(), VPIteration(0, 0));
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseVectorIntrinsic) {
      // Use vector version of the intrinsic.
      Type *TysForDecl[] = {CI->getType()};
      if (VF.isVector())
        TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
      VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      // Use vector version of the function call.
      const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
#ifndef NDEBUG
      assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
             "Can't create vector function.");
#endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
  }
}

// Widen a select; an invariant condition is looked up once (first lane) and
// reused for all parts.
void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
                                                 VPUser &Operands,
                                                 bool InvariantCond,
                                                 VPTransformState &State) {
  setDebugLocFromInst(Builder, &I);

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond = InvariantCond
                        ? State.get(Operands.getOperand(0), VPIteration(0, 0))
                        : nullptr;

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *Cond =
        InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
    Value *Op0 = State.get(Operands.getOperand(1), Part);
    Value *Op1 = State.get(Operands.getOperand(2), Part);
    Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
    State.set(VPDef, Sel, Part);
    addMetadata(Sel, &I);
  }
}

// Compute the set of instructions that will remain scalar (one copy per lane)
// after vectorizing with the given VF, and record it in Scalars[VF].
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // True if Ptr is a pointer-induction phi whose use by MemAccess is scalar.
  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
    if (!isa<PHINode>(Ptr) ||
        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
      return false;
    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
      return false;
    return isScalarUse(MemAccess, Ptr);
  };

  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is being
  // inserted into Worklist. If the use will be a scalar use, and the
  // pointer is only used by memory accesses, we place the pointer in
  // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // Scalar pointer inductions (and their latch updates) are known scalar.
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // A pointer is scalar only if no use marked it possibly non-scalar.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    // Src stays scalar only if every in-loop user is scalar too.
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}

// Return true if \p I must be scalarized AND predicated when executed
// conditionally: masked loads/stores the target cannot widen, and integer
// divisions/remainders that may divide by zero.
bool LoopVectorizationCostModel::isScalarWithPredication(
    Instruction *I, ElementCount VF) const {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction, get that
    // result.
    if (VF.isVector()) {
      InstWidening WideningDecision = getWideningDecision(I, VF);
      assert(WideningDecision != CM_Unknown &&
             "Widening decision should be ready at this moment");
      return WideningDecision == CM_Scalarize;
    }
    // VF == 1: fall back to querying target legality of the masked forms.
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                                isLegalMaskedGather(Ty, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                isLegalMaskedScatter(Ty, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}

// Return true if the interleave group containing \p I can be vectorized as a
// wide (possibly masked) load/store rather than being scalarized.
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
5346 assert(useMaskedInterleavedAccesses(TTI) && 5347 "Masked interleave-groups for predicated accesses are not enabled."); 5348 5349 auto *Ty = getMemInstValueType(I); 5350 const Align Alignment = getLoadStoreAlignment(I); 5351 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5352 : TTI.isLegalMaskedStore(Ty, Alignment); 5353 } 5354 5355 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5356 Instruction *I, ElementCount VF) { 5357 // Get and ensure we have a valid memory instruction. 5358 LoadInst *LI = dyn_cast<LoadInst>(I); 5359 StoreInst *SI = dyn_cast<StoreInst>(I); 5360 assert((LI || SI) && "Invalid memory instruction"); 5361 5362 auto *Ptr = getLoadStorePointerOperand(I); 5363 5364 // In order to be widened, the pointer should be consecutive, first of all. 5365 if (!Legal->isConsecutivePtr(Ptr)) 5366 return false; 5367 5368 // If the instruction is a store located in a predicated block, it will be 5369 // scalarized. 5370 if (isScalarWithPredication(I)) 5371 return false; 5372 5373 // If the instruction's allocated size doesn't equal it's type size, it 5374 // requires padding and will be scalarized. 5375 auto &DL = I->getModule()->getDataLayout(); 5376 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5377 if (hasIrregularType(ScalarTy, DL)) 5378 return false; 5379 5380 return true; 5381 } 5382 5383 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5384 // We should not collect Uniforms more than once per VF. Right now, 5385 // this function is called from collectUniformsAndScalars(), which 5386 // already does this check. Collecting Uniforms for VF=1 does not make any 5387 // sense. 5388 5389 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5390 "This function should not be visited twice for the same VF"); 5391 5392 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5393 // not analyze again. Uniforms.count(VF) will return 1. 
5394 Uniforms[VF].clear(); 5395 5396 // We now know that the loop is vectorizable! 5397 // Collect instructions inside the loop that will remain uniform after 5398 // vectorization. 5399 5400 // Global values, params and instructions outside of current loop are out of 5401 // scope. 5402 auto isOutOfScope = [&](Value *V) -> bool { 5403 Instruction *I = dyn_cast<Instruction>(V); 5404 return (!I || !TheLoop->contains(I)); 5405 }; 5406 5407 SetVector<Instruction *> Worklist; 5408 BasicBlock *Latch = TheLoop->getLoopLatch(); 5409 5410 // Instructions that are scalar with predication must not be considered 5411 // uniform after vectorization, because that would create an erroneous 5412 // replicating region where only a single instance out of VF should be formed. 5413 // TODO: optimize such seldom cases if found important, see PR40816. 5414 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5415 if (isOutOfScope(I)) { 5416 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5417 << *I << "\n"); 5418 return; 5419 } 5420 if (isScalarWithPredication(I, VF)) { 5421 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5422 << *I << "\n"); 5423 return; 5424 } 5425 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5426 Worklist.insert(I); 5427 }; 5428 5429 // Start with the conditional branch. If the branch condition is an 5430 // instruction contained in the loop that is only used by the branch, it is 5431 // uniform. 5432 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5433 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5434 addToWorklistIfAllowed(Cmp); 5435 5436 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5437 InstWidening WideningDecision = getWideningDecision(I, VF); 5438 assert(WideningDecision != CM_Unknown && 5439 "Widening decision should be ready at this moment"); 5440 5441 // A uniform memory op is itself uniform. 
We exclude uniform stores 5442 // here as they demand the last lane, not the first one. 5443 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5444 assert(WideningDecision == CM_Scalarize); 5445 return true; 5446 } 5447 5448 return (WideningDecision == CM_Widen || 5449 WideningDecision == CM_Widen_Reverse || 5450 WideningDecision == CM_Interleave); 5451 }; 5452 5453 5454 // Returns true if Ptr is the pointer operand of a memory access instruction 5455 // I, and I is known to not require scalarization. 5456 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5457 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5458 }; 5459 5460 // Holds a list of values which are known to have at least one uniform use. 5461 // Note that there may be other uses which aren't uniform. A "uniform use" 5462 // here is something which only demands lane 0 of the unrolled iterations; 5463 // it does not imply that all lanes produce the same value (e.g. this is not 5464 // the usual meaning of uniform) 5465 SetVector<Value *> HasUniformUse; 5466 5467 // Scan the loop for instructions which are either a) known to have only 5468 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5469 for (auto *BB : TheLoop->blocks()) 5470 for (auto &I : *BB) { 5471 // If there's no pointer operand, there's nothing to do. 5472 auto *Ptr = getLoadStorePointerOperand(&I); 5473 if (!Ptr) 5474 continue; 5475 5476 // A uniform memory op is itself uniform. We exclude uniform stores 5477 // here as they demand the last lane, not the first one. 5478 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5479 addToWorklistIfAllowed(&I); 5480 5481 if (isUniformDecision(&I, VF)) { 5482 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5483 HasUniformUse.insert(Ptr); 5484 } 5485 } 5486 5487 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5488 // demanding) users. 
Since loops are assumed to be in LCSSA form, this 5489 // disallows uses outside the loop as well. 5490 for (auto *V : HasUniformUse) { 5491 if (isOutOfScope(V)) 5492 continue; 5493 auto *I = cast<Instruction>(V); 5494 auto UsersAreMemAccesses = 5495 llvm::all_of(I->users(), [&](User *U) -> bool { 5496 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5497 }); 5498 if (UsersAreMemAccesses) 5499 addToWorklistIfAllowed(I); 5500 } 5501 5502 // Expand Worklist in topological order: whenever a new instruction 5503 // is added , its users should be already inside Worklist. It ensures 5504 // a uniform instruction will only be used by uniform instructions. 5505 unsigned idx = 0; 5506 while (idx != Worklist.size()) { 5507 Instruction *I = Worklist[idx++]; 5508 5509 for (auto OV : I->operand_values()) { 5510 // isOutOfScope operands cannot be uniform instructions. 5511 if (isOutOfScope(OV)) 5512 continue; 5513 // First order recurrence Phi's should typically be considered 5514 // non-uniform. 5515 auto *OP = dyn_cast<PHINode>(OV); 5516 if (OP && Legal->isFirstOrderRecurrence(OP)) 5517 continue; 5518 // If all the users of the operand are uniform, then add the 5519 // operand into the uniform worklist. 5520 auto *OI = cast<Instruction>(OV); 5521 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5522 auto *J = cast<Instruction>(U); 5523 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5524 })) 5525 addToWorklistIfAllowed(OI); 5526 } 5527 } 5528 5529 // For an instruction to be added into Worklist above, all its users inside 5530 // the loop should also be in Worklist. However, this condition cannot be 5531 // true for phi nodes that form a cyclic dependence. We must process phi 5532 // nodes separately. An induction variable will remain uniform if all users 5533 // of the induction variable and induction variable update remain uniform. 5534 // The code below handles both pointer and non-pointer induction variables. 
5535 for (auto &Induction : Legal->getInductionVars()) { 5536 auto *Ind = Induction.first; 5537 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5538 5539 // Determine if all users of the induction variable are uniform after 5540 // vectorization. 5541 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5542 auto *I = cast<Instruction>(U); 5543 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5544 isVectorizedMemAccessUse(I, Ind); 5545 }); 5546 if (!UniformInd) 5547 continue; 5548 5549 // Determine if all users of the induction variable update instruction are 5550 // uniform after vectorization. 5551 auto UniformIndUpdate = 5552 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5553 auto *I = cast<Instruction>(U); 5554 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5555 isVectorizedMemAccessUse(I, IndUpdate); 5556 }); 5557 if (!UniformIndUpdate) 5558 continue; 5559 5560 // The induction variable and its update instruction will remain uniform. 5561 addToWorklistIfAllowed(Ind); 5562 addToWorklistIfAllowed(IndUpdate); 5563 } 5564 5565 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5566 } 5567 5568 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5569 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5570 5571 if (Legal->getRuntimePointerChecking()->Need) { 5572 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5573 "runtime pointer checks needed. Enable vectorization of this " 5574 "loop with '#pragma clang loop vectorize(enable)' when " 5575 "compiling with -Os/-Oz", 5576 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5577 return true; 5578 } 5579 5580 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5581 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5582 "runtime SCEV checks needed. 
Enable vectorization of this " 5583 "loop with '#pragma clang loop vectorize(enable)' when " 5584 "compiling with -Os/-Oz", 5585 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5586 return true; 5587 } 5588 5589 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5590 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5591 reportVectorizationFailure("Runtime stride check for small trip count", 5592 "runtime stride == 1 checks needed. Enable vectorization of " 5593 "this loop without such check by compiling with -Os/-Oz", 5594 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5595 return true; 5596 } 5597 5598 return false; 5599 } 5600 5601 ElementCount 5602 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5603 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5604 reportVectorizationInfo( 5605 "Disabling scalable vectorization, because target does not " 5606 "support scalable vectors.", 5607 "ScalableVectorsUnsupported", ORE, TheLoop); 5608 return ElementCount::getScalable(0); 5609 } 5610 5611 auto MaxScalableVF = ElementCount::getScalable(1u << 16); 5612 5613 // Disable scalable vectorization if the loop contains unsupported reductions. 5614 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5615 // FIXME: While for scalable vectors this is currently sufficient, this should 5616 // be replaced by a more detailed mechanism that filters out specific VFs, 5617 // instead of invalidating vectorization for a whole set of VFs based on the 5618 // MaxVF. 5619 if (!canVectorizeReductions(MaxScalableVF)) { 5620 reportVectorizationInfo( 5621 "Scalable vectorization not supported for the reduction " 5622 "operations found in this loop.", 5623 "ScalableVFUnfeasible", ORE, TheLoop); 5624 return ElementCount::getScalable(0); 5625 } 5626 5627 if (Legal->isSafeForAnyVectorWidth()) 5628 return MaxScalableVF; 5629 5630 // Limit MaxScalableVF by the maximum safe dependence distance. 
5631 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5632 MaxScalableVF = ElementCount::getScalable( 5633 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5634 if (!MaxScalableVF) 5635 reportVectorizationInfo( 5636 "Max legal vector width too small, scalable vectorization " 5637 "unfeasible.", 5638 "ScalableVFUnfeasible", ORE, TheLoop); 5639 5640 return MaxScalableVF; 5641 } 5642 5643 ElementCount 5644 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5645 ElementCount UserVF) { 5646 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5647 unsigned SmallestType, WidestType; 5648 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5649 5650 // Get the maximum safe dependence distance in bits computed by LAA. 5651 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5652 // the memory accesses that is most restrictive (involved in the smallest 5653 // dependence distance). 5654 unsigned MaxSafeElements = 5655 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5656 5657 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5658 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5659 5660 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5661 << ".\n"); 5662 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5663 << ".\n"); 5664 5665 // First analyze the UserVF, fall back if the UserVF should be ignored. 5666 if (UserVF) { 5667 auto MaxSafeUserVF = 5668 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5669 5670 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) 5671 return UserVF; 5672 5673 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5674 5675 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5676 // is better to ignore the hint and let the compiler choose a suitable VF. 
5677 if (!UserVF.isScalable()) { 5678 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5679 << " is unsafe, clamping to max safe VF=" 5680 << MaxSafeFixedVF << ".\n"); 5681 ORE->emit([&]() { 5682 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5683 TheLoop->getStartLoc(), 5684 TheLoop->getHeader()) 5685 << "User-specified vectorization factor " 5686 << ore::NV("UserVectorizationFactor", UserVF) 5687 << " is unsafe, clamping to maximum safe vectorization factor " 5688 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5689 }); 5690 return MaxSafeFixedVF; 5691 } 5692 5693 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5694 << " is unsafe. Ignoring scalable UserVF.\n"); 5695 ORE->emit([&]() { 5696 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5697 TheLoop->getStartLoc(), 5698 TheLoop->getHeader()) 5699 << "User-specified vectorization factor " 5700 << ore::NV("UserVectorizationFactor", UserVF) 5701 << " is unsafe. Ignoring the hint to let the compiler pick a " 5702 "suitable VF."; 5703 }); 5704 } 5705 5706 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5707 << " / " << WidestType << " bits.\n"); 5708 5709 ElementCount MaxFixedVF = ElementCount::getFixed(1); 5710 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5711 WidestType, MaxSafeFixedVF)) 5712 MaxFixedVF = MaxVF; 5713 5714 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5715 WidestType, MaxSafeScalableVF)) 5716 // FIXME: Return scalable VF as well (to be added in future patch). 
5717 if (MaxVF.isScalable()) 5718 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5719 << "\n"); 5720 5721 return MaxFixedVF; 5722 } 5723 5724 Optional<ElementCount> 5725 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5726 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5727 // TODO: It may by useful to do since it's still likely to be dynamically 5728 // uniform if the target can skip. 5729 reportVectorizationFailure( 5730 "Not inserting runtime ptr check for divergent target", 5731 "runtime pointer checks needed. Not enabled for divergent target", 5732 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5733 return None; 5734 } 5735 5736 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5737 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5738 if (TC == 1) { 5739 reportVectorizationFailure("Single iteration (non) loop", 5740 "loop trip count is one, irrelevant for vectorization", 5741 "SingleIterationLoop", ORE, TheLoop); 5742 return None; 5743 } 5744 5745 switch (ScalarEpilogueStatus) { 5746 case CM_ScalarEpilogueAllowed: 5747 return computeFeasibleMaxVF(TC, UserVF); 5748 case CM_ScalarEpilogueNotAllowedUsePredicate: 5749 LLVM_FALLTHROUGH; 5750 case CM_ScalarEpilogueNotNeededUsePredicate: 5751 LLVM_DEBUG( 5752 dbgs() << "LV: vector predicate hint/switch found.\n" 5753 << "LV: Not allowing scalar epilogue, creating predicated " 5754 << "vector loop.\n"); 5755 break; 5756 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5757 // fallthrough as a special case of OptForSize 5758 case CM_ScalarEpilogueNotAllowedOptSize: 5759 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5760 LLVM_DEBUG( 5761 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5762 else 5763 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5764 << "count.\n"); 5765 5766 // Bail if runtime checks are required, which are not good when optimising 
5767 // for size. 5768 if (runtimeChecksRequired()) 5769 return None; 5770 5771 break; 5772 } 5773 5774 // The only loops we can vectorize without a scalar epilogue, are loops with 5775 // a bottom-test and a single exiting block. We'd have to handle the fact 5776 // that not every instruction executes on the last iteration. This will 5777 // require a lane mask which varies through the vector loop body. (TODO) 5778 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5779 // If there was a tail-folding hint/switch, but we can't fold the tail by 5780 // masking, fallback to a vectorization with a scalar epilogue. 5781 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5782 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5783 "scalar epilogue instead.\n"); 5784 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5785 return computeFeasibleMaxVF(TC, UserVF); 5786 } 5787 return None; 5788 } 5789 5790 // Now try the tail folding 5791 5792 // Invalidate interleave groups that require an epilogue if we can't mask 5793 // the interleave-group. 5794 if (!useMaskedInterleavedAccesses(TTI)) { 5795 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5796 "No decisions should have been taken at this point"); 5797 // Note: There is no need to invalidate any cost modeling decisions here, as 5798 // non where taken so far. 5799 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5800 } 5801 5802 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); 5803 assert(!MaxVF.isScalable() && 5804 "Scalable vectors do not yet support tail folding"); 5805 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && 5806 "MaxVF must be a power of 2"); 5807 unsigned MaxVFtimesIC = 5808 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); 5809 // Avoid tail folding if the trip count is known to be a multiple of any VF we 5810 // chose. 
5811 ScalarEvolution *SE = PSE.getSE(); 5812 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5813 const SCEV *ExitCount = SE->getAddExpr( 5814 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5815 const SCEV *Rem = SE->getURemExpr( 5816 SE->applyLoopGuards(ExitCount, TheLoop), 5817 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5818 if (Rem->isZero()) { 5819 // Accept MaxVF if we do not have a tail. 5820 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5821 return MaxVF; 5822 } 5823 5824 // If we don't know the precise trip count, or if the trip count that we 5825 // found modulo the vectorization factor is not zero, try to fold the tail 5826 // by masking. 5827 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5828 if (Legal->prepareToFoldTailByMasking()) { 5829 FoldTailByMasking = true; 5830 return MaxVF; 5831 } 5832 5833 // If there was a tail-folding hint/switch, but we can't fold the tail by 5834 // masking, fallback to a vectorization with a scalar epilogue. 5835 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5836 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5837 "scalar epilogue instead.\n"); 5838 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5839 return MaxVF; 5840 } 5841 5842 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5843 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5844 return None; 5845 } 5846 5847 if (TC == 0) { 5848 reportVectorizationFailure( 5849 "Unable to calculate the loop count due to complex control flow", 5850 "unable to calculate the loop count due to complex control flow", 5851 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5852 return None; 5853 } 5854 5855 reportVectorizationFailure( 5856 "Cannot optimize for size and vectorize at the same time.", 5857 "cannot optimize for size and vectorize at the same time. 
" 5858 "Enable vectorization of this loop with '#pragma clang loop " 5859 "vectorize(enable)' when compiling with -Os/-Oz", 5860 "NoTailLoopWithOptForSize", ORE, TheLoop); 5861 return None; 5862 } 5863 5864 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5865 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5866 ElementCount MaxSafeVF) { 5867 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5868 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5869 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5870 : TargetTransformInfo::RGK_FixedWidthVector); 5871 5872 // Convenience function to return the minimum of two ElementCounts. 5873 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5874 assert((LHS.isScalable() == RHS.isScalable()) && 5875 "Scalable flags must match"); 5876 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5877 }; 5878 5879 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5880 // Note that both WidestRegister and WidestType may not be a powers of 2. 5881 auto MaxVectorElementCount = ElementCount::get( 5882 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5883 ComputeScalableMaxVF); 5884 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5885 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5886 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5887 5888 if (!MaxVectorElementCount) { 5889 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5890 return ElementCount::getFixed(1); 5891 } 5892 5893 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5894 if (ConstTripCount && 5895 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5896 isPowerOf2_32(ConstTripCount)) { 5897 // We need to clamp the VF to be the ConstTripCount. There is no point in 5898 // choosing a higher viable VF as done in the loop below. 
If 5899 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5900 // the TC is less than or equal to the known number of lanes. 5901 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5902 << ConstTripCount << "\n"); 5903 return TripCountEC; 5904 } 5905 5906 ElementCount MaxVF = MaxVectorElementCount; 5907 if (TTI.shouldMaximizeVectorBandwidth() || 5908 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5909 auto MaxVectorElementCountMaxBW = ElementCount::get( 5910 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5911 ComputeScalableMaxVF); 5912 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5913 5914 // Collect all viable vectorization factors larger than the default MaxVF 5915 // (i.e. MaxVectorElementCount). 5916 SmallVector<ElementCount, 8> VFs; 5917 for (ElementCount VS = MaxVectorElementCount * 2; 5918 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5919 VFs.push_back(VS); 5920 5921 // For each VF calculate its register usage. 5922 auto RUs = calculateRegisterUsage(VFs); 5923 5924 // Select the largest VF which doesn't require more registers than existing 5925 // ones. 
5926 for (int i = RUs.size() - 1; i >= 0; --i) { 5927 bool Selected = true; 5928 for (auto &pair : RUs[i].MaxLocalUsers) { 5929 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5930 if (pair.second > TargetNumRegisters) 5931 Selected = false; 5932 } 5933 if (Selected) { 5934 MaxVF = VFs[i]; 5935 break; 5936 } 5937 } 5938 if (ElementCount MinVF = 5939 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5940 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5941 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5942 << ") with target's minimum: " << MinVF << '\n'); 5943 MaxVF = MinVF; 5944 } 5945 } 5946 } 5947 return MaxVF; 5948 } 5949 5950 bool LoopVectorizationCostModel::isMoreProfitable( 5951 const VectorizationFactor &A, const VectorizationFactor &B) const { 5952 InstructionCost::CostType CostA = *A.Cost.getValue(); 5953 InstructionCost::CostType CostB = *B.Cost.getValue(); 5954 5955 // To avoid the need for FP division: 5956 // (CostA / A.Width) < (CostB / B.Width) 5957 // <=> (CostA * B.Width) < (CostB * A.Width) 5958 return (CostA * B.Width.getKnownMinValue()) < 5959 (CostB * A.Width.getKnownMinValue()); 5960 } 5961 5962 VectorizationFactor 5963 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5964 // FIXME: This can be fixed for scalable vectors later, because at this stage 5965 // the LoopVectorizer will only consider vectorizing a loop with scalable 5966 // vectors when the loop has a hint to enable vectorization for a given VF. 
5967 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5968 5969 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5970 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5971 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5972 5973 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 5974 VectorizationFactor ChosenFactor = ScalarCost; 5975 5976 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5977 if (ForceVectorization && MaxVF.isVector()) { 5978 // Ignore scalar width, because the user explicitly wants vectorization. 5979 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5980 // evaluation. 5981 ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max(); 5982 } 5983 5984 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 5985 i *= 2) { 5986 // Notice that the vector loop needs to be executed less times, so 5987 // we need to divide the cost of the vector loops by the width of 5988 // the vector elements. 5989 VectorizationCostTy C = expectedCost(i); 5990 5991 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5992 VectorizationFactor Candidate(i, C.first); 5993 LLVM_DEBUG( 5994 dbgs() << "LV: Vector loop of width " << i << " costs: " 5995 << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue()) 5996 << ".\n"); 5997 5998 if (!C.second && !ForceVectorization) { 5999 LLVM_DEBUG( 6000 dbgs() << "LV: Not considering vector loop of width " << i 6001 << " because it will not generate any vector instructions.\n"); 6002 continue; 6003 } 6004 6005 // If profitable add it to ProfitableVF list. 
6006 if (isMoreProfitable(Candidate, ScalarCost)) 6007 ProfitableVFs.push_back(Candidate); 6008 6009 if (isMoreProfitable(Candidate, ChosenFactor)) 6010 ChosenFactor = Candidate; 6011 } 6012 6013 if (!EnableCondStoresVectorization && NumPredStores) { 6014 reportVectorizationFailure("There are conditional stores.", 6015 "store that is conditionally executed prevents vectorization", 6016 "ConditionalStore", ORE, TheLoop); 6017 ChosenFactor = ScalarCost; 6018 } 6019 6020 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6021 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue()) 6022 dbgs() 6023 << "LV: Vectorization seems to be not beneficial, " 6024 << "but was forced by a user.\n"); 6025 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6026 return ChosenFactor; 6027 } 6028 6029 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6030 const Loop &L, ElementCount VF) const { 6031 // Cross iteration phis such as reductions need special handling and are 6032 // currently unsupported. 6033 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6034 return Legal->isFirstOrderRecurrence(&Phi) || 6035 Legal->isReductionVariable(&Phi); 6036 })) 6037 return false; 6038 6039 // Phis with uses outside of the loop require special handling and are 6040 // currently unsupported. 6041 for (auto &Entry : Legal->getInductionVars()) { 6042 // Look for uses of the value of the induction at the last iteration. 6043 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6044 for (User *U : PostInc->users()) 6045 if (!L.contains(cast<Instruction>(U))) 6046 return false; 6047 // Look for uses of penultimate value of the induction. 6048 for (User *U : Entry.first->users()) 6049 if (!L.contains(cast<Instruction>(U))) 6050 return false; 6051 } 6052 6053 // Induction variables that are widened require special handling that is 6054 // currently not supported. 
6055 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6056 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6057 this->isProfitableToScalarize(Entry.first, VF)); 6058 })) 6059 return false; 6060 6061 return true; 6062 } 6063 6064 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6065 const ElementCount VF) const { 6066 // FIXME: We need a much better cost-model to take different parameters such 6067 // as register pressure, code size increase and cost of extra branches into 6068 // account. For now we apply a very crude heuristic and only consider loops 6069 // with vectorization factors larger than a certain value. 6070 // We also consider epilogue vectorization unprofitable for targets that don't 6071 // consider interleaving beneficial (eg. MVE). 6072 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6073 return false; 6074 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6075 return true; 6076 return false; 6077 } 6078 6079 VectorizationFactor 6080 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6081 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6082 VectorizationFactor Result = VectorizationFactor::Disabled(); 6083 if (!EnableEpilogueVectorization) { 6084 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6085 return Result; 6086 } 6087 6088 if (!isScalarEpilogueAllowed()) { 6089 LLVM_DEBUG( 6090 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6091 "allowed.\n";); 6092 return Result; 6093 } 6094 6095 // FIXME: This can be fixed for scalable vectors later, because at this stage 6096 // the LoopVectorizer will only consider vectorizing a loop with scalable 6097 // vectors when the loop has a hint to enable vectorization for a given VF. 
6098 if (MainLoopVF.isScalable()) { 6099 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6100 "yet supported.\n"); 6101 return Result; 6102 } 6103 6104 // Not really a cost consideration, but check for unsupported cases here to 6105 // simplify the logic. 6106 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6107 LLVM_DEBUG( 6108 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6109 "not a supported candidate.\n";); 6110 return Result; 6111 } 6112 6113 if (EpilogueVectorizationForceVF > 1) { 6114 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6115 if (LVP.hasPlanWithVFs( 6116 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6117 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6118 else { 6119 LLVM_DEBUG( 6120 dbgs() 6121 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6122 return Result; 6123 } 6124 } 6125 6126 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6127 TheLoop->getHeader()->getParent()->hasMinSize()) { 6128 LLVM_DEBUG( 6129 dbgs() 6130 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6131 return Result; 6132 } 6133 6134 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6135 return Result; 6136 6137 for (auto &NextVF : ProfitableVFs) 6138 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6139 (Result.Width.getFixedValue() == 1 || 6140 isMoreProfitable(NextVF, Result)) && 6141 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6142 Result = NextVF; 6143 6144 if (Result != VectorizationFactor::Disabled()) 6145 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6146 << Result.Width.getFixedValue() << "\n";); 6147 return Result; 6148 } 6149 6150 std::pair<unsigned, unsigned> 6151 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6152 unsigned MinWidth = -1U; 6153 unsigned MaxWidth = 8; 6154 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6155 
6156 // For each block. 6157 for (BasicBlock *BB : TheLoop->blocks()) { 6158 // For each instruction in the loop. 6159 for (Instruction &I : BB->instructionsWithoutDebug()) { 6160 Type *T = I.getType(); 6161 6162 // Skip ignored values. 6163 if (ValuesToIgnore.count(&I)) 6164 continue; 6165 6166 // Only examine Loads, Stores and PHINodes. 6167 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6168 continue; 6169 6170 // Examine PHI nodes that are reduction variables. Update the type to 6171 // account for the recurrence type. 6172 if (auto *PN = dyn_cast<PHINode>(&I)) { 6173 if (!Legal->isReductionVariable(PN)) 6174 continue; 6175 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6176 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6177 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6178 RdxDesc.getRecurrenceType(), 6179 TargetTransformInfo::ReductionFlags())) 6180 continue; 6181 T = RdxDesc.getRecurrenceType(); 6182 } 6183 6184 // Examine the stored values. 6185 if (auto *ST = dyn_cast<StoreInst>(&I)) 6186 T = ST->getValueOperand()->getType(); 6187 6188 // Ignore loaded pointer types and stored pointer types that are not 6189 // vectorizable. 6190 // 6191 // FIXME: The check here attempts to predict whether a load or store will 6192 // be vectorized. We only know this for certain after a VF has 6193 // been selected. Here, we assume that if an access can be 6194 // vectorized, it will be. We should also look at extending this 6195 // optimization to non-pointer types. 
6196 // 6197 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6198 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6199 continue; 6200 6201 MinWidth = std::min(MinWidth, 6202 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6203 MaxWidth = std::max(MaxWidth, 6204 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6205 } 6206 } 6207 6208 return {MinWidth, MaxWidth}; 6209 } 6210 6211 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6212 unsigned LoopCost) { 6213 // -- The interleave heuristics -- 6214 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6215 // There are many micro-architectural considerations that we can't predict 6216 // at this level. For example, frontend pressure (on decode or fetch) due to 6217 // code size, or the number and capabilities of the execution ports. 6218 // 6219 // We use the following heuristics to select the interleave count: 6220 // 1. If the code has reductions, then we interleave to break the cross 6221 // iteration dependency. 6222 // 2. If the loop is really small, then we interleave to reduce the loop 6223 // overhead. 6224 // 3. We don't interleave if we think that we will spill registers to memory 6225 // due to the increased register pressure. 6226 6227 if (!isScalarEpilogueAllowed()) 6228 return 1; 6229 6230 // We used the distance for the interleave count. 6231 if (Legal->getMaxSafeDepDistBytes() != -1U) 6232 return 1; 6233 6234 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6235 const bool HasReductions = !Legal->getReductionVars().empty(); 6236 // Do not interleave loops with a relatively small known or estimated trip 6237 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6238 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6239 // because with the above conditions interleaving can expose ILP and break 6240 // cross iteration dependences for reductions. 
6241 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6242 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6243 return 1; 6244 6245 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6246 // We divide by these constants so assume that we have at least one 6247 // instruction that uses at least one register. 6248 for (auto& pair : R.MaxLocalUsers) { 6249 pair.second = std::max(pair.second, 1U); 6250 } 6251 6252 // We calculate the interleave count using the following formula. 6253 // Subtract the number of loop invariants from the number of available 6254 // registers. These registers are used by all of the interleaved instances. 6255 // Next, divide the remaining registers by the number of registers that is 6256 // required by the loop, in order to estimate how many parallel instances 6257 // fit without causing spills. All of this is rounded down if necessary to be 6258 // a power of two. We want power of two interleave count to simplify any 6259 // addressing operations or alignment considerations. 6260 // We also want power of two interleave counts to ensure that the induction 6261 // variable of the vector loop wraps to zero, when tail is folded by masking; 6262 // this currently happens when OptForSize, in which case IC is set to 1 above. 
6263 unsigned IC = UINT_MAX; 6264 6265 for (auto& pair : R.MaxLocalUsers) { 6266 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6267 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6268 << " registers of " 6269 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6270 if (VF.isScalar()) { 6271 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6272 TargetNumRegisters = ForceTargetNumScalarRegs; 6273 } else { 6274 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6275 TargetNumRegisters = ForceTargetNumVectorRegs; 6276 } 6277 unsigned MaxLocalUsers = pair.second; 6278 unsigned LoopInvariantRegs = 0; 6279 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6280 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6281 6282 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6283 // Don't count the induction variable as interleaved. 6284 if (EnableIndVarRegisterHeur) { 6285 TmpIC = 6286 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6287 std::max(1U, (MaxLocalUsers - 1))); 6288 } 6289 6290 IC = std::min(IC, TmpIC); 6291 } 6292 6293 // Clamp the interleave ranges to reasonable counts. 6294 unsigned MaxInterleaveCount = 6295 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6296 6297 // Check if the user has overridden the max. 6298 if (VF.isScalar()) { 6299 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6300 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6301 } else { 6302 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6303 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6304 } 6305 6306 // If trip count is known or estimated compile time constant, limit the 6307 // interleave count to be less than the trip count divided by VF, provided it 6308 // is at least 1. 6309 // 6310 // For scalable vectors we can't know if interleaving is beneficial. 
It may 6311 // not be beneficial for small loops if none of the lanes in the second vector 6312 // iterations is enabled. However, for larger loops, there is likely to be a 6313 // similar benefit as for fixed-width vectors. For now, we choose to leave 6314 // the InterleaveCount as if vscale is '1', although if some information about 6315 // the vector is known (e.g. min vector size), we can make a better decision. 6316 if (BestKnownTC) { 6317 MaxInterleaveCount = 6318 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6319 // Make sure MaxInterleaveCount is greater than 0. 6320 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6321 } 6322 6323 assert(MaxInterleaveCount > 0 && 6324 "Maximum interleave count must be greater than 0"); 6325 6326 // Clamp the calculated IC to be between the 1 and the max interleave count 6327 // that the target and trip count allows. 6328 if (IC > MaxInterleaveCount) 6329 IC = MaxInterleaveCount; 6330 else 6331 // Make sure IC is greater than 0. 6332 IC = std::max(1u, IC); 6333 6334 assert(IC > 0 && "Interleave count must be greater than 0."); 6335 6336 // If we did not calculate the cost for VF (because the user selected the VF) 6337 // then we calculate the cost of VF here. 6338 if (LoopCost == 0) { 6339 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6340 LoopCost = *expectedCost(VF).first.getValue(); 6341 } 6342 6343 assert(LoopCost && "Non-zero loop cost expected"); 6344 6345 // Interleave if we vectorized this loop and there is a reduction that could 6346 // benefit from interleaving. 6347 if (VF.isVector() && HasReductions) { 6348 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6349 return IC; 6350 } 6351 6352 // Note that if we've already vectorized the loop we will have done the 6353 // runtime check and so interleaving won't require further checks. 
6354 bool InterleavingRequiresRuntimePointerCheck = 6355 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6356 6357 // We want to interleave small loops in order to reduce the loop overhead and 6358 // potentially expose ILP opportunities. 6359 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6360 << "LV: IC is " << IC << '\n' 6361 << "LV: VF is " << VF << '\n'); 6362 const bool AggressivelyInterleaveReductions = 6363 TTI.enableAggressiveInterleaving(HasReductions); 6364 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6365 // We assume that the cost overhead is 1 and we use the cost model 6366 // to estimate the cost of the loop and interleave until the cost of the 6367 // loop overhead is about 5% of the cost of the loop. 6368 unsigned SmallIC = 6369 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6370 6371 // Interleave until store/load ports (estimated by max interleave count) are 6372 // saturated. 6373 unsigned NumStores = Legal->getNumStores(); 6374 unsigned NumLoads = Legal->getNumLoads(); 6375 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6376 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6377 6378 // If we have a scalar reduction (vector reductions are already dealt with 6379 // by this point), we can increase the critical path length if the loop 6380 // we're interleaving is inside another loop. Limit, by default to 2, so the 6381 // critical path only gets increased by one reduction operation. 
6382 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6383 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6384 SmallIC = std::min(SmallIC, F); 6385 StoresIC = std::min(StoresIC, F); 6386 LoadsIC = std::min(LoadsIC, F); 6387 } 6388 6389 if (EnableLoadStoreRuntimeInterleave && 6390 std::max(StoresIC, LoadsIC) > SmallIC) { 6391 LLVM_DEBUG( 6392 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6393 return std::max(StoresIC, LoadsIC); 6394 } 6395 6396 // If there are scalar reductions and TTI has enabled aggressive 6397 // interleaving for reductions, we will interleave to expose ILP. 6398 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6399 AggressivelyInterleaveReductions) { 6400 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6401 // Interleave no less than SmallIC but not as aggressive as the normal IC 6402 // to satisfy the rare situation when resources are too limited. 6403 return std::max(IC / 2, SmallIC); 6404 } else { 6405 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6406 return SmallIC; 6407 } 6408 } 6409 6410 // Interleave if this is a large loop (small loops are already dealt with by 6411 // this point) that could benefit from interleaving. 6412 if (AggressivelyInterleaveReductions) { 6413 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6414 return IC; 6415 } 6416 6417 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6418 return 1; 6419 } 6420 6421 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6422 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6423 // This function calculates the register usage by measuring the highest number 6424 // of values that are alive at a single location. Obviously, this is a very 6425 // rough estimation. We scan the loop in a topological order in order and 6426 // assign a number to each instruction. We use RPO to ensure that defs are 6427 // met before their users. 
We assume that each instruction that has in-loop 6428 // users starts an interval. We record every time that an in-loop value is 6429 // used, so we have a list of the first and last occurrences of each 6430 // instruction. Next, we transpose this data structure into a multi map that 6431 // holds the list of intervals that *end* at a specific location. This multi 6432 // map allows us to perform a linear search. We scan the instructions linearly 6433 // and record each time that a new interval starts, by placing it in a set. 6434 // If we find this value in the multi-map then we remove it from the set. 6435 // The max register usage is the maximum size of the set. 6436 // We also search for instructions that are defined outside the loop, but are 6437 // used inside the loop. We need this number separately from the max-interval 6438 // usage number because when we unroll, loop-invariant values do not take 6439 // more register. 6440 LoopBlocksDFS DFS(TheLoop); 6441 DFS.perform(LI); 6442 6443 RegisterUsage RU; 6444 6445 // Each 'key' in the map opens a new interval. The values 6446 // of the map are the index of the 'last seen' usage of the 6447 // instruction that is the key. 6448 using IntervalMap = DenseMap<Instruction *, unsigned>; 6449 6450 // Maps instruction to its index. 6451 SmallVector<Instruction *, 64> IdxToInstr; 6452 // Marks the end of each interval. 6453 IntervalMap EndPoint; 6454 // Saves the list of instruction indices that are used in the loop. 6455 SmallPtrSet<Instruction *, 8> Ends; 6456 // Saves the list of values that are used in the loop but are 6457 // defined outside the loop, such as arguments and constants. 6458 SmallPtrSet<Value *, 8> LoopInvariants; 6459 6460 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6461 for (Instruction &I : BB->instructionsWithoutDebug()) { 6462 IdxToInstr.push_back(&I); 6463 6464 // Save the end location of each USE. 
6465 for (Value *U : I.operands()) { 6466 auto *Instr = dyn_cast<Instruction>(U); 6467 6468 // Ignore non-instruction values such as arguments, constants, etc. 6469 if (!Instr) 6470 continue; 6471 6472 // If this instruction is outside the loop then record it and continue. 6473 if (!TheLoop->contains(Instr)) { 6474 LoopInvariants.insert(Instr); 6475 continue; 6476 } 6477 6478 // Overwrite previous end points. 6479 EndPoint[Instr] = IdxToInstr.size(); 6480 Ends.insert(Instr); 6481 } 6482 } 6483 } 6484 6485 // Saves the list of intervals that end with the index in 'key'. 6486 using InstrList = SmallVector<Instruction *, 2>; 6487 DenseMap<unsigned, InstrList> TransposeEnds; 6488 6489 // Transpose the EndPoints to a list of values that end at each index. 6490 for (auto &Interval : EndPoint) 6491 TransposeEnds[Interval.second].push_back(Interval.first); 6492 6493 SmallPtrSet<Instruction *, 8> OpenIntervals; 6494 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6495 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6496 6497 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6498 6499 // A lambda that gets the register usage for the given type and VF. 6500 const auto &TTICapture = TTI; 6501 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6502 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6503 return 0U; 6504 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6505 }; 6506 6507 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6508 Instruction *I = IdxToInstr[i]; 6509 6510 // Remove all of the instructions that end at this location. 6511 InstrList &List = TransposeEnds[i]; 6512 for (Instruction *ToRemove : List) 6513 OpenIntervals.erase(ToRemove); 6514 6515 // Ignore instructions that are never used within the loop. 6516 if (!Ends.count(I)) 6517 continue; 6518 6519 // Skip ignored values. 
6520 if (ValuesToIgnore.count(I)) 6521 continue; 6522 6523 // For each VF find the maximum usage of registers. 6524 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6525 // Count the number of live intervals. 6526 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6527 6528 if (VFs[j].isScalar()) { 6529 for (auto Inst : OpenIntervals) { 6530 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6531 if (RegUsage.find(ClassID) == RegUsage.end()) 6532 RegUsage[ClassID] = 1; 6533 else 6534 RegUsage[ClassID] += 1; 6535 } 6536 } else { 6537 collectUniformsAndScalars(VFs[j]); 6538 for (auto Inst : OpenIntervals) { 6539 // Skip ignored values for VF > 1. 6540 if (VecValuesToIgnore.count(Inst)) 6541 continue; 6542 if (isScalarAfterVectorization(Inst, VFs[j])) { 6543 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6544 if (RegUsage.find(ClassID) == RegUsage.end()) 6545 RegUsage[ClassID] = 1; 6546 else 6547 RegUsage[ClassID] += 1; 6548 } else { 6549 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6550 if (RegUsage.find(ClassID) == RegUsage.end()) 6551 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6552 else 6553 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6554 } 6555 } 6556 } 6557 6558 for (auto& pair : RegUsage) { 6559 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6560 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6561 else 6562 MaxUsages[j][pair.first] = pair.second; 6563 } 6564 } 6565 6566 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6567 << OpenIntervals.size() << '\n'); 6568 6569 // Add the current instruction to the list of open intervals. 6570 OpenIntervals.insert(I); 6571 } 6572 6573 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6574 SmallMapVector<unsigned, unsigned, 4> Invariant; 6575 6576 for (auto Inst : LoopInvariants) { 6577 unsigned Usage = 6578 VFs[i].isScalar() ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); 6579 unsigned ClassID = 6580 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6581 if (Invariant.find(ClassID) == Invariant.end()) 6582 Invariant[ClassID] = Usage; 6583 else 6584 Invariant[ClassID] += Usage; 6585 } 6586 6587 LLVM_DEBUG({ 6588 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6589 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6590 << " item\n"; 6591 for (const auto &pair : MaxUsages[i]) { 6592 dbgs() << "LV(REG): RegisterClass: " 6593 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6594 << " registers\n"; 6595 } 6596 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6597 << " item\n"; 6598 for (const auto &pair : Invariant) { 6599 dbgs() << "LV(REG): RegisterClass: " 6600 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6601 << " registers\n"; 6602 } 6603 }); 6604 6605 RU.LoopInvariantRegs = Invariant; 6606 RU.MaxLocalUsers = MaxUsages[i]; 6607 RUs[i] = RU; 6608 } 6609 6610 return RUs; 6611 } 6612 6613 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6614 // TODO: Cost model for emulated masked load/store is completely 6615 // broken. This hack guides the cost model to use an artificially 6616 // high enough value to practically disable vectorization with such 6617 // operations, except where previously deployed legality hack allowed 6618 // using very low cost values. This is to avoid regressions coming simply 6619 // from moving "masked load/store" check from legality to cost model. 6620 // Masked Load/Gather emulation was previously never allowed. 6621 // Limited number of Masked Store/Scatter emulation was allowed. 
6622 assert(isPredicatedInst(I, ElementCount::getFixed(1)) && 6623 "Expecting a scalar emulated instruction"); 6624 return isa<LoadInst>(I) || 6625 (isa<StoreInst>(I) && 6626 NumPredStores > NumberOfStoresToPredicate); 6627 } 6628 6629 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6630 // If we aren't vectorizing the loop, or if we've already collected the 6631 // instructions to scalarize, there's nothing to do. Collection may already 6632 // have occurred if we have a user-selected VF and are now computing the 6633 // expected cost for interleaving. 6634 if (VF.isScalar() || VF.isZero() || 6635 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6636 return; 6637 6638 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6639 // not profitable to scalarize any instructions, the presence of VF in the 6640 // map will indicate that we've analyzed it already. 6641 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6642 6643 // Find all the instructions that are scalar with predication in the loop and 6644 // determine if it would be better to not if-convert the blocks they are in. 6645 // If so, we also record the instructions to scalarize. 6646 for (BasicBlock *BB : TheLoop->blocks()) { 6647 if (!blockNeedsPredication(BB)) 6648 continue; 6649 for (Instruction &I : *BB) 6650 if (isScalarWithPredication(&I)) { 6651 ScalarCostsTy ScalarCosts; 6652 // Do not apply discount logic if hacked cost is needed 6653 // for emulated masked memrefs. 6654 if (!useEmulatedMaskMemRefHack(&I) && 6655 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6656 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6657 // Remember that BB will remain after vectorization. 
6658 PredicatedBBsAfterVectorization.insert(BB); 6659 } 6660 } 6661 } 6662 6663 int LoopVectorizationCostModel::computePredInstDiscount( 6664 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6665 assert(!isUniformAfterVectorization(PredInst, VF) && 6666 "Instruction marked uniform-after-vectorization will be predicated"); 6667 6668 // Initialize the discount to zero, meaning that the scalar version and the 6669 // vector version cost the same. 6670 InstructionCost Discount = 0; 6671 6672 // Holds instructions to analyze. The instructions we visit are mapped in 6673 // ScalarCosts. Those instructions are the ones that would be scalarized if 6674 // we find that the scalar version costs less. 6675 SmallVector<Instruction *, 8> Worklist; 6676 6677 // Returns true if the given instruction can be scalarized. 6678 auto canBeScalarized = [&](Instruction *I) -> bool { 6679 // We only attempt to scalarize instructions forming a single-use chain 6680 // from the original predicated block that would otherwise be vectorized. 6681 // Although not strictly necessary, we give up on instructions we know will 6682 // already be scalar to avoid traversing chains that are unlikely to be 6683 // beneficial. 6684 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6685 isScalarAfterVectorization(I, VF)) 6686 return false; 6687 6688 // If the instruction is scalar with predication, it will be analyzed 6689 // separately. We ignore it within the context of PredInst. 6690 if (isScalarWithPredication(I)) 6691 return false; 6692 6693 // If any of the instruction's operands are uniform after vectorization, 6694 // the instruction cannot be scalarized. This prevents, for example, a 6695 // masked load from being scalarized. 6696 // 6697 // We assume we will only emit a value for lane zero of an instruction 6698 // marked uniform after vectorization, rather than VF identical values. 
6699 // Thus, if we scalarize an instruction that uses a uniform, we would 6700 // create uses of values corresponding to the lanes we aren't emitting code 6701 // for. This behavior can be changed by allowing getScalarValue to clone 6702 // the lane zero values for uniforms rather than asserting. 6703 for (Use &U : I->operands()) 6704 if (auto *J = dyn_cast<Instruction>(U.get())) 6705 if (isUniformAfterVectorization(J, VF)) 6706 return false; 6707 6708 // Otherwise, we can scalarize the instruction. 6709 return true; 6710 }; 6711 6712 // Compute the expected cost discount from scalarizing the entire expression 6713 // feeding the predicated instruction. We currently only consider expressions 6714 // that are single-use instruction chains. 6715 Worklist.push_back(PredInst); 6716 while (!Worklist.empty()) { 6717 Instruction *I = Worklist.pop_back_val(); 6718 6719 // If we've already analyzed the instruction, there's nothing to do. 6720 if (ScalarCosts.find(I) != ScalarCosts.end()) 6721 continue; 6722 6723 // Compute the cost of the vector instruction. Note that this cost already 6724 // includes the scalarization overhead of the predicated instruction. 6725 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6726 6727 // Compute the cost of the scalarized instruction. This cost is the cost of 6728 // the instruction as if it wasn't if-converted and instead remained in the 6729 // predicated block. We will scale this cost by block probability after 6730 // computing the scalarization overhead. 6731 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6732 InstructionCost ScalarCost = 6733 VF.getKnownMinValue() * 6734 getInstructionCost(I, ElementCount::getFixed(1)).first; 6735 6736 // Compute the scalarization overhead of needed insertelement instructions 6737 // and phi nodes. 
6738 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6739 ScalarCost += TTI.getScalarizationOverhead( 6740 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6741 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6742 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6743 ScalarCost += 6744 VF.getKnownMinValue() * 6745 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6746 } 6747 6748 // Compute the scalarization overhead of needed extractelement 6749 // instructions. For each of the instruction's operands, if the operand can 6750 // be scalarized, add it to the worklist; otherwise, account for the 6751 // overhead. 6752 for (Use &U : I->operands()) 6753 if (auto *J = dyn_cast<Instruction>(U.get())) { 6754 assert(VectorType::isValidElementType(J->getType()) && 6755 "Instruction has non-scalar type"); 6756 if (canBeScalarized(J)) 6757 Worklist.push_back(J); 6758 else if (needsExtract(J, VF)) { 6759 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6760 ScalarCost += TTI.getScalarizationOverhead( 6761 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6762 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6763 } 6764 } 6765 6766 // Scale the total scalar cost by block probability. 6767 ScalarCost /= getReciprocalPredBlockProb(); 6768 6769 // Compute the discount. A non-negative discount means the vector version 6770 // of the instruction costs more, and scalarizing would be beneficial. 6771 Discount += VectorCost - ScalarCost; 6772 ScalarCosts[I] = ScalarCost; 6773 } 6774 6775 return *Discount.getValue(); 6776 } 6777 6778 LoopVectorizationCostModel::VectorizationCostTy 6779 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6780 VectorizationCostTy Cost; 6781 6782 // For each block. 6783 for (BasicBlock *BB : TheLoop->blocks()) { 6784 VectorizationCostTy BlockCost; 6785 6786 // For each instruction in the old loop. 
6787 for (Instruction &I : BB->instructionsWithoutDebug()) { 6788 // Skip ignored values. 6789 if (ValuesToIgnore.count(&I) || 6790 (VF.isVector() && VecValuesToIgnore.count(&I))) 6791 continue; 6792 6793 VectorizationCostTy C = getInstructionCost(&I, VF); 6794 6795 // Check if we should override the cost. 6796 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6797 C.first = InstructionCost(ForceTargetInstructionCost); 6798 6799 BlockCost.first += C.first; 6800 BlockCost.second |= C.second; 6801 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6802 << " for VF " << VF << " For instruction: " << I 6803 << '\n'); 6804 } 6805 6806 // If we are vectorizing a predicated block, it will have been 6807 // if-converted. This means that the block's instructions (aside from 6808 // stores and instructions that may divide by zero) will now be 6809 // unconditionally executed. For the scalar case, we may not always execute 6810 // the predicated block, if it is an if-else block. Thus, scale the block's 6811 // cost by the probability of executing it. blockNeedsPredication from 6812 // Legal is used so as to not include all blocks in tail folded loops. 6813 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6814 BlockCost.first /= getReciprocalPredBlockProb(); 6815 6816 Cost.first += BlockCost.first; 6817 Cost.second |= BlockCost.second; 6818 } 6819 6820 return Cost; 6821 } 6822 6823 /// Gets Address Access SCEV after verifying that the access pattern 6824 /// is loop invariant except the induction variable dependence. 6825 /// 6826 /// This SCEV can be sent to the Target in order to estimate the address 6827 /// calculation cost. 
static const SCEV *getAddressAccessSCEV(
              Value *Ptr,
              LoopVectorizationLegality *Legal,
              PredicatedScalarEvolution &PSE,
              const Loop *TheLoop) {

  // Only GEP-based addresses are analyzed; anything else yields nullptr.
  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}

/// Returns true if either operand of the (binary) instruction \p I is a
/// symbolic stride known to Legal.
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}

/// Cost of executing the memory instruction \p I as VF scalar operations,
/// including address computation, extract/insert overhead and, when
/// predicated, probability scaling plus i1-extract and branch costs.
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getMemInstValueType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS, TTI::TCK_RecipThroughput);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I, ElementCount::getFixed(1))) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch
    auto *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
    Cost += TTI.getScalarizationOverhead(
        Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true);
    Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}

/// Cost of a wide (possibly masked) load/store with consecutive stride +1/-1;
/// a reverse (-1) access adds a vector-reverse shuffle.
InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                    ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  const Align Alignment = getLoadStoreAlignment(I);
  InstructionCost Cost = 0;
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                      CostKind);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                CostKind, I);

  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost +=
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  return Cost;
}

/// Cost of a load/store whose address is uniform across lanes: a scalar memory
/// op plus a broadcast (load), or plus a last-lane extract for a store of a
/// non-invariant value.
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                ElementCount VF) {
  assert(Legal->isUniformMemOp(*I));

  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                               CostKind) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                             CostKind) +
         (isLoopInvariantStoreValue
              ? 0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                       VF.getKnownMinValue() - 1));
}

/// Cost of executing \p I as a gather/scatter, including vector address
/// computation.
InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
7013 bool UseMaskForGaps = 7014 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 7015 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7016 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7017 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7018 7019 if (Group->isReverse()) { 7020 // TODO: Add support for reversed masked interleaved access. 7021 assert(!Legal->isMaskRequired(I) && 7022 "Reverse masked interleaved access not supported."); 7023 Cost += 7024 Group->getNumMembers() * 7025 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7026 } 7027 return Cost; 7028 } 7029 7030 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 7031 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7032 // Early exit for no inloop reductions 7033 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7034 return InstructionCost::getInvalid(); 7035 auto *VectorTy = cast<VectorType>(Ty); 7036 7037 // We are looking for a pattern of, and finding the minimal acceptable cost: 7038 // reduce(mul(ext(A), ext(B))) or 7039 // reduce(mul(A, B)) or 7040 // reduce(ext(A)) or 7041 // reduce(A). 7042 // The basic idea is that we walk down the tree to do that, finding the root 7043 // reduction instruction in InLoopReductionImmediateChains. From there we find 7044 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7045 // of the components. If the reduction cost is lower then we return it for the 7046 // reduction instruction and 0 for the other instructions in the pattern. If 7047 // it is not we return an invalid cost specifying the orignal cost method 7048 // should be used. 
7049 Instruction *RetI = I; 7050 if ((RetI->getOpcode() == Instruction::SExt || 7051 RetI->getOpcode() == Instruction::ZExt)) { 7052 if (!RetI->hasOneUser()) 7053 return InstructionCost::getInvalid(); 7054 RetI = RetI->user_back(); 7055 } 7056 if (RetI->getOpcode() == Instruction::Mul && 7057 RetI->user_back()->getOpcode() == Instruction::Add) { 7058 if (!RetI->hasOneUser()) 7059 return InstructionCost::getInvalid(); 7060 RetI = RetI->user_back(); 7061 } 7062 7063 // Test if the found instruction is a reduction, and if not return an invalid 7064 // cost specifying the parent to use the original cost modelling. 7065 if (!InLoopReductionImmediateChains.count(RetI)) 7066 return InstructionCost::getInvalid(); 7067 7068 // Find the reduction this chain is a part of and calculate the basic cost of 7069 // the reduction on its own. 7070 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7071 Instruction *ReductionPhi = LastChain; 7072 while (!isa<PHINode>(ReductionPhi)) 7073 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7074 7075 RecurrenceDescriptor RdxDesc = 7076 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7077 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7078 RdxDesc.getOpcode(), VectorTy, false, CostKind); 7079 7080 // Get the operand that was not the reduction chain and match it to one of the 7081 // patterns, returning the better cost if it is found. 7082 Instruction *RedOp = RetI->getOperand(1) == LastChain 7083 ? 
dyn_cast<Instruction>(RetI->getOperand(0)) 7084 : dyn_cast<Instruction>(RetI->getOperand(1)); 7085 7086 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7087 7088 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 7089 !TheLoop->isLoopInvariant(RedOp)) { 7090 bool IsUnsigned = isa<ZExtInst>(RedOp); 7091 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7092 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7093 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7094 CostKind); 7095 7096 InstructionCost ExtCost = 7097 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7098 TTI::CastContextHint::None, CostKind, RedOp); 7099 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7100 return I == RetI ? *RedCost.getValue() : 0; 7101 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7102 Instruction *Mul = RedOp; 7103 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7104 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7105 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7106 Op0->getOpcode() == Op1->getOpcode() && 7107 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7108 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7109 bool IsUnsigned = isa<ZExtInst>(Op0); 7110 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7111 // reduce(mul(ext, ext)) 7112 InstructionCost ExtCost = 7113 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7114 TTI::CastContextHint::None, CostKind, Op0); 7115 InstructionCost MulCost = 7116 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7117 7118 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7119 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7120 CostKind); 7121 7122 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7123 return I == RetI ? 
*RedCost.getValue() : 0; 7124 } else { 7125 InstructionCost MulCost = 7126 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7127 7128 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7129 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7130 CostKind); 7131 7132 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7133 return I == RetI ? *RedCost.getValue() : 0; 7134 } 7135 } 7136 7137 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7138 } 7139 7140 InstructionCost 7141 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7142 ElementCount VF) { 7143 // Calculate scalar cost only. Vectorization cost should be ready at this 7144 // moment. 7145 if (VF.isScalar()) { 7146 Type *ValTy = getMemInstValueType(I); 7147 const Align Alignment = getLoadStoreAlignment(I); 7148 unsigned AS = getLoadStoreAddressSpace(I); 7149 7150 return TTI.getAddressComputationCost(ValTy) + 7151 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7152 TTI::TCK_RecipThroughput, I); 7153 } 7154 return getWideningCost(I, VF); 7155 } 7156 7157 LoopVectorizationCostModel::VectorizationCostTy 7158 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7159 ElementCount VF) { 7160 // If we know that this instruction will remain uniform, check the cost of 7161 // the scalar version. 7162 if (isUniformAfterVectorization(I, VF)) 7163 VF = ElementCount::getFixed(1); 7164 7165 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7166 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7167 7168 // Forced scalars do not have any scalarization overhead. 
7169 auto ForcedScalar = ForcedScalars.find(VF); 7170 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7171 auto InstSet = ForcedScalar->second; 7172 if (InstSet.count(I)) 7173 return VectorizationCostTy( 7174 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7175 VF.getKnownMinValue()), 7176 false); 7177 } 7178 7179 Type *VectorTy; 7180 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7181 7182 bool TypeNotScalarized = 7183 VF.isVector() && VectorTy->isVectorTy() && 7184 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7185 return VectorizationCostTy(C, TypeNotScalarized); 7186 } 7187 7188 InstructionCost 7189 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7190 ElementCount VF) const { 7191 7192 if (VF.isScalable()) 7193 return InstructionCost::getInvalid(); 7194 7195 if (VF.isScalar()) 7196 return 0; 7197 7198 InstructionCost Cost = 0; 7199 Type *RetTy = ToVectorTy(I->getType(), VF); 7200 if (!RetTy->isVoidTy() && 7201 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7202 Cost += TTI.getScalarizationOverhead( 7203 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7204 true, false); 7205 7206 // Some targets keep addresses scalar. 7207 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7208 return Cost; 7209 7210 // Some targets support efficient element stores. 7211 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7212 return Cost; 7213 7214 // Collect operands to consider. 7215 CallInst *CI = dyn_cast<CallInst>(I); 7216 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7217 7218 // Skip operands that do not require extraction/scalarization and do not incur 7219 // any overhead. 
7220 SmallVector<Type *> Tys; 7221 for (auto *V : filterExtractingOperands(Ops, VF)) 7222 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7223 return Cost + TTI.getOperandsScalarizationOverhead( 7224 filterExtractingOperands(Ops, VF), Tys); 7225 } 7226 7227 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7228 if (VF.isScalar()) 7229 return; 7230 NumPredStores = 0; 7231 for (BasicBlock *BB : TheLoop->blocks()) { 7232 // For each instruction in the old loop. 7233 for (Instruction &I : *BB) { 7234 Value *Ptr = getLoadStorePointerOperand(&I); 7235 if (!Ptr) 7236 continue; 7237 7238 // TODO: We should generate better code and update the cost model for 7239 // predicated uniform stores. Today they are treated as any other 7240 // predicated store (see added test cases in 7241 // invariant-store-vectorization.ll). 7242 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7243 NumPredStores++; 7244 7245 if (Legal->isUniformMemOp(I)) { 7246 // TODO: Avoid replicating loads and stores instead of 7247 // relying on instcombine to remove them. 7248 // Load: Scalar load + broadcast 7249 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7250 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7251 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7252 continue; 7253 } 7254 7255 // We assume that widening is the best solution when possible. 7256 if (memoryInstructionCanBeWidened(&I, VF)) { 7257 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7258 int ConsecutiveStride = 7259 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7260 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7261 "Expected consecutive stride."); 7262 InstWidening Decision = 7263 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7264 setWideningDecision(&I, VF, Decision, Cost); 7265 continue; 7266 } 7267 7268 // Choose between Interleaving, Gather/Scatter or Scalarization. 
7269 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7270 unsigned NumAccesses = 1; 7271 if (isAccessInterleaved(&I)) { 7272 auto Group = getInterleavedAccessGroup(&I); 7273 assert(Group && "Fail to get an interleaved access group."); 7274 7275 // Make one decision for the whole group. 7276 if (getWideningDecision(&I, VF) != CM_Unknown) 7277 continue; 7278 7279 NumAccesses = Group->getNumMembers(); 7280 if (interleavedAccessCanBeWidened(&I, VF)) 7281 InterleaveCost = getInterleaveGroupCost(&I, VF); 7282 } 7283 7284 InstructionCost GatherScatterCost = 7285 isLegalGatherOrScatter(&I) 7286 ? getGatherScatterCost(&I, VF) * NumAccesses 7287 : InstructionCost::getInvalid(); 7288 7289 InstructionCost ScalarizationCost = 7290 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7291 7292 // Choose better solution for the current VF, 7293 // write down this decision and use it during vectorization. 7294 InstructionCost Cost; 7295 InstWidening Decision; 7296 if (InterleaveCost <= GatherScatterCost && 7297 InterleaveCost < ScalarizationCost) { 7298 Decision = CM_Interleave; 7299 Cost = InterleaveCost; 7300 } else if (GatherScatterCost < ScalarizationCost) { 7301 Decision = CM_GatherScatter; 7302 Cost = GatherScatterCost; 7303 } else { 7304 assert(!VF.isScalable() && 7305 "We cannot yet scalarise for scalable vectors"); 7306 Decision = CM_Scalarize; 7307 Cost = ScalarizationCost; 7308 } 7309 // If the instructions belongs to an interleave group, the whole group 7310 // receives the same decision. The whole group receives the cost, but 7311 // the cost will actually be assigned to one instruction. 7312 if (auto Group = getInterleavedAccessGroup(&I)) 7313 setWideningDecision(Group, VF, Decision, Cost); 7314 else 7315 setWideningDecision(&I, VF, Decision, Cost); 7316 } 7317 } 7318 7319 // Make sure that any load of address and any other address computation 7320 // remains scalar unless there is gather/scatter support. 
This avoids 7321 // inevitable extracts into address registers, and also has the benefit of 7322 // activating LSR more, since that pass can't optimize vectorized 7323 // addresses. 7324 if (TTI.prefersVectorizedAddressing()) 7325 return; 7326 7327 // Start with all scalar pointer uses. 7328 SmallPtrSet<Instruction *, 8> AddrDefs; 7329 for (BasicBlock *BB : TheLoop->blocks()) 7330 for (Instruction &I : *BB) { 7331 Instruction *PtrDef = 7332 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7333 if (PtrDef && TheLoop->contains(PtrDef) && 7334 getWideningDecision(&I, VF) != CM_GatherScatter) 7335 AddrDefs.insert(PtrDef); 7336 } 7337 7338 // Add all instructions used to generate the addresses. 7339 SmallVector<Instruction *, 4> Worklist; 7340 append_range(Worklist, AddrDefs); 7341 while (!Worklist.empty()) { 7342 Instruction *I = Worklist.pop_back_val(); 7343 for (auto &Op : I->operands()) 7344 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7345 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7346 AddrDefs.insert(InstOp).second) 7347 Worklist.push_back(InstOp); 7348 } 7349 7350 for (auto *I : AddrDefs) { 7351 if (isa<LoadInst>(I)) { 7352 // Setting the desired widening decision should ideally be handled in 7353 // by cost functions, but since this involves the task of finding out 7354 // if the loaded register is involved in an address computation, it is 7355 // instead changed here when we know this is the case. 7356 InstWidening Decision = getWideningDecision(I, VF); 7357 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7358 // Scalarize a widened load of address. 7359 setWideningDecision( 7360 I, VF, CM_Scalarize, 7361 (VF.getKnownMinValue() * 7362 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7363 else if (auto Group = getInterleavedAccessGroup(I)) { 7364 // Scalarize an interleave group of address loads. 
7365 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7366 if (Instruction *Member = Group->getMember(I)) 7367 setWideningDecision( 7368 Member, VF, CM_Scalarize, 7369 (VF.getKnownMinValue() * 7370 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7371 } 7372 } 7373 } else 7374 // Make sure I gets scalarized and a cost estimate without 7375 // scalarization overhead. 7376 ForcedScalars[VF].insert(I); 7377 } 7378 } 7379 7380 InstructionCost 7381 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7382 Type *&VectorTy) { 7383 Type *RetTy = I->getType(); 7384 if (canTruncateToMinimalBitwidth(I, VF)) 7385 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7386 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7387 auto SE = PSE.getSE(); 7388 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7389 7390 // TODO: We need to estimate the cost of intrinsic calls. 7391 switch (I->getOpcode()) { 7392 case Instruction::GetElementPtr: 7393 // We mark this instruction as zero-cost because the cost of GEPs in 7394 // vectorized code depends on whether the corresponding memory instruction 7395 // is scalarized or not. Therefore, we handle GEPs with the memory 7396 // instruction cost. 7397 return 0; 7398 case Instruction::Br: { 7399 // In cases of scalarized and predicated instructions, there will be VF 7400 // predicated blocks in the vectorized loop. Each branch around these 7401 // blocks requires also an extract of its vector compare i1 element. 7402 bool ScalarPredicatedBB = false; 7403 BranchInst *BI = cast<BranchInst>(I); 7404 if (VF.isVector() && BI->isConditional() && 7405 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7406 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7407 ScalarPredicatedBB = true; 7408 7409 if (ScalarPredicatedBB) { 7410 // Return cost for branches around scalarized and predicated blocks. 
7411 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7412 auto *Vec_i1Ty = 7413 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7414 return (TTI.getScalarizationOverhead( 7415 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7416 false, true) + 7417 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7418 VF.getKnownMinValue())); 7419 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7420 // The back-edge branch will remain, as will all scalar branches. 7421 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7422 else 7423 // This branch will be eliminated by if-conversion. 7424 return 0; 7425 // Note: We currently assume zero cost for an unconditional branch inside 7426 // a predicated block since it will become a fall-through, although we 7427 // may decide in the future to call TTI for all branches. 7428 } 7429 case Instruction::PHI: { 7430 auto *Phi = cast<PHINode>(I); 7431 7432 // First-order recurrences are replaced by vector shuffles inside the loop. 7433 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7434 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7435 return TTI.getShuffleCost( 7436 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7437 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7438 7439 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7440 // converted into select instructions. We require N - 1 selects per phi 7441 // node, where N is the number of incoming values. 
7442 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7443 return (Phi->getNumIncomingValues() - 1) * 7444 TTI.getCmpSelInstrCost( 7445 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7446 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7447 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7448 7449 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7450 } 7451 case Instruction::UDiv: 7452 case Instruction::SDiv: 7453 case Instruction::URem: 7454 case Instruction::SRem: 7455 // If we have a predicated instruction, it may not be executed for each 7456 // vector lane. Get the scalarization cost and scale this amount by the 7457 // probability of executing the predicated block. If the instruction is not 7458 // predicated, we fall through to the next case. 7459 if (VF.isVector() && isScalarWithPredication(I)) { 7460 InstructionCost Cost = 0; 7461 7462 // These instructions have a non-void type, so account for the phi nodes 7463 // that we will create. This cost is likely to be zero. The phi node 7464 // cost, if any, should be scaled by the block probability because it 7465 // models a copy at the end of each predicated block. 7466 Cost += VF.getKnownMinValue() * 7467 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7468 7469 // The cost of the non-predicated instruction. 7470 Cost += VF.getKnownMinValue() * 7471 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7472 7473 // The cost of insertelement and extractelement instructions needed for 7474 // scalarization. 7475 Cost += getScalarizationOverhead(I, VF); 7476 7477 // Scale the cost by the probability of executing the predicated blocks. 7478 // This assumes the predicated block for each vector lane is equally 7479 // likely. 
7480 return Cost / getReciprocalPredBlockProb(); 7481 } 7482 LLVM_FALLTHROUGH; 7483 case Instruction::Add: 7484 case Instruction::FAdd: 7485 case Instruction::Sub: 7486 case Instruction::FSub: 7487 case Instruction::Mul: 7488 case Instruction::FMul: 7489 case Instruction::FDiv: 7490 case Instruction::FRem: 7491 case Instruction::Shl: 7492 case Instruction::LShr: 7493 case Instruction::AShr: 7494 case Instruction::And: 7495 case Instruction::Or: 7496 case Instruction::Xor: { 7497 // Since we will replace the stride by 1 the multiplication should go away. 7498 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7499 return 0; 7500 7501 // Detect reduction patterns 7502 InstructionCost RedCost; 7503 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7504 .isValid()) 7505 return RedCost; 7506 7507 // Certain instructions can be cheaper to vectorize if they have a constant 7508 // second vector operand. One example of this are shifts on x86. 7509 Value *Op2 = I->getOperand(1); 7510 TargetTransformInfo::OperandValueProperties Op2VP; 7511 TargetTransformInfo::OperandValueKind Op2VK = 7512 TTI.getOperandInfo(Op2, Op2VP); 7513 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7514 Op2VK = TargetTransformInfo::OK_UniformValue; 7515 7516 SmallVector<const Value *, 4> Operands(I->operand_values()); 7517 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7518 return N * TTI.getArithmeticInstrCost( 7519 I->getOpcode(), VectorTy, CostKind, 7520 TargetTransformInfo::OK_AnyValue, 7521 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7522 } 7523 case Instruction::FNeg: { 7524 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7525 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7526 return N * TTI.getArithmeticInstrCost( 7527 I->getOpcode(), VectorTy, CostKind, 7528 TargetTransformInfo::OK_AnyValue, 7529 TargetTransformInfo::OK_AnyValue, 7530 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7531 I->getOperand(0), I); 7532 } 7533 case Instruction::Select: { 7534 SelectInst *SI = cast<SelectInst>(I); 7535 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7536 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7537 7538 const Value *Op0, *Op1; 7539 using namespace llvm::PatternMatch; 7540 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7541 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7542 // select x, y, false --> x & y 7543 // select x, true, y --> x | y 7544 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7545 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7546 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7547 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7548 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7549 Op1->getType()->getScalarSizeInBits() == 1); 7550 7551 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7552 return TTI.getArithmeticInstrCost( 7553 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7554 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7555 } 7556 7557 Type *CondTy = SI->getCondition()->getType(); 7558 if (!ScalarCond) 7559 CondTy = VectorType::get(CondTy, VF); 7560 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7561 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7562 } 7563 case Instruction::ICmp: 7564 case Instruction::FCmp: { 7565 Type *ValTy = I->getOperand(0)->getType(); 7566 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7567 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7568 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7569 VectorTy = ToVectorTy(ValTy, VF); 7570 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7571 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7572 } 7573 case Instruction::Store: 7574 case Instruction::Load: { 7575 ElementCount Width = VF; 7576 if (Width.isVector()) { 7577 InstWidening Decision = getWideningDecision(I, Width); 7578 assert(Decision != CM_Unknown && 7579 "CM decision should be taken at this point"); 7580 if (Decision == CM_Scalarize) 7581 Width = ElementCount::getFixed(1); 7582 } 7583 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7584 return getMemoryInstructionCost(I, VF); 7585 } 7586 case Instruction::ZExt: 7587 case Instruction::SExt: 7588 case Instruction::FPToUI: 7589 case Instruction::FPToSI: 7590 case Instruction::FPExt: 7591 case Instruction::PtrToInt: 7592 case Instruction::IntToPtr: 7593 case Instruction::SIToFP: 7594 case Instruction::UIToFP: 7595 case Instruction::Trunc: 7596 case Instruction::FPTrunc: 7597 case Instruction::BitCast: { 7598 // Computes the CastContextHint from a Load/Store instruction. 
7599 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7600 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7601 "Expected a load or a store!"); 7602 7603 if (VF.isScalar() || !TheLoop->contains(I)) 7604 return TTI::CastContextHint::Normal; 7605 7606 switch (getWideningDecision(I, VF)) { 7607 case LoopVectorizationCostModel::CM_GatherScatter: 7608 return TTI::CastContextHint::GatherScatter; 7609 case LoopVectorizationCostModel::CM_Interleave: 7610 return TTI::CastContextHint::Interleave; 7611 case LoopVectorizationCostModel::CM_Scalarize: 7612 case LoopVectorizationCostModel::CM_Widen: 7613 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7614 : TTI::CastContextHint::Normal; 7615 case LoopVectorizationCostModel::CM_Widen_Reverse: 7616 return TTI::CastContextHint::Reversed; 7617 case LoopVectorizationCostModel::CM_Unknown: 7618 llvm_unreachable("Instr did not go through cost modelling?"); 7619 } 7620 7621 llvm_unreachable("Unhandled case!"); 7622 }; 7623 7624 unsigned Opcode = I->getOpcode(); 7625 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7626 // For Trunc, the context is the only user, which must be a StoreInst. 7627 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7628 if (I->hasOneUse()) 7629 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7630 CCH = ComputeCCH(Store); 7631 } 7632 // For Z/Sext, the context is the operand, which must be a LoadInst. 7633 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7634 Opcode == Instruction::FPExt) { 7635 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7636 CCH = ComputeCCH(Load); 7637 } 7638 7639 // We optimize the truncation of induction variables having constant 7640 // integer steps. The cost of these truncations is the same as the scalar 7641 // operation. 
7642 if (isOptimizableIVTruncate(I, VF)) { 7643 auto *Trunc = cast<TruncInst>(I); 7644 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7645 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7646 } 7647 7648 // Detect reduction patterns 7649 InstructionCost RedCost; 7650 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7651 .isValid()) 7652 return RedCost; 7653 7654 Type *SrcScalarTy = I->getOperand(0)->getType(); 7655 Type *SrcVecTy = 7656 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7657 if (canTruncateToMinimalBitwidth(I, VF)) { 7658 // This cast is going to be shrunk. This may remove the cast or it might 7659 // turn it into slightly different cast. For example, if MinBW == 16, 7660 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7661 // 7662 // Calculate the modified src and dest types. 7663 Type *MinVecTy = VectorTy; 7664 if (Opcode == Instruction::Trunc) { 7665 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7666 VectorTy = 7667 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7668 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7669 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7670 VectorTy = 7671 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7672 } 7673 } 7674 7675 unsigned N; 7676 if (isScalarAfterVectorization(I, VF)) { 7677 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7678 N = VF.getKnownMinValue(); 7679 } else 7680 N = 1; 7681 return N * 7682 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7683 } 7684 case Instruction::Call: { 7685 bool NeedToScalarize; 7686 CallInst *CI = cast<CallInst>(I); 7687 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7688 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7689 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7690 return std::min(CallCost, IntrinsicCost); 7691 } 7692 return CallCost; 7693 } 7694 case 
Instruction::ExtractValue: 7695 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7696 default: 7697 // The cost of executing VF copies of the scalar instruction. This opcode 7698 // is unknown. Assume that it is the same as 'mul'. 7699 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7700 Instruction::Mul, VectorTy, CostKind) + 7701 getScalarizationOverhead(I, VF); 7702 } // end of switch. 7703 } 7704 7705 char LoopVectorize::ID = 0; 7706 7707 static const char lv_name[] = "Loop Vectorization"; 7708 7709 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7710 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7711 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7712 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7713 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7714 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7715 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7716 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7717 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7718 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7719 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7720 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7721 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7722 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7723 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7724 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7725 7726 namespace llvm { 7727 7728 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7729 7730 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7731 bool VectorizeOnlyWhenForced) { 7732 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7733 } 7734 7735 } // end namespace llvm 7736 7737 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7738 // Check if the pointer operand of a load or store instruction is 7739 // 
consecutive. 7740 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7741 return Legal->isConsecutivePtr(Ptr); 7742 return false; 7743 } 7744 7745 void LoopVectorizationCostModel::collectValuesToIgnore() { 7746 // Ignore ephemeral values. 7747 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7748 7749 // Ignore type-promoting instructions we identified during reduction 7750 // detection. 7751 for (auto &Reduction : Legal->getReductionVars()) { 7752 RecurrenceDescriptor &RedDes = Reduction.second; 7753 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7754 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7755 } 7756 // Ignore type-casting instructions we identified during induction 7757 // detection. 7758 for (auto &Induction : Legal->getInductionVars()) { 7759 InductionDescriptor &IndDes = Induction.second; 7760 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7761 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7762 } 7763 } 7764 7765 void LoopVectorizationCostModel::collectInLoopReductions() { 7766 for (auto &Reduction : Legal->getReductionVars()) { 7767 PHINode *Phi = Reduction.first; 7768 RecurrenceDescriptor &RdxDesc = Reduction.second; 7769 7770 // We don't collect reductions that are type promoted (yet). 7771 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7772 continue; 7773 7774 // If the target would prefer this reduction to happen "in-loop", then we 7775 // want to record it as such. 7776 unsigned Opcode = RdxDesc.getOpcode(); 7777 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7778 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7779 TargetTransformInfo::ReductionFlags())) 7780 continue; 7781 7782 // Check that we can correctly put the reductions into the loop, by 7783 // finding the chain of operations that leads from the phi to the loop 7784 // exit value. 
7785 SmallVector<Instruction *, 4> ReductionOperations = 7786 RdxDesc.getReductionOpChain(Phi, TheLoop); 7787 bool InLoop = !ReductionOperations.empty(); 7788 if (InLoop) { 7789 InLoopReductionChains[Phi] = ReductionOperations; 7790 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7791 Instruction *LastChain = Phi; 7792 for (auto *I : ReductionOperations) { 7793 InLoopReductionImmediateChains[I] = LastChain; 7794 LastChain = I; 7795 } 7796 } 7797 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7798 << " reduction for phi: " << *Phi << "\n"); 7799 } 7800 } 7801 7802 // TODO: we could return a pair of values that specify the max VF and 7803 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7804 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7805 // doesn't have a cost model that can choose which plan to execute if 7806 // more than one is generated. 7807 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7808 LoopVectorizationCostModel &CM) { 7809 unsigned WidestType; 7810 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7811 return WidestVectorRegBits / WidestType; 7812 } 7813 7814 VectorizationFactor 7815 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7816 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7817 ElementCount VF = UserVF; 7818 // Outer loop handling: They may require CFG and instruction level 7819 // transformations before even evaluating whether vectorization is profitable. 7820 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7821 // the vectorization pipeline. 7822 if (!OrigLoop->isInnermost()) { 7823 // If the user doesn't provide a vectorization factor, determine a 7824 // reasonable one. 
7825 if (UserVF.isZero()) { 7826 VF = ElementCount::getFixed(determineVPlanVF( 7827 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7828 .getFixedSize(), 7829 CM)); 7830 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7831 7832 // Make sure we have a VF > 1 for stress testing. 7833 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7834 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7835 << "overriding computed VF.\n"); 7836 VF = ElementCount::getFixed(4); 7837 } 7838 } 7839 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7840 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7841 "VF needs to be a power of two"); 7842 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7843 << "VF " << VF << " to build VPlans.\n"); 7844 buildVPlans(VF, VF); 7845 7846 // For VPlan build stress testing, we bail out after VPlan construction. 7847 if (VPlanBuildStressTest) 7848 return VectorizationFactor::Disabled(); 7849 7850 return {VF, 0 /*Cost*/}; 7851 } 7852 7853 LLVM_DEBUG( 7854 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7855 "VPlan-native path.\n"); 7856 return VectorizationFactor::Disabled(); 7857 } 7858 7859 Optional<VectorizationFactor> 7860 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7861 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7862 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7863 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7864 return None; 7865 7866 // Invalidate interleave groups if all blocks of loop will be predicated. 
7867 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7868 !useMaskedInterleavedAccesses(*TTI)) { 7869 LLVM_DEBUG( 7870 dbgs() 7871 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7872 "which requires masked-interleaved support.\n"); 7873 if (CM.InterleaveInfo.invalidateGroups()) 7874 // Invalidating interleave groups also requires invalidating all decisions 7875 // based on them, which includes widening decisions and uniform and scalar 7876 // values. 7877 CM.invalidateCostModelingDecisions(); 7878 } 7879 7880 ElementCount MaxVF = MaybeMaxVF.getValue(); 7881 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7882 7883 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7884 if (!UserVF.isZero() && 7885 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7886 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7887 // VFs here, this should be reverted to only use legal UserVFs once the 7888 // loop below supports scalable VFs. 7889 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7890 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7891 << " VF " << VF << ".\n"); 7892 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7893 "VF needs to be a power of two"); 7894 // Collect the instructions (and their associated costs) that will be more 7895 // profitable to scalarize. 7896 CM.selectUserVectorizationFactor(VF); 7897 CM.collectInLoopReductions(); 7898 buildVPlansWithVPRecipes(VF, VF); 7899 LLVM_DEBUG(printPlans(dbgs())); 7900 return {{VF, 0}}; 7901 } 7902 7903 assert(!MaxVF.isScalable() && 7904 "Scalable vectors not yet supported beyond this point"); 7905 7906 for (ElementCount VF = ElementCount::getFixed(1); 7907 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7908 // Collect Uniform and Scalar instructions after vectorization with VF. 
7909 CM.collectUniformsAndScalars(VF); 7910 7911 // Collect the instructions (and their associated costs) that will be more 7912 // profitable to scalarize. 7913 if (VF.isVector()) 7914 CM.collectInstsToScalarize(VF); 7915 } 7916 7917 CM.collectInLoopReductions(); 7918 7919 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7920 LLVM_DEBUG(printPlans(dbgs())); 7921 if (MaxVF.isScalar()) 7922 return VectorizationFactor::Disabled(); 7923 7924 // Select the optimal vectorization factor. 7925 auto SelectedVF = CM.selectVectorizationFactor(MaxVF); 7926 7927 // Check if it is profitable to vectorize with runtime checks. 7928 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7929 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7930 bool PragmaThresholdReached = 7931 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7932 bool ThresholdReached = 7933 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7934 if ((ThresholdReached && !Hints.allowReordering()) || 7935 PragmaThresholdReached) { 7936 ORE->emit([&]() { 7937 return OptimizationRemarkAnalysisAliasing( 7938 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7939 OrigLoop->getHeader()) 7940 << "loop not vectorized: cannot prove it is safe to reorder " 7941 "memory operations"; 7942 }); 7943 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7944 Hints.emitRemarkWithHints(); 7945 return VectorizationFactor::Disabled(); 7946 } 7947 } 7948 return SelectedVF; 7949 } 7950 7951 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7952 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7953 << '\n'); 7954 BestVF = VF; 7955 BestUF = UF; 7956 7957 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7958 return !Plan->hasVF(VF); 7959 }); 7960 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7961 } 7962 7963 void 
LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7964 DominatorTree *DT) { 7965 // Perform the actual loop transformation. 7966 7967 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 7968 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7969 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7970 7971 VPTransformState State{ 7972 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 7973 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7974 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7975 State.CanonicalIV = ILV.Induction; 7976 7977 ILV.printDebugTracesAtStart(); 7978 7979 //===------------------------------------------------===// 7980 // 7981 // Notice: any optimization or new instruction that go 7982 // into the code below should also be implemented in 7983 // the cost-model. 7984 // 7985 //===------------------------------------------------===// 7986 7987 // 2. Copy and widen instructions from the old loop into the new loop. 7988 VPlans.front()->execute(&State); 7989 7990 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7991 // predication, updating analyses. 
7992 ILV.fixVectorizedLoop(State); 7993 7994 ILV.printDebugTracesAtEnd(); 7995 } 7996 7997 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7998 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7999 for (const auto &Plan : VPlans) 8000 if (PrintVPlansInDotFormat) 8001 Plan->printDOT(O); 8002 else 8003 Plan->print(O); 8004 } 8005 #endif 8006 8007 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8008 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8009 8010 // We create new control-flow for the vectorized loop, so the original exit 8011 // conditions will be dead after vectorization if it's only used by the 8012 // terminator 8013 SmallVector<BasicBlock*> ExitingBlocks; 8014 OrigLoop->getExitingBlocks(ExitingBlocks); 8015 for (auto *BB : ExitingBlocks) { 8016 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8017 if (!Cmp || !Cmp->hasOneUse()) 8018 continue; 8019 8020 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8021 if (!DeadInstructions.insert(Cmp).second) 8022 continue; 8023 8024 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8025 // TODO: can recurse through operands in general 8026 for (Value *Op : Cmp->operands()) { 8027 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8028 DeadInstructions.insert(cast<Instruction>(Op)); 8029 } 8030 } 8031 8032 // We create new "steps" for induction variable updates to which the original 8033 // induction variables map. An original update instruction will be dead if 8034 // all its users except the induction variable are dead. 8035 auto *Latch = OrigLoop->getLoopLatch(); 8036 for (auto &Induction : Legal->getInductionVars()) { 8037 PHINode *Ind = Induction.first; 8038 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8039 8040 // If the tail is to be folded by masking, the primary induction variable, 8041 // if exists, isn't dead: it will be used for masking. Don't kill it. 
8042 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8043 continue; 8044 8045 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8046 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8047 })) 8048 DeadInstructions.insert(IndUpdate); 8049 8050 // We record as "Dead" also the type-casting instructions we had identified 8051 // during induction analysis. We don't need any handling for them in the 8052 // vectorized loop because we have proven that, under a proper runtime 8053 // test guarding the vectorized loop, the value of the phi, and the casted 8054 // value of the phi, are the same. The last instruction in this casting chain 8055 // will get its scalar/vector/widened def from the scalar/vector/widened def 8056 // of the respective phi node. Any other casts in the induction def-use chain 8057 // have no other uses outside the phi update chain, and will be ignored. 8058 InductionDescriptor &IndDes = Induction.second; 8059 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8060 DeadInstructions.insert(Casts.begin(), Casts.end()); 8061 } 8062 } 8063 8064 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8065 8066 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8067 8068 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 8069 Instruction::BinaryOps BinOp) { 8070 // When unrolling and the VF is 1, we only need to add a simple scalar. 8071 Type *Ty = Val->getType(); 8072 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8073 8074 if (Ty->isFloatingPointTy()) { 8075 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8076 8077 // Floating-point operations inherit FMF via the builder's flags. 
8078 Value *MulOp = Builder.CreateFMul(C, Step); 8079 return Builder.CreateBinOp(BinOp, Val, MulOp); 8080 } 8081 Constant *C = ConstantInt::get(Ty, StartIdx); 8082 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8083 } 8084 8085 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8086 SmallVector<Metadata *, 4> MDs; 8087 // Reserve first location for self reference to the LoopID metadata node. 8088 MDs.push_back(nullptr); 8089 bool IsUnrollMetadata = false; 8090 MDNode *LoopID = L->getLoopID(); 8091 if (LoopID) { 8092 // First find existing loop unrolling disable metadata. 8093 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8094 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8095 if (MD) { 8096 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8097 IsUnrollMetadata = 8098 S && S->getString().startswith("llvm.loop.unroll.disable"); 8099 } 8100 MDs.push_back(LoopID->getOperand(i)); 8101 } 8102 } 8103 8104 if (!IsUnrollMetadata) { 8105 // Add runtime unroll disable metadata. 8106 LLVMContext &Context = L->getHeader()->getContext(); 8107 SmallVector<Metadata *, 1> DisableOperands; 8108 DisableOperands.push_back( 8109 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8110 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8111 MDs.push_back(DisableNode); 8112 MDNode *NewLoopID = MDNode::get(Context, MDs); 8113 // Set operand 0 to refer to the loop id itself. 8114 NewLoopID->replaceOperandWith(0, NewLoopID); 8115 L->setLoopID(NewLoopID); 8116 } 8117 } 8118 8119 //===--------------------------------------------------------------------===// 8120 // EpilogueVectorizerMainLoop 8121 //===--------------------------------------------------------------------===// 8122 8123 /// This function is partially responsible for generating the control flow 8124 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The main vector loop steps by VF * UF scalar iterations per vector
  // iteration; record the rounded-down trip count for reuse in the second
  // (epilogue) pass.
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  EPI.VectorTripCount = CountRoundDown;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Skip induction resume value creation here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the vplan in the second pass still contains the inductions from the
  // original loop.

  return completeLoopSkeleton(Lp, OrigLoopID);
}

/// Print the main and epilogue loop VF/UF chosen for this two-pass
/// vectorization when the first (main-loop) skeleton pass starts.
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
           << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
           << ", Main Loop UF:" << EPI.MainLoopUF
           << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

/// Dump the function after the first pass, under the verbose debug type only.
void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
  });
}

BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
    Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
  assert(L && "Expected valid Loop.");
  assert(Bypass && "Expected valid bypass basic block.");
  // \p ForEpilogue selects whether we compare the trip count against the
  // epilogue loop's VF * UF or the main loop's VF * UF.
  unsigned VFactor =
      ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // main vector loop. Use ULE rather than ULT when a scalar epilogue is
  // required, so at least one scalar iteration always remains.
  auto P =
      Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
      "min.iters.check");

  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                   DT, LI, nullptr, "vector.ph");

  if (ForEpilogue) {
    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                                 DT->getNode(Bypass)->getIDom()) &&
           "TC check is expected to dominate Bypass");

    // Update dominator for Bypass & LoopExit.
    DT->changeImmediateDominator(Bypass, TCCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

    LoopBypassBlocks.push_back(TCCheckBlock);

    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  // Replace the unconditional terminator with the minimum-iteration branch:
  // too few iterations -> Bypass, otherwise fall through to the vector
  // preheader.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  return TCCheckBlock;
}

//===--------------------------------------------------------------------===//
// EpilogueVectorizerEpilogueLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8247 BasicBlock * 8248 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8249 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8250 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8251 8252 // Now, compare the remaining count and if there aren't enough iterations to 8253 // execute the vectorized epilogue skip to the scalar part. 8254 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8255 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8256 LoopVectorPreHeader = 8257 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8258 LI, nullptr, "vec.epilog.ph"); 8259 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8260 VecEpilogueIterationCountCheck); 8261 8262 // Adjust the control flow taking the state info from the main loop 8263 // vectorization into account. 8264 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8265 "expected this to be saved from the previous pass."); 8266 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8267 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8268 8269 DT->changeImmediateDominator(LoopVectorPreHeader, 8270 EPI.MainLoopIterationCountCheck); 8271 8272 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8273 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8274 8275 if (EPI.SCEVSafetyCheck) 8276 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8277 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8278 if (EPI.MemSafetyCheck) 8279 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8280 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8281 8282 DT->changeImmediateDominator( 8283 VecEpilogueIterationCountCheck, 8284 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8285 8286 DT->changeImmediateDominator(LoopScalarPreHeader, 8287 EPI.EpilogueIterationCountCheck); 8288 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 
8289 8290 // Keep track of bypass blocks, as they feed start values to the induction 8291 // phis in the scalar loop preheader. 8292 if (EPI.SCEVSafetyCheck) 8293 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8294 if (EPI.MemSafetyCheck) 8295 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8296 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8297 8298 // Generate a resume induction for the vector epilogue and put it in the 8299 // vector epilogue preheader 8300 Type *IdxTy = Legal->getWidestInductionType(); 8301 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8302 LoopVectorPreHeader->getFirstNonPHI()); 8303 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8304 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8305 EPI.MainLoopIterationCountCheck); 8306 8307 // Generate the induction variable. 8308 OldInduction = Legal->getPrimaryInduction(); 8309 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8310 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8311 Value *StartIdx = EPResumeVal; 8312 Induction = 8313 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8314 getDebugLocFromInstOrOperands(OldInduction)); 8315 8316 // Generate induction resume values. These variables save the new starting 8317 // indexes for the scalar loop. They are used to test if there are any tail 8318 // iterations left once the vector loop has completed. 8319 // Note that when the vectorized epilogue is skipped due to iteration count 8320 // check, then the resume value for the induction variable comes from 8321 // the trip count of the main vector loop, hence passing the AdditionalBypass 8322 // argument. 
8323 createInductionResumeValues(Lp, CountRoundDown, 8324 {VecEpilogueIterationCountCheck, 8325 EPI.VectorTripCount} /* AdditionalBypass */); 8326 8327 AddRuntimeUnrollDisableMetaData(Lp); 8328 return completeLoopSkeleton(Lp, OrigLoopID); 8329 } 8330 8331 BasicBlock * 8332 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8333 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8334 8335 assert(EPI.TripCount && 8336 "Expected trip count to have been safed in the first pass."); 8337 assert( 8338 (!isa<Instruction>(EPI.TripCount) || 8339 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8340 "saved trip count does not dominate insertion point."); 8341 Value *TC = EPI.TripCount; 8342 IRBuilder<> Builder(Insert->getTerminator()); 8343 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8344 8345 // Generate code to check if the loop's trip count is less than VF * UF of the 8346 // vector epilogue loop. 8347 auto P = 8348 Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8349 8350 Value *CheckMinIters = Builder.CreateICmp( 8351 P, Count, 8352 ConstantInt::get(Count->getType(), 8353 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8354 "min.epilog.iters.check"); 8355 8356 ReplaceInstWithInst( 8357 Insert->getTerminator(), 8358 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8359 8360 LoopBypassBlocks.push_back(Insert); 8361 return Insert; 8362 } 8363 8364 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8365 LLVM_DEBUG({ 8366 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8367 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8368 << ", Main Loop UF:" << EPI.MainLoopUF 8369 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8370 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8371 }); 8372 } 8373 8374 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8375 DEBUG_WITH_TYPE(VerboseDebug, { 8376 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8377 }); 8378 } 8379 8380 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8381 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8382 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8383 bool PredicateAtRangeStart = Predicate(Range.Start); 8384 8385 for (ElementCount TmpVF = Range.Start * 2; 8386 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8387 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8388 Range.End = TmpVF; 8389 break; 8390 } 8391 8392 return PredicateAtRangeStart; 8393 } 8394 8395 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8396 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8397 /// of VF's starting at a given VF and extending it as much as possible. Each 8398 /// vectorization decision can potentially shorten this sub-range during 8399 /// buildVPlan(). 
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
                                           ElementCount MaxVF) {
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  // Each buildVPlan call may clamp SubRange.End; resume from there so every
  // feasible VF ends up covered by exactly one VPlan.
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(buildVPlan(SubRange));
    VF = SubRange.End;
  }
}

/// Compute (and cache) the mask guarding the CFG edge Src -> Dst, i.e. the
/// predicate under which control flows along that edge in the flattened
/// vector loop. Returns nullptr to model an all-ones mask.
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  // An unconditional branch (or both successors equal) does not restrict the
  // edge beyond the source block's own mask.
  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask. Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  // Taking the false successor means the edge is guarded by the negated
  // branch condition.
  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask);

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
    VPValue *False = Plan->getOrAddVPValue(
        ConstantInt::getFalse(BI->getCondition()->getType()));
    EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}

/// Compute (and cache) the mask of lanes for which block \p BB executes in
/// the vectorized, flattened loop. Returns nullptr to model an all-ones mask.
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Create the block in mask as the first non-phi instruction in the block.
    VPBuilder::InsertPointGuard Guard(Builder);
    auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
    Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    // Start by constructing the desired canonical IV.
VPValue *IV = nullptr;
    // Reuse the primary induction as the IV when one exists; otherwise widen
    // a canonical IV just for the header-mask compare.
    if (Legal->getPrimaryInduction())
      IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
    else {
      auto IVRecipe = new VPWidenCanonicalIVRecipe();
      Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
      IV = IVRecipe->getVPValue();
    }
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    bool TailFolded = !CM.isScalarEpilogueAllowed();

    if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP instructions
      // happen.
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
    } else {
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}

/// Build a recipe widening the given load or store for the VFs in \p Range
/// where the cost model chose to widen/interleave it, clamping \p Range so
/// all remaining VFs share that decision; returns nullptr otherwise.
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  auto willWiden = [&](ElementCount VF) -> bool {
    if (VF.isScalar())
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  // Predicated accesses need the enclosing block's mask.
  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                            Mask);
}

VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
                                           ArrayRef<VPValue *> Operands) const {
  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
  if (II.getKind() == InductionDescriptor::IK_IntInduction ||
      II.getKind() == InductionDescriptor::IK_FpInduction) {
    assert(II.getStartValue() ==
           Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
    // Pass along the (single) cast of the induction, if any, so it can be
    // reproduced for the widened IV.
    const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
    return new VPWidenIntOrFpInductionRecipe(
        Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
  }

  return nullptr;
}

/// If \p I truncates an induction variable and that truncation is optimizable
/// for the whole (clamped) \p Range, fold the trunc into a widened-induction
/// recipe rooted at the induction phi; returns nullptr otherwise.
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
    VPlan &Plan) const {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          isOptimizableIVTruncate(I), Range)) {

    InductionDescriptor II =
        Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
    VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                             Start, nullptr, I);
  }
  return nullptr;
}

/// Convert a non-header phi into either the single incoming VPValue (when all
/// incoming values agree) or a VPBlendRecipe of values guarded by edge masks.
VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
                                                ArrayRef<VPValue *> Operands,
                                                VPlanPtr &Plan) {
  // If all incoming values are equal, the incoming VPValue can be used directly
  // instead of creating a new VPBlendRecipe.
VPValue *FirstIncoming = Operands[0];
  if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
        return FirstIncoming == Inc;
      })) {
    return Operands[0];
  }

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.
  SmallVector<VPValue *, 2> OperandsWithMask;
  unsigned NumIncoming = Phi->getNumIncomingValues();

  // Interleave each incoming value with the mask of its incoming edge
  // (omitted when the edge mask is all-ones).
  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    OperandsWithMask.push_back(Operands[In]);
    if (EdgeMask)
      OperandsWithMask.push_back(EdgeMask);
  }
  return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
}

/// Build a recipe widening the call \p CI for the VFs in \p Range where it is
/// profitable (as a vector intrinsic or vector library call), clamping the
/// range accordingly; returns nullptr if the call must be scalarized.
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) const {

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  // These intrinsics are dropped rather than widened.
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  auto willWiden = [&](ElementCount VF) -> bool {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use Intrinsic or a usual Call for vectorized
    // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
    bool NeedToScalarize = false;
    InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
    InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
    bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
    assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
           "Either the intrinsic cost or vector call cost must be valid");
    return UseVectorIntrinsic || !NeedToScalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  // Only the call's arguments become recipe operands (not e.g. the callee).
  ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
  return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
}

/// Return true when \p I should be widened for all VFs remaining in \p Range,
/// clamping the range so the decision is uniform across it.
bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // Instruction should be widened, unless it is scalar after vectorization,
  // scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool {
    return CM.isScalarAfterVectorization(I, VF) ||
           CM.isProfitableToScalarize(I, VF) ||
           CM.isScalarWithPredication(I, VF);
  };
  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
                                                             Range);
}

/// Widen \p I into a VPWidenRecipe if its opcode is one the vectorizer knows
/// how to widen element-wise; returns nullptr for unsupported opcodes.
VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands) const {
  // Whitelist of opcodes that can be widened one vector lane per scalar lane.
  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return nullptr;

  // Success: widen this instruction.
  return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
}

/// Replicate \p I once per lane (and unroll part). Uniform instructions get a
/// single replica; predicated ones are wrapped in an if-then region, in which
/// case the returned VPBasicBlock is the new block following that region.
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); }, Range);

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  // Subsequent recipes for this basic block go into a fresh block following
  // the replicate region.
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}

/// Wrap the predicated recipe \p PredRecipe in a triangular if-then
/// VPRegionBlock (entry: branch-on-mask, "if": the recipe, "continue": a phi
/// merging the predicated result), guarded by the block-in mask of \p Instr's
/// parent block.
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Void instructions produce no value, hence no merge phi is needed.
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ? nullptr
                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
  if (PHIRecipe) {
    Plan->removeVPValueFor(Instr);
    Plan->addVPValue(Instr, PHIRecipe);
  }
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}

/// Dispatch \p Instr to the specialized recipe builders (call, memory,
/// induction, phi/blend, trunc-of-induction, GEP, select, generic widen) and
/// return the resulting recipe or simplified VPValue; nullptr means the
/// instruction must be replicated instead.
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
      return toVPRecipeResult(Recipe);

    if (Legal->isReductionVariable(Phi)) {
      RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
      assert(RdxDesc.getRecurrenceStartValue() ==
             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
      VPValue *StartV = Operands[0];
      return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
    }

    return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return
toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

/// Build a VPlan (with VPRecipes) per decision-uniform sub-range of
/// {MinVF, ..., MaxVF}, after pruning trivially-dead instructions and
/// dropping flattened conditional assumes.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattend. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Each buildVPlanWithVPRecipes call may clamp SubRange.End; resume from
  // there so every feasible VF is covered by exactly one VPlan.
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

/// Build a single VPlan covering \p Range: scan the loop body in RPO turning
/// instructions into recipes, then apply sink-after constraints, interleave
/// groups, in-loop reduction adjustments and tail-folding selects.
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  };

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      // Header phis take only their start value as operand; all other
      // instructions take the VPValues of their IR operands.
      SmallVector<VPValue *, 4> Operands;
      auto *Phi = dyn_cast<PHINode>(Instr);
      if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
        Operands.push_back(Plan->getOrAddVPValue(
            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
      } else {
        auto OpRange = Plan->mapToVPValues(Instr->operands());
        Operands = {OpRange.begin(), OpRange.end()};
      }
      if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
              Instr, Operands, Range, Plan)) {
        // If Instr can be simplified to an existing VPValue, use it.
        if (RecipeOrValue.is<VPValue *>()) {
          Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
          continue;
        }
        // Otherwise, add the new recipe.
        VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB =
          RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    // If the target is in a replication region, make sure to move Sink to the
    // block after it, not into the replication region itself.
    if (auto *Region =
            dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) {
      if (Region->isReplicator()) {
        assert(Region->getNumSuccessors() == 1 && "Expected SESE region!");
        VPBasicBlock *NextBlock =
            cast<VPBasicBlock>(Region->getSuccessors().front());
        Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
        continue;
      }
    }
    Sink->moveAfter(Target);
  }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  for (auto IG : InterleaveGroups) {
    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
        RecipeBuilder.getRecipe(IG->getInsertPos()));
    SmallVector<VPValue *, 4> StoredValues;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
        StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));

    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
                                        Recipe->getMask());
    VPIG->insertBefore(Recipe);
    // Rewire each member's value to the corresponding defined value of the
    // interleave recipe, then erase the now-redundant member recipes.
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = Plan->getVPValue(Member);
          Plan->removeVPValueFor(Member);
          Plan->addVPValue(Member, VPIG->getVPValue(J));
          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
          J++;
        }
        RecipeBuilder.getRecipe(Member)->eraseFromParent();
      }
  }

  // Adjust the recipes for any inloop reductions.
  if (Range.Start.isVector())
    adjustRecipesForInLoopReductions(Plan, RecipeBuilder);

  // Finally, if tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      if (CM.isInLoopReduction(Reduction.first))
        continue;
      VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
      VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  // Name the plan after the VFs it covers, for debug output.
  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

/// Build a VPlan for an outer loop on the VPlan-native path: construct the
/// hierarchical CFG, optionally predicate it, and lower VPInstructions to
/// recipes.
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
9151 return Plan; 9152 } 9153 9154 SmallPtrSet<Instruction *, 1> DeadInstructions; 9155 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9156 Legal->getInductionVars(), 9157 DeadInstructions, *PSE.getSE()); 9158 return Plan; 9159 } 9160 9161 // Adjust the recipes for any inloop reductions. The chain of instructions 9162 // leading from the loop exit instr to the phi need to be converted to 9163 // reductions, with one operand being vector and the other being the scalar 9164 // reduction chain. 9165 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 9166 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 9167 for (auto &Reduction : CM.getInLoopReductionChains()) { 9168 PHINode *Phi = Reduction.first; 9169 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9170 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9171 9172 // ReductionOperations are orders top-down from the phi's use to the 9173 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 9174 // which of the two operands will remain scalar and which will be reduced. 9175 // For minmax the chain will be the select instructions. 9176 Instruction *Chain = Phi; 9177 for (Instruction *R : ReductionOperations) { 9178 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9179 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9180 9181 VPValue *ChainOp = Plan->getVPValue(Chain); 9182 unsigned FirstOpId; 9183 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9184 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9185 "Expected to replace a VPWidenSelectSC"); 9186 FirstOpId = 1; 9187 } else { 9188 assert(isa<VPWidenRecipe>(WidenRecipe) && 9189 "Expected to replace a VPWidenSC"); 9190 FirstOpId = 0; 9191 } 9192 unsigned VecOpId = 9193 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9194 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9195 9196 auto *CondOp = CM.foldTailByMasking() 9197 ? 
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9198 : nullptr; 9199 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9200 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9201 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9202 Plan->removeVPValueFor(R); 9203 Plan->addVPValue(R, RedRecipe); 9204 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9205 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 9206 WidenRecipe->eraseFromParent(); 9207 9208 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9209 VPRecipeBase *CompareRecipe = 9210 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9211 assert(isa<VPWidenRecipe>(CompareRecipe) && 9212 "Expected to replace a VPWidenSC"); 9213 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9214 "Expected no remaining users"); 9215 CompareRecipe->eraseFromParent(); 9216 } 9217 Chain = R; 9218 } 9219 } 9220 } 9221 9222 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9223 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9224 VPSlotTracker &SlotTracker) const { 9225 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9226 IG->getInsertPos()->printAsOperand(O, false); 9227 O << ", "; 9228 getAddr()->printAsOperand(O, SlotTracker); 9229 VPValue *Mask = getMask(); 9230 if (Mask) { 9231 O << ", "; 9232 Mask->printAsOperand(O, SlotTracker); 9233 } 9234 for (unsigned i = 0; i < IG->getFactor(); ++i) 9235 if (Instruction *I = IG->getMember(i)) 9236 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9237 } 9238 #endif 9239 9240 void VPWidenCallRecipe::execute(VPTransformState &State) { 9241 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9242 *this, State); 9243 } 9244 9245 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9246 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9247 this, *this, InvariantCond, State); 9248 } 9249 9250 void 
VPWidenRecipe::execute(VPTransformState &State) {
  // Widen the wrapped instruction via the InnerLoopVectorizer helper.
  State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
}

// Widen a GEP, propagating which of its pointer/index operands are known to be
// loop-invariant so the helper can keep them scalar.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
                      *this, State.UF, State.VF, IsPtrLoopInvariant,
                      IsIndexLoopInvariant, State);
}

// Widen an integer or floating-point induction. Inductions are generated once
// per vector loop, never per replicated instance.
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
                                   getTruncInst(), getVPValue(0),
                                   getCastValue(), State);
}

// Widen a header phi; RdxDesc carries the reduction descriptor when the phi is
// a reduction phi (nullptr otherwise — see recipe construction).
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc,
                                 this, State);
}

// Lower a non-header phi into a chain of selects keyed on the edge masks.
void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

// Emit the wide loads/stores for a whole interleave group at once.
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask());
}

// Emit an in-loop reduction: per unroll part, reduce the vector operand into
// the scalar chain, masking out inactive lanes with the recurrence identity
// when a condition operand is present.
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    RecurKind Kind = RdxDesc->getRecurrenceKind();
    bool IsOrdered = useOrderedReductions(*RdxDesc);
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      // Replace masked-off lanes with the recurrence identity so they do not
      // affect the reduction result.
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Constant *IdenVec =
          ConstantVector::getSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      // Ordered (strict FP) reductions thread the chain through every part.
      NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                      PrevInChain);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else {
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
          PrevInChain);
    }
    State.set(this, NextInChain, Part);
  }
}

// Replicate (scalarize) the underlying instruction: either a single requested
// instance, or all lanes of all unroll parts.
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                    *State.Instance, IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

// Replace the current block's placeholder unreachable terminator with a
// conditional branch on the (per-lane) block mask. Successors are wired up
// later, once the target blocks exist.
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    // Extract this instance's lane from a vector mask.
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

// Create the phi that merges a predicated instruction's result with the value
// flowing around the predicated block.
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    // Lanes where the predicate was false see poison; only the predicated
    // block produces a real value.
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

// Widen a load/store; only loads define a VPValue (stores pass nullptr).
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
  State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
                                        StoredValue ? nullptr : getVPValue(),
                                        getAddr(), StoredValue, getMask());
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  // 1) OptSize takes precedence over all other options, i.e.
  // if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    };
  }

  // 3) If set, obey the hints
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  };

  // 4) if the TTI hook indicates this is profitable, request predication.
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Return the generated vector value for \p Def and unroll part \p Part,
// materializing it on demand: live-ins are broadcast, and per-lane scalar
// values are either broadcast (uniform) or packed with insertelements.
Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  // No per-part value at all: this must be a live-in IR value; broadcast it.
  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));

  // Set the insert point after the last scalarized instruction. This
  // ensures the insertelement sequence will directly follow the scalar
  // definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP = std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from undef.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Undef, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  // Restore the caller's insertion point; packing was a detour.
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  // An unknown trip count makes outer-loop vectorization impossible here.
  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  {
    // Scope the RT checks and vectorizer so cleanup happens right after
    // code generation.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  // Seed the worklist with all float-typed stores in the loop.
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards searching, for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    // Only walk the def chain inside the loop, and visit each node once.
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

// Interleaving/vectorization are implicitly enabled unless explicitly disabled
// by either the pass options or the global cl::opt flags.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

// Drive legality, cost modelling, planning and code generation for a single
// loop. Returns true if the loop was transformed (vectorized and/or
// interleaved).
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!Requirements.canVectorizeFPMath(Hints)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector codegeneration is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        // Re-establish loop-simplify and LCSSA form before the second pass.
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  // Cache the analysis handles for use by processLoop.
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything end up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize.
This is necessary as 10092 // the act of vectorizing or partially unrolling a loop creates new loops 10093 // and can invalidate iterators across the loops. 10094 SmallVector<Loop *, 8> Worklist; 10095 10096 for (Loop *L : *LI) 10097 collectSupportedLoops(*L, LI, ORE, Worklist); 10098 10099 LoopsAnalyzed += Worklist.size(); 10100 10101 // Now walk the identified inner loops. 10102 while (!Worklist.empty()) { 10103 Loop *L = Worklist.pop_back_val(); 10104 10105 // For the inner loops we actually process, form LCSSA to simplify the 10106 // transform. 10107 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10108 10109 Changed |= CFGChanged |= processLoop(L); 10110 } 10111 10112 // Process each loop nest in the function. 10113 return LoopVectorizeResult(Changed, CFGChanged); 10114 } 10115 10116 PreservedAnalyses LoopVectorizePass::run(Function &F, 10117 FunctionAnalysisManager &AM) { 10118 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10119 auto &LI = AM.getResult<LoopAnalysis>(F); 10120 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10121 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10122 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10123 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10124 auto &AA = AM.getResult<AAManager>(F); 10125 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10126 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10127 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10128 MemorySSA *MSSA = EnableMSSALoopDependency 10129 ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 10130 : nullptr; 10131 10132 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10133 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10134 [&](Loop &L) -> const LoopAccessInfo & { 10135 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10136 TLI, TTI, nullptr, MSSA}; 10137 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10138 }; 10139 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10140 ProfileSummaryInfo *PSI = 10141 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10142 LoopVectorizeResult Result = 10143 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10144 if (!Result.MadeAnyChange) 10145 return PreservedAnalyses::all(); 10146 PreservedAnalyses PA; 10147 10148 // We currently do not preserve loopinfo/dominator analyses with outer loop 10149 // vectorization. Until this is addressed, mark these analyses as preserved 10150 // only for non-VPlan-native path. 10151 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10152 if (!EnableVPlanNativePath) { 10153 PA.preserve<LoopAnalysis>(); 10154 PA.preserve<DominatorTreeAnalysis>(); 10155 } 10156 PA.preserve<BasicAA>(); 10157 PA.preserve<GlobalsAA>(); 10158 if (!Result.MadeCFGChange) 10159 PA.preserveSet<CFGAnalyses>(); 10160 return PA; 10161 } 10162