//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include 
"llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/MemorySSA.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include 
"llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 
cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
If tail-folding fails, there are 210 // different fallback strategies depending on these values: 211 namespace PreferPredicateTy { 212 enum Option { 213 ScalarEpilogue = 0, 214 PredicateElseScalarEpilogue, 215 PredicateOrDontVectorize 216 }; 217 } // namespace PreferPredicateTy 218 219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 220 "prefer-predicate-over-epilogue", 221 cl::init(PreferPredicateTy::ScalarEpilogue), 222 cl::Hidden, 223 cl::desc("Tail-folding and predication preferences over creating a scalar " 224 "epilogue loop."), 225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 226 "scalar-epilogue", 227 "Don't tail-predicate loops, create scalar epilogue"), 228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 229 "predicate-else-scalar-epilogue", 230 "prefer tail-folding, create scalar epilogue if tail " 231 "folding fails."), 232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 233 "predicate-dont-vectorize", 234 "prefers tail-folding, don't attempt vectorization if " 235 "tail-folding fails."))); 236 237 static cl::opt<bool> MaximizeBandwidth( 238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 239 cl::desc("Maximize bandwidth when selecting vectorization factor which " 240 "will be determined by the smallest type in loop.")); 241 242 static cl::opt<bool> EnableInterleavedMemAccesses( 243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 244 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 245 246 /// An interleave-group may need masking if it resides in a block that needs 247 /// predication, or in order to mask away gaps. 
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 251 252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 254 cl::desc("We don't interleave loops with a estimated constant trip count " 255 "below this number")); 256 257 static cl::opt<unsigned> ForceTargetNumScalarRegs( 258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 259 cl::desc("A flag that overrides the target's number of scalar registers.")); 260 261 static cl::opt<unsigned> ForceTargetNumVectorRegs( 262 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 263 cl::desc("A flag that overrides the target's number of vector registers.")); 264 265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 267 cl::desc("A flag that overrides the target's max interleave factor for " 268 "scalar loops.")); 269 270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 272 cl::desc("A flag that overrides the target's max interleave factor for " 273 "vectorized loops.")); 274 275 static cl::opt<unsigned> ForceTargetInstructionCost( 276 "force-target-instruction-cost", cl::init(0), cl::Hidden, 277 cl::desc("A flag that overrides the target's expected cost for " 278 "an instruction to a single constant value. Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. 
This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 
311 static cl::opt<unsigned> NumberOfStoresToPredicate( 312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 313 cl::desc("Max number of stores to be predicated behind an if.")); 314 315 static cl::opt<bool> EnableIndVarRegisterHeur( 316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 317 cl::desc("Count the induction variable only once when interleaving")); 318 319 static cl::opt<bool> EnableCondStoresVectorization( 320 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 321 cl::desc("Enable if predication of stores during vectorization.")); 322 323 static cl::opt<unsigned> MaxNestedScalarReductionIC( 324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 325 cl::desc("The maximum interleave count to use when interleaving a scalar " 326 "reduction in a nested loop.")); 327 328 static cl::opt<bool> 329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 330 cl::Hidden, 331 cl::desc("Prefer in-loop vector reductions, " 332 "overriding the targets preference.")); 333 334 cl::opt<bool> EnableStrictReductions( 335 "enable-strict-reductions", cl::init(false), cl::Hidden, 336 cl::desc("Enable the vectorisation of loops with in-order (strict) " 337 "FP reductions")); 338 339 static cl::opt<bool> PreferPredicatedReductionSelect( 340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 341 cl::desc( 342 "Prefer predicating a reduction operation over an after loop select.")); 343 344 cl::opt<bool> EnableVPlanNativePath( 345 "enable-vplan-native-path", cl::init(false), cl::Hidden, 346 cl::desc("Enable VPlan-native vectorization path with " 347 "support for outer loop vectorization.")); 348 349 // FIXME: Remove this switch once we have divergence analysis. Currently we 350 // assume divergent non-backedge branches when this switch is true. 
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
/// \p I must be a LoadInst or a StoreInst (asserted below).
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  // Not a load, so (per the assert) it must be a store.
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  // Gated on the same flag that enables the PGO-based size heuristics.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overriden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  /// NOTE(review): AA is not initialized by the constructor above (it is
  /// absent from the member-init list) — verify it is assigned elsewhere
  /// before any use, or consider removing it if unused.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actually versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
817 BasicBlock *LoopScalarPreHeader; 818 819 /// Middle Block between the vector and the scalar. 820 BasicBlock *LoopMiddleBlock; 821 822 /// The (unique) ExitBlock of the scalar loop. Note that 823 /// there can be multiple exiting edges reaching this block. 824 BasicBlock *LoopExitBlock; 825 826 /// The vector loop body. 827 BasicBlock *LoopVectorBody; 828 829 /// The scalar loop body. 830 BasicBlock *LoopScalarBody; 831 832 /// A list of all bypass blocks. The first block is the entry of the loop. 833 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 834 835 /// The new Induction variable which was added to the new block. 836 PHINode *Induction = nullptr; 837 838 /// The induction variable of the old basic block. 839 PHINode *OldInduction = nullptr; 840 841 /// Store instructions that were predicated. 842 SmallVector<Instruction *, 4> PredicatedInstructions; 843 844 /// Trip count of the original loop. 845 Value *TripCount = nullptr; 846 847 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 848 Value *VectorTripCount = nullptr; 849 850 /// The legality analysis. 851 LoopVectorizationLegality *Legal; 852 853 /// The profitablity analysis. 854 LoopVectorizationCostModel *Cost; 855 856 // Record whether runtime checks are added. 857 bool AddedSafetyChecks = false; 858 859 // Holds the end values for each induction variable. We save the end values 860 // so we can later fix-up the external users of the induction variables. 861 DenseMap<PHINode *, Value *> IVEndValues; 862 863 // Vector of original scalar PHIs whose corresponding widened PHIs need to be 864 // fixed up at the end of vector code generation. 865 SmallVector<PHINode *, 8> OrigPHIsToFix; 866 867 /// BFI and PSI are used to check for profile guided size optimizations. 868 BlockFrequencyInfo *BFI; 869 ProfileSummaryInfo *PSI; 870 871 // Whether this loop should be optimized for size based on profile guided size 872 // optimizatios. 
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

/// A specialization of InnerLoopVectorizer that drives the same widening
/// machinery with a fixed vectorization factor of 1, so only the unroll
/// factor \p UnrollFactor has an effect (note the ElementCount::getFixed(1)
/// passed to the base constructor).
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  /// Scalar (VF == 1) implementations of the base-class codegen hooks;
  /// defined out of line.
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  // VF/UF selected for the main vectorized loop.
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  // VF/UF selected for the vectorized epilogue loop.
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  // Skeleton blocks and trip counts shared between the two vectorization
  // passes. NOTE(review): presumably populated while emitting the main-loop
  // skeleton and reused by the epilogue pass - confirm against the
  // createEpilogueVectorizedLoopSkeleton() implementations.
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  /// Only an epilogue unroll factor of 1 is supported (asserted below).
  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  /// Forwards EPI.MainLoopVF / EPI.MainLoopUF to the base InnerLoopVectorizer
  /// and stashes \p EPI for use by both concrete strategies.
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (ie the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
1030 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 1031 if (!I) 1032 return I; 1033 1034 DebugLoc Empty; 1035 if (I->getDebugLoc() != Empty) 1036 return I; 1037 1038 for (Use &Op : I->operands()) { 1039 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 1040 if (OpInst->getDebugLoc() != Empty) 1041 return OpInst; 1042 } 1043 1044 return I; 1045 } 1046 1047 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 1048 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 1049 const DILocation *DIL = Inst->getDebugLoc(); 1050 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 1051 !isa<DbgInfoIntrinsic>(Inst)) { 1052 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1053 auto NewDIL = 1054 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 1055 if (NewDIL) 1056 B.SetCurrentDebugLocation(NewDIL.getValue()); 1057 else 1058 LLVM_DEBUG(dbgs() 1059 << "Failed to create new discriminator: " 1060 << DIL->getFilename() << " Line: " << DIL->getLine()); 1061 } 1062 else 1063 B.SetCurrentDebugLocation(DIL); 1064 } else 1065 B.SetCurrentDebugLocation(DebugLoc()); 1066 } 1067 1068 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 1069 /// is passed, the message relates to that particular instruction. 1070 #ifndef NDEBUG 1071 static void debugVectorizationMessage(const StringRef Prefix, 1072 const StringRef DebugMsg, 1073 Instruction *I) { 1074 dbgs() << "LV: " << Prefix << DebugMsg; 1075 if (I != nullptr) 1076 dbgs() << " " << *I; 1077 else 1078 dbgs() << '.'; 1079 dbgs() << '\n'; 1080 } 1081 #endif 1082 1083 /// Create an analysis remark that explains why vectorization failed 1084 /// 1085 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1086 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1087 /// instruction that prevents vectorization. 
Otherwise \p TheLoop is used for 1088 /// the location of the remark. \return the remark object that can be 1089 /// streamed to. 1090 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1091 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1092 Value *CodeRegion = TheLoop->getHeader(); 1093 DebugLoc DL = TheLoop->getStartLoc(); 1094 1095 if (I) { 1096 CodeRegion = I->getParent(); 1097 // If there is no debug location attached to the instruction, revert back to 1098 // using the loop's. 1099 if (I->getDebugLoc()) 1100 DL = I->getDebugLoc(); 1101 } 1102 1103 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 1104 } 1105 1106 /// Return a value for Step multiplied by VF. 1107 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1108 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1109 Constant *StepVal = ConstantInt::get( 1110 Step->getType(), 1111 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1112 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1113 } 1114 1115 namespace llvm { 1116 1117 /// Return the runtime value for VF. 1118 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { 1119 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1120 return VF.isScalable() ? 
B.CreateVScale(EC) : EC; 1121 } 1122 1123 void reportVectorizationFailure(const StringRef DebugMsg, 1124 const StringRef OREMsg, const StringRef ORETag, 1125 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1126 Instruction *I) { 1127 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1128 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1129 ORE->emit( 1130 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1131 << "loop not vectorized: " << OREMsg); 1132 } 1133 1134 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1135 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1136 Instruction *I) { 1137 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1138 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1139 ORE->emit( 1140 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1141 << Msg); 1142 } 1143 1144 } // end namespace llvm 1145 1146 #ifndef NDEBUG 1147 /// \return string containing a file name and a line # for the given loop. 1148 static std::string getDebugLocString(const Loop *L) { 1149 std::string Result; 1150 if (L) { 1151 raw_string_ostream OS(Result); 1152 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1153 LoopDbgLoc.print(OS); 1154 else 1155 // Just print the module name. 1156 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1157 OS.flush(); 1158 } 1159 return Result; 1160 } 1161 #endif 1162 1163 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1164 const Instruction *Orig) { 1165 // If the loop was versioned with memchecks, add the corresponding no-alias 1166 // metadata. 
1167 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1168 LVer->annotateInstWithNoAlias(To, Orig); 1169 } 1170 1171 void InnerLoopVectorizer::addMetadata(Instruction *To, 1172 Instruction *From) { 1173 propagateMetadata(To, From); 1174 addNewMetadata(To, From); 1175 } 1176 1177 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1178 Instruction *From) { 1179 for (Value *V : To) { 1180 if (Instruction *I = dyn_cast<Instruction>(V)) 1181 addMetadata(I, From); 1182 } 1183 } 1184 1185 namespace llvm { 1186 1187 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1188 // lowered. 1189 enum ScalarEpilogueLowering { 1190 1191 // The default: allowing scalar epilogues. 1192 CM_ScalarEpilogueAllowed, 1193 1194 // Vectorization with OptForSize: don't allow epilogues. 1195 CM_ScalarEpilogueNotAllowedOptSize, 1196 1197 // A special case of vectorisation with OptForSize: loops with a very small 1198 // trip count are considered for vectorization under OptForSize, thereby 1199 // making sure the cost of their loop body is dominant, free of runtime 1200 // guards and scalar iteration overheads. 1201 CM_ScalarEpilogueNotAllowedLowTripLoop, 1202 1203 // Loop hint predicate indicating an epilogue is undesired. 1204 CM_ScalarEpilogueNotNeededUsePredicate, 1205 1206 // Directive indicating we must either tail fold or not vectorize 1207 CM_ScalarEpilogueNotAllowedUsePredicate 1208 }; 1209 1210 /// LoopVectorizationCostModel - estimates the expected speedups due to 1211 /// vectorization. 1212 /// In many cases vectorization is not profitable. This can happen because of 1213 /// a number of reasons. In this class we mainly attempt to predict the 1214 /// expected speedup/slowdowns due to the supported instruction set. We use the 1215 /// TargetTransformInfo to query the different backends for the cost of 1216 /// different operations. 
class LoopVectorizationCostModel {
public:
  /// \p SEL dictates how (or whether) a scalar epilogue may be generated; the
  /// remaining parameters are cached analyses/context used by the cost
  /// queries below.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  /// \return The vectorization factor to use for the epilogue loop, chosen
  /// from factors no larger than \p MaxVF using the plans in \p LVP.
  /// NOTE(review): contract inferred from the signature; confirm against the
  /// out-of-line definition.
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. The resulting decision map is used for
  /// building the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    // collectInstsToScalarize must have been run for this VF already.
    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    // At VF 1 everything is trivially "uniform".
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    // At VF 1 everything trivially stays scalar.
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only: the group's
    // insert position gets \p Cost, every other member gets 0 so the group
    // cost is not counted multiple times.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    // operator[] is safe here: the assert guarantees the entry exists.
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once. Presence of a Uniforms entry for \p VF marks that
    // the analysis has already been run.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    // Masked store widening requires both a consecutive pointer and target
    // support for the masked store itself.
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) const {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) const {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation. Only loads and stores are candidates.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      RecurrenceDescriptor RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;

  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue() const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    // Interleave groups may also need an epilogue (e.g. for access gaps).
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if block \p BB must be predicated, either because legality
  /// requires it or because the whole loop is masked to fold the tail.
  bool blockNeedsPredication(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  /// Number of predicated stores seen during costing.
  /// NOTE(review): the counter's exact use is outside this chunk — presumably
  /// feeds useEmulatedMaskMemRefHack; confirm at the use sites.
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF);

  /// \return the maximized element count based on the targets vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
                                          Type *VectorTy,
                                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  /// The widening decision (and its cost) taken for each (instruction, VF)
  /// pair so far.
  DecisionList WideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    // Scalar VFs, non-instructions, and values defined outside the loop (or
    // invariant in it) never require an extract.
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  }; // NOTE(review): trailing ';' after the member function body is redundant.

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) const {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

  /// Determines if we have the infrastructure to vectorize loop \p L and its
  /// epilogue, assuming the main loop is vectorized by \p VF.
  bool isCandidateForEpilogueVectorization(const Loop &L,
                                           const ElementCount VF) const;

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The function that contains the loop being vectorized.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
};
} // end namespace llvm

/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Instruction *MemRuntimeCheckCond = nullptr;

  /// Dominator tree, kept up to date while check blocks are linked/unlinked.
  DominatorTree *DT;
  /// Loop info, kept up to date while check blocks are linked/unlinked.
  LoopInfo *LI;

  /// Expander used to materialize the SCEV predicate checks.
  SCEVExpander SCEVExp;
  /// Expander used to materialize the memory runtime checks.
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      // NOTE(review): both expanders use the "scev.check" name prefix; the
      // memory-check expander presumably should say "mem.check" — confirm
      // whether the shared prefix is intentional.
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // Chain the mem-check block after the SCEV check block if both exist.
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      std::tie(std::ignore, MemRuntimeCheckCond) =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    // Give each detached check block an unreachable terminator and move its
    // original branch back onto the preheader.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    // The check blocks no longer reach the header; restore DT/LI to the
    // pre-split state.
    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
    // A null cond means the checks were emitted into the final IR (or never
    // created); mark the expander results used so the cleaners keep them.
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        SE.eraseValueFromMap(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    // Non-null conds mean the temporary check blocks were never linked back
    // into the IR; delete them.
    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }

  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition.
  /// NOTE(review): \p L and \p LoopExitBlock are not used in this body —
  /// confirm whether they can be dropped from the signature.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader,
                             BasicBlock *LoopExitBlock) {
    if (!SCEVCheckCond)
      return nullptr;
    // A constant-false check can never fail; skip emitting it.
    if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    // Drop the placeholder unreachable terminator and splice the block back
    // in front of the vector preheader.
    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    ReplaceInstWithInst(
        SCEVCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    return SCEVCheckBlock;
  }

  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition.
  /// NOTE(review): \p L is not used in this body — confirm whether it can be
  /// dropped from the signature.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);

    ReplaceInstWithInst(
        MemCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
};

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}

/// Recursively collect, into \p V, the loops in nest \p L that the vectorizer
/// can process.
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

namespace {

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  /// The new-PM implementation this legacy pass delegates to.
  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    // Gather the analyses the shared implementation needs; TLI is optional.
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // Lazily compute LoopAccessInfo per loop on demand.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}

void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Value *Start,
    Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
    VPTransformState &State) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    // Narrow both the step and the start to the truncated IV's type.
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate. For FP steps, the runtime VF is
  // materialized as an integer of matching width and converted afterwards.
  Type *StepType = Step->getType();
  if (Step->getType()->isFloatingPointTy())
    StepType = IntegerType::get(StepType->getContext(),
                                StepType->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  // handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    State.set(Def, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
                                          State, Part);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

/// Returns true if \p I should be generated as scalar (not widened) code,
/// either because the cost model decided it stays scalar after vectorization
/// or because scalarizing it is cheaper.
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

/// Returns true if induction \p IV — or any of its in-loop users — will be
/// scalarized, in which case a scalar form of the induction is needed.
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
    unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that casted Phi is equal to the
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
  // re-uses the same InductionDescriptor that original IV uses but we don't
  // have to do any recording in this case - that is done when original IV is
  // processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if exist) have no uses outside the
  // induction update chain itself.
  if (Lane < UINT_MAX)
    State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
  else
    State.set(CastDef, VectorLoopVal, Part);
}

// Widen the integer or floating-point induction \p IV (optionally as its
// truncate \p Trunc). Depending on the cost-model decisions, this emits a
// vector induction phi, a splat + step-vector of a scalar IV, per-lane scalar
// steps, or a combination, and records the results for \p Def / \p CastDef.
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
                                                TruncInst *Trunc, VPValue *Def,
                                                VPValue *CastDef,
                                                VPTransformState &State) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars().find(IV);
  assert(II != Legal->getInductionVars().end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      // Expand the step expression into IR in the vector preheader.
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               LoopVectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      // Note: Step is an in-out parameter; the truncated step is used by the
      // callers below.
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      Value *EntryPart =
          getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
                        ID.getInductionOpcode());
      State.set(Def, EntryPart, Part);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
                                            State, Part);
    }
  };

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());
  if (VF.isZero() || VF.isScalar()) {
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
                                    State);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
                                    State);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
}

// Returns a vector whose lane L holds Val[L] (+/-) (StartIdx + L) * Step,
// using the integer add/mul or the FP opcode \p BinOp as appropriate.
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF. For an FP
  // induction the step vector is first built in an equally-sized integer type
  // and converted to FP below.
  VectorType *InitVecValVTy = ValVTy;
  Type *InitVecValSTy = STy;
  if (STy->isFloatingPointTy()) {
    InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Add on StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(
      VLen, ConstantInt::get(InitVecValSTy, StartIdx));
  InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);

  if (STy->isIntegerTy()) {
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw flags,
    // which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

// Emit per-lane scalar values ScalarIV + (Part * VF + Lane) * Step for the
// scalarized users of the induction, recording each into \p State. For
// scalable VF with a non-uniform EntryVal, a vectorized form is recorded per
// part as well.
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID,
                                           VPValue *Def, VPValue *CastDef,
                                           VPTransformState &State) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  bool IsUniform =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
  unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                     ScalarIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!IsUniform && VF.isScalable()) {
    // Hoist the loop-invariant splats/step-vector used by every part below.
    VecIVTy = VectorType::get(ScalarIVTy, VF);
    UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
    SplatStep = Builder.CreateVectorSplat(VF, Step);
    SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
  }

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *StartIdx0 =
        createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);

    if (!IsUniform && VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (ScalarIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(Def, Add, Part);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
                                            Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (ScalarIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);

    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
      State.set(Def, Add, VPIteration(Part, Lane));
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
                                            Part, Lane);
    }
  }
}

// Insert the scalar value recorded for (Part, Lane) of \p Def into the
// corresponding lane of the per-part vector value, and store the updated
// vector back into \p State.
void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
                                                    const VPIteration &Instance,
                                                    VPTransformState &State) {
  Value *ScalarInst = State.get(Def, Instance);
  Value *VectorValue = State.get(Def, Instance.Part);
  VectorValue = Builder.CreateInsertElement(
      VectorValue, ScalarInst,
      Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
  State.set(Def, VectorValue, Instance.Part);
}

// Emit an intrinsic/shuffle that reverses the lane order of \p Vec.
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  return Builder.CreateVectorReverse(Vec, "reverse");
}

// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    setDebugLocFromInst(Builder, AddrPart);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  // A gap mask is needed when the last member(s) of the group are missing and
  // we cannot rely on a scalar epilogue to execute the trailing accesses.
  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          // Replicate each lane of the block mask across the group's members,
          // then AND in the gap mask if one is needed.
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart,
              createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
                                     GroupMask, PoisonVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");

      Value *StoredVec = State.get(StoredValues[i], Part);

      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Value *IVec = Builder.CreateShuffleVector(
        WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
        "interleaved.vec");

    Instruction *NewStoreInstr;
    if (BlockInMask) {
      Value *BlockInMaskPart = State.get(BlockInMask, Part);
      Value *ShuffledMask = Builder.CreateShuffleVector(
          BlockInMaskPart,
          createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
          "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    }
    else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}

// Widen a single load or store according to the cost model's decision:
// consecutive (optionally reversed) wide load/store, or gather/scatter.
void InnerLoopVectorizer::vectorizeMemoryInstruction(
    Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
    VPValue *StoredValue, VPValue *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert((Decision == LoopVectorizationCostModel::CM_Widen ||
          Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
          Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
         "CM decision is not to widen the memory instruction");

  Type *ScalarDataTy = getMemInstValueType(Instr);

  auto *DataTy = VectorType::get(ScalarDataTy, VF);
  const Align Alignment = getLoadStoreAlignment(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");
  (void)ConsecutiveStride;

  VectorParts BlockInMaskParts(UF);
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    for (unsigned Part = 0; Part < UF; ++Part)
      BlockInMaskParts[Part] = State.get(BlockInMask, Part);

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    // Preserve the inbounds flag of the original (possibly casted) pointer.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF =  VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
      PartPtr =
          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
    } else {
      Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(Addr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(Addr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
            "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }

    State.set(Def, NewLI, Part);
  }
}

// Clone \p Instr once for the given (Part, Lane) \p Instance, rewiring its
// operands to their scalar equivalents in the new loop, and record the clone
// in \p State. Clones in predicated blocks are queued for later sinking.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
                                               VPUser &User,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Instr))
    if (!Instance.isFirstIteration())
      return;

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                               Builder.GetInsertPoint());
  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
    auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
    auto InputInstance = Instance;
    // Operands defined outside the loop or uniform after vectorization only
    // have a value for the first lane.
    if (!Operand || !OrigLoop->contains(Operand) ||
        (Cost->isUniformAfterVectorization(Operand, State.VF)))
      InputInstance.Lane = VPLane::getFirstLane();
    auto *NewOp = State.get(User.getOperand(op), InputInstance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  State.set(Def, Cloned, Instance);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    AC->registerAssumption(II);

  // End if-block.
3055 if (IfPredicateInstr) 3056 PredicatedInstructions.push_back(Cloned); 3057 } 3058 3059 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3060 Value *End, Value *Step, 3061 Instruction *DL) { 3062 BasicBlock *Header = L->getHeader(); 3063 BasicBlock *Latch = L->getLoopLatch(); 3064 // As we're just creating this loop, it's possible no latch exists 3065 // yet. If so, use the header as this will be a single block loop. 3066 if (!Latch) 3067 Latch = Header; 3068 3069 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3070 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3071 setDebugLocFromInst(Builder, OldInst); 3072 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3073 3074 Builder.SetInsertPoint(Latch->getTerminator()); 3075 setDebugLocFromInst(Builder, OldInst); 3076 3077 // Create i+1 and fill the PHINode. 3078 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3079 Induction->addIncoming(Start, L->getLoopPreheader()); 3080 Induction->addIncoming(Next, Latch); 3081 // Create the compare. 3082 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3083 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3084 3085 // Now we have two terminators. Remove the old one from the block. 3086 Latch->getTerminator()->eraseFromParent(); 3087 3088 return Induction; 3089 } 3090 3091 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3092 if (TripCount) 3093 return TripCount; 3094 3095 assert(L && "Create Trip Count for null loop."); 3096 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3097 // Find the loop boundaries. 
3098 ScalarEvolution *SE = PSE.getSE(); 3099 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3100 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3101 "Invalid loop count"); 3102 3103 Type *IdxTy = Legal->getWidestInductionType(); 3104 assert(IdxTy && "No type for induction"); 3105 3106 // The exit count might have the type of i64 while the phi is i32. This can 3107 // happen if we have an induction variable that is sign extended before the 3108 // compare. The only way that we get a backedge taken count is that the 3109 // induction variable was signed and as such will not overflow. In such a case 3110 // truncation is legal. 3111 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3112 IdxTy->getPrimitiveSizeInBits()) 3113 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3114 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3115 3116 // Get the total trip count from the count by adding 1. 3117 const SCEV *ExitCount = SE->getAddExpr( 3118 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3119 3120 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3121 3122 // Expand the trip count and place the new instructions in the preheader. 3123 // Notice that the pre-header does not change, only the loop body. 3124 SCEVExpander Exp(*SE, DL, "induction"); 3125 3126 // Count holds the overall loop count (N). 
3127 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3128 L->getLoopPreheader()->getTerminator()); 3129 3130 if (TripCount->getType()->isPointerTy()) 3131 TripCount = 3132 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3133 L->getLoopPreheader()->getTerminator()); 3134 3135 return TripCount; 3136 } 3137 3138 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3139 if (VectorTripCount) 3140 return VectorTripCount; 3141 3142 Value *TC = getOrCreateTripCount(L); 3143 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3144 3145 Type *Ty = TC->getType(); 3146 // This is where we can make the step a runtime constant. 3147 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3148 3149 // If the tail is to be folded by masking, round the number of iterations N 3150 // up to a multiple of Step instead of rounding down. This is done by first 3151 // adding Step-1 and then rounding down. Note that it's ok if this addition 3152 // overflows: the vector induction variable will eventually wrap to zero given 3153 // that it starts at zero and its Step is a power of two; the loop will then 3154 // exit, with the last early-exit vector comparison also producing all-true. 3155 if (Cost->foldTailByMasking()) { 3156 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3157 "VF*UF must be a power of 2 when folding tail by masking"); 3158 assert(!VF.isScalable() && 3159 "Tail folding not yet supported for scalable vectors"); 3160 TC = Builder.CreateAdd( 3161 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3162 } 3163 3164 // Now we need to generate the expression for the part of the loop that the 3165 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3166 // iterations are not required for correctness, or N - Step, otherwise. 
  // Step is equal to the vectorization factor (number of SIMD elements) times
  // the unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are two cases where we need to ensure (at least) the last iteration
  // runs in the scalar remainder loop. Thus, if the step evenly divides
  // the trip count, we set the remainder to be equal to the step. If the step
  // does not evenly divide the trip count, no adjustment is necessary since
  // there will already be scalar iterations. Note that the minimum iterations
  // check ensures that N >= Step. The cases are:
  // 1) If there is a non-reversed interleaved group that may speculatively
  //    access memory out-of-bounds.
  // 2) If any instruction may follow a conditionally taken exit. That is, if
  //    the loop contains multiple exiting blocks, or a single exiting block
  //    which is not the latch.
  if (VF.isVector() && Cost->requiresScalarEpilogue()) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}

// Cast the fixed-width vector `V` to `DstVTy`, which must have the same
// element count and bit-equal element size. If the element types are not
// directly bit/noop-castable (pointer <-> float), go through an integer
// vector of matching width: Ptr <-> Int <-> Float.
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
  unsigned VF = DstFVTy->getNumElements();
  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstFVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstFVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = FixedVectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}

// Emit the guard that skips the vector loop (branching to `Bypass`) when the
// trip count is too small for even one VF * UF iteration. The current
// preheader becomes the check block and a fresh "vector.ph" is split off.
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                          : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking()) {
    Value *Step =
        createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
  }
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit.
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}

// Emit the runtime SCEV-assumption checks (delegated to RTChecks) and record
// the resulting block as a bypass block. Returns nullptr when no SCEV check
// is needed.
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {

  BasicBlock *const SCEVCheckBlock =
      RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
  if (!SCEVCheckBlock)
    return nullptr;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
  return SCEVCheckBlock;
}

// Emit the runtime memory-overlap checks (delegated to RTChecks), record the
// block as a bypass block, and set up LoopVersioning purely for its noalias
// metadata. Returns nullptr when no memory check is needed.
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
                                                      BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return nullptr;

  BasicBlock *const MemCheckBlock =
      RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);

  // Check if we generated code that checks in runtime if arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  if (!MemCheckBlock)
    return nullptr;

  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  LoopBypassBlocks.push_back(MemCheckBlock);

  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(
      *Legal->getLAI(),
      Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
      DT, PSE.getSE());
  LVer->prepareNoAliasMetadata();
  return MemCheckBlock;
}

// Compute the value of the induction described by `ID` at iteration `Index`,
// i.e. Start + Index * Step (or the GEP / FP equivalent), emitting IR via `B`.
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType()->getScalarType() == Step->getType() &&
         "Index scalar type does not match StepValue type");

  // Note: the IR at this point is broken.
  // We cannot use SE to create any new SCEV and then expand it, hoping that
  // SCEV's simplification will give us a more optimal code. Unfortunately,
  // attempt of doing so on invalid IR may lead to various SCEV crashes. So all
  // we can do is to use builder and rely on InstCombine for future
  // simplifications. Here we handle some trivial cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  };

  // Get a suitable insert point for SCEV expansion. For blocks in the vector
  // loop, choose the end of the vector loop header (=LoopVectorBody), because
  // the DomTree is not kept up-to-date for additional blocks generated in the
  // vector loop. By using the header as insertion point, we guarantee that the
  // expanded instructions dominate all their uses.
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Step == -1 folds to a plain subtraction.
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
                                    GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

// Split the original preheader into the block structure of the vector loop
// skeleton (middle block, scalar preheader, vector body), retarget the
// middle-block branch, and register the new loop with LoopInfo.
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  LoopExitBlock = OrigLoop->getUniqueExitBlock();
  assert(LoopExitBlock && "Must have an exit block");
  assert(LoopVectorPreHeader && "Invalid loop structure");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  // Set up branch from middle block to the exit and scalar preheader blocks.
  // completeLoopSkeleton will update the condition to use an iteration check,
  // if required to decide whether to execute the remainder.
  BranchInst *BrInst =
      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock to update LoopInfo since
  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}

// Create "bc.resume.val" phis in the scalar preheader so the scalar remainder
// loop resumes each induction either at its end-of-vector-loop value or, via
// a bypass edge, at its original start value.
void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = VectorTripCount;
    } else {
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      // Derive the end value by evaluating the induction at the vector trip
      // count, first casting the count to the induction's step type.
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");

      // Compute the end value for the additional bypass (if applicable).
      if (AdditionalBypass.first) {
        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
                                         StepType, true);
        CRD =
            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
        EndValueFromAdditionalBypass =
            emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
        EndValueFromAdditionalBypass->setName("ind.end");
      }
    }
    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);

    if (AdditionalBypass.first)
      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
                                            EndValueFromAdditionalBypass);

    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}

// Finish the skeleton: install the middle-block "did the vector loop cover
// all iterations?" compare (unless tail-folded), position the builder in the
// vector body, and carry loop metadata/hints over to the new loop. Returns
// the vector loop preheader.
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
                                                      MDNode *OrigLoopID) {
  assert(L && "Expected valid loop.");

  // The trip counts should be cached by now.
  Value *Count = getOrCreateTripCount(L);
  Value *VectorTripCount = getOrCreateVectorTripCount(L);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  // If tail is to be folded, we know we don't need to run the remainder.
  if (!Cost->foldTailByMasking()) {
    Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                                        Count, VectorTripCount, "cmp.n",
                                        LoopMiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
    cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
  }

  // Get ready to start creating new instructions into the vectorized body.
  assert(LoopVectorPreHeader == L->getLoopPreheader() &&
         "Inconsistent vector loop preheader");
  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    L->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    L->setLoopID(LID);

  LoopVectorizeHints Hints(L, true, *ORE);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}

// Top-level driver that materializes the full vector-loop skeleton around the
// original scalar loop (see diagram below), then returns the preheader of the
// completed skeleton.
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
      /   |
     /    v
    |    [ ] <-- vector loop bypass (may consist of multiple blocks).
    |  /  |
    | /   v
    ||   [ ]     <-- vector pre header.
    |/    |
    |     v
    |    [  ] \
    |    [  ]_|   <-- vector loop.
    |     |
    |     v
    |   -[ ]   <--- middle-block.
    |  /  |
    | /   v
   -|- >[ ]     <--- new preheader.
    |    |
    |    v
    |   [ ] \
    |   [ ]_|   <-- old scalar loop to handle remainder.
     \   |
      \  v
       >[ ]     <-- exit block.
   ...
   */

  // Get the metadata of the original loop before it gets modified.
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  // Workaround! Compute the trip count of the original loop and cache it
  // before we start modifying the CFG. This code has a systemic problem
  // wherein it tries to run analysis over partially constructed IR; this is
  // wrong, and not simply for SCEV. The trip count of the original loop
  // simply happens to be prone to hitting this in practice. In theory, we
  // can hit the same issue for any SCEV, or ValueTracking query done during
  // mutation. See PR49900.
  getOrCreateTripCount(OrigLoop);

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  Loop *Lp = createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible.
However if we don't find one that: 3684 // - is an integer 3685 // - counts from zero, stepping by one 3686 // - is the size of the widest induction variable type 3687 // then we create a new one. 3688 OldInduction = Legal->getPrimaryInduction(); 3689 Type *IdxTy = Legal->getWidestInductionType(); 3690 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3691 // The loop step is equal to the vectorization factor (num of SIMD elements) 3692 // times the unroll factor (num of SIMD instructions). 3693 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3694 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3695 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3696 Induction = 3697 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3698 getDebugLocFromInstOrOperands(OldInduction)); 3699 3700 // Emit phis for the new starting index of the scalar loop. 3701 createInductionResumeValues(Lp, CountRoundDown); 3702 3703 return completeLoopSkeleton(Lp, OrigLoopID); 3704 } 3705 3706 // Fix up external users of the induction variable. At this point, we are 3707 // in LCSSA form, with all external PHIs that use the IV having one input value, 3708 // coming from the remainder loop. We need those PHIs to also have a correct 3709 // value for the IV when arriving directly from the middle block. 3710 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3711 const InductionDescriptor &II, 3712 Value *CountRoundDown, Value *EndValue, 3713 BasicBlock *MiddleBlock) { 3714 // There are two kinds of external IV usages - those that use the value 3715 // computed in the last iteration (the PHI) and those that use the penultimate 3716 // value (the value that feeds into the phi from the loop latch). 3717 // We allow both, but they, obviously, have different values. 
3718 3719 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3720 3721 DenseMap<Value *, Value *> MissingVals; 3722 3723 // An external user of the last iteration's value should see the value that 3724 // the remainder loop uses to initialize its own IV. 3725 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3726 for (User *U : PostInc->users()) { 3727 Instruction *UI = cast<Instruction>(U); 3728 if (!OrigLoop->contains(UI)) { 3729 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3730 MissingVals[UI] = EndValue; 3731 } 3732 } 3733 3734 // An external user of the penultimate value need to see EndValue - Step. 3735 // The simplest way to get this is to recompute it from the constituent SCEVs, 3736 // that is Start + (Step * (CRD - 1)). 3737 for (User *U : OrigPhi->users()) { 3738 auto *UI = cast<Instruction>(U); 3739 if (!OrigLoop->contains(UI)) { 3740 const DataLayout &DL = 3741 OrigLoop->getHeader()->getModule()->getDataLayout(); 3742 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3743 3744 IRBuilder<> B(MiddleBlock->getTerminator()); 3745 3746 // Fast-math-flags propagate from the original induction instruction. 3747 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3748 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3749 3750 Value *CountMinusOne = B.CreateSub( 3751 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3752 Value *CMO = 3753 !II.getStep()->getType()->isIntegerTy() 3754 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3755 II.getStep()->getType()) 3756 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3757 CMO->setName("cast.cmo"); 3758 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3759 Escape->setName("ind.escape"); 3760 MissingVals[UI] = Escape; 3761 } 3762 } 3763 3764 for (auto &I : MissingVals) { 3765 PHINode *PHI = cast<PHINode>(I.first); 3766 // One corner case we have to handle is two IVs "chasing" each-other, 3767 // that is %IV2 = phi [...], [ %IV1, %latch ] 3768 // In this case, if IV1 has an external use, we need to avoid adding both 3769 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3770 // don't already have an incoming value for the middle block. 3771 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3772 PHI->addIncoming(I.second, MiddleBlock); 3773 } 3774 } 3775 3776 namespace { 3777 3778 struct CSEDenseMapInfo { 3779 static bool canHandle(const Instruction *I) { 3780 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3781 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3782 } 3783 3784 static inline Instruction *getEmptyKey() { 3785 return DenseMapInfo<Instruction *>::getEmptyKey(); 3786 } 3787 3788 static inline Instruction *getTombstoneKey() { 3789 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3790 } 3791 3792 static unsigned getHashValue(const Instruction *I) { 3793 assert(canHandle(I) && "Unknown instruction!"); 3794 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3795 I->value_op_end())); 3796 } 3797 3798 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3799 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3800 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3801 return LHS == RHS; 3802 return LHS->isIdenticalTo(RHS); 3803 } 3804 }; 3805 3806 } // end anonymous namespace 3807 3808 ///Perform cse of induction variable instructions. 
static void cse(BasicBlock *BB) {
  // Perform simple cse: a single forward pass that folds each handled
  // instruction into the first structurally-identical one seen.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    // Advance the iterator before any potential erasure of *I.
    Instruction *In = &*I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}

/// Compute the cost of calling \p CI with vectorization factor \p VF, either
/// as a scalarized call sequence or via a vector library function. Sets
/// \p NeedToScalarize to false iff a cheaper vector variant was found.
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
                                              bool &NeedToScalarize) const {
  Function *F = CI->getCalledFunction();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->arg_operands())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
  if (VF.isScalar())
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

  // Total scalarized cost: one scalar call per lane plus the pack/unpack
  // overhead. For scalable VF this uses the known minimum lane count.
  InstructionCost Cost =
      ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
  NeedToScalarize = true;
  VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

  if (!TLI || CI->isNoBuiltin() || !VecFunc)
    return Cost;

  // If the corresponding vector cost is cheaper, return its cost.
  InstructionCost VectorCallCost =
      TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }
  return Cost;
}

/// Widen \p Elt to a vector of \p VF elements, but only for int/ptr/FP
/// element types and non-scalar VF; otherwise return \p Elt unchanged.
static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
    return Elt;
  return VectorType::get(Elt, VF);
}

/// Compute the cost of the vectorized form of the intrinsic called by \p CI
/// at vectorization factor \p VF.
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");
  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
  // Fast-math flags on the call affect which lowering is legal, so forward
  // them to the cost query.
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  SmallVector<Type *> ParamTys;
  std::transform(FTy->param_begin(), FTy->param_end(),
                 std::back_inserter(ParamTys),
                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  return TTI.getIntrinsicInstrCost(CostAttrs,
                                   TargetTransformInfo::TCK_RecipThroughput);
}

/// Of the two integer vector types, return the one with the narrower element.
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

/// Of the two integer vector types, return the one with the wider element.
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}

void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    VPValue *Def = State.Plan->getVPValue(KV.first);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      // Skip values we already erased, dead values, and non-instructions
      // (e.g. constants) which cannot be rewritten in place.
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      auto *TruncatedTy = FixedVectorType::get(
          ScalarTruncatedTy,
          cast<FixedVectorType>(OriginalTy)->getNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      // Narrow an operand to TruncatedTy; peek through a zext whose source is
      // already the target type to avoid a redundant trunc(zext(x)) pair.
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        // Only the selected values are narrowed; the condition keeps its type.
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // The two shuffle inputs may have different element counts; narrow
        // each to its own fixed vector of the truncated scalar type.
        auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
                             ->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
                             ->getNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1),
            FixedVectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
                            ->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
                            ->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      State.reset(Def, Res, Part);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    VPValue *Def = State.Plan->getVPValue(KV.first);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        // Replace the dead zext's mapping with its (narrow) source value.
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs(State);
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Fix-up external users of the induction variables.
  for (auto &Entry : Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs(State);
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them.
  // Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
  for (VPRecipeBase &R : Header->phis()) {
    auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
    if (!PhiR)
      continue;
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // A recurrence descriptor marks a reduction phi; otherwise check for a
    // first-order recurrence. All other header phis need no fixup here.
    if (PhiR->getRecurrenceDescriptor()) {
      fixReduction(PhiR, State);
    } else if (Legal->isFirstOrderRecurrence(OrigPhi))
      fixFirstOrderRecurrence(OrigPhi, State);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
                                                 VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  auto *IdxTy = Builder.getInt32Ty();
  auto *One = ConstantInt::get(IdxTy, 1);

  // Create a vector from the initial value: place it in the last lane of an
  // otherwise-poison vector, as only that lane is read by the splice below.
  auto *VectorInit = ScalarInit;
  if (VF.isVector()) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VectorType::get(VectorInit->getType(), VF)),
        VectorInit, LastIdx, "vector.recur.init");
  }

  VPValue *PhiDef = State.Plan->getVPValue(Phi);
  VPValue *PreviousDef = State.Plan->getVPValue(Previous);
  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = State.get(PreviousDef, UF - 1);

  // Find and set the insertion point after the previous value if it is an
  // instruction.
  BasicBlock::iterator InsertPt;
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop.
  // FIXME: Loop invariant values do not form recurrences. We should deal with
  //        them earlier.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
    InsertPt = LoopVectorBody->getFirstInsertionPt();
  else {
    Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
    if (isa<PHINode>(PreviousLastPart))
      // If the previous value is a phi node, we should insert after all the phi
      // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different to
      // LoopVectorBody, in case we predicate the loop.
      InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
    else
      InsertPt = ++PreviousInst->getIterator();
  }
  Builder.SetInsertPoint(&*InsertPt);

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = State.get(PreviousDef, Part);
    Value *PhiPart = State.get(PhiDef, Part);
    // Splice(-1) takes the last lane of Incoming followed by the first
    // VF-1 lanes of PreviousPart; for scalar VF just forward Incoming.
    auto *Shuffle = VF.isVector()
                        ? Builder.CreateVectorSplice(Incoming, PreviousPart, -1)
                        : Incoming;
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    State.reset(PhiDef, Shuffle, Part);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF.isVector()) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
                                                    "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector()) {
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Idx, "vector.recur.extract.for.phi");
  } else if (UF > 1)
    // When loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
    // of `Incoming`. This is analogous to the vectorized case above: extracting
    // the second last element when VF > 1.
    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, the exiting path through middle will be
  // dynamically dead and the value picked for the phi doesn't matter.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis())
    if (any_of(LCSSAPhi.incoming_values(),
               [Phi](Value *V) { return V == Phi; }))
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}

/// Returns true if the recurrence described by \p RdxDesc must be performed
/// in program order (strict FP reductions), gated by the cl::opt flag.
static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
  return EnableStrictReductions && RdxDesc.isOrdered();
}

void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(OrigPhi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = *PhiR->getRecurrenceDescriptor();

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  setDebugLocFromInst(Builder, ReductionStartValue);
  bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi);

  VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(RdxDesc, State);

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();

  bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
                   useOrderedReductions(RdxDesc);

  for (unsigned Part = 0; Part < UF; ++Part) {
    // Ordered reductions are chained through a single phi; only part 0 is
    // fixed up, and it receives the final (UF - 1) backedge value.
    if (IsOrdered && Part > 0)
      break;
    Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
    Value *Val = State.get(PhiR->getBackedgeValue(), Part);
    if (IsOrdered)
      Val = State.get(PhiR->getBackedgeValue(), UF - 1);

    cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  Type *PhiTy = OrigPhi->getType();
  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former. For an inloop reduction the reduction will already
  // be predicated, and does not need to be handled here.
  if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      Value *Sel = nullptr;
      // Find the unique select user created by tail folding.
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

      // If the target can create a predicated operator for the reduction at no
      // extra cost in the loop (for example a predicated vadd), it can be
      // cheaper for the select to remain in the loop than be sunk out of it,
      // and so use the select value for the phi instead of the old
      // LoopExitValue.
      if (PreferPredicatedReductionSelect ||
          TTI->preferPredicatedReductionSelect(
              RdxDesc.getOpcode(), PhiTy,
              TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi =
            cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
        VecRdxPhi->setIncomingValueForBlock(
            LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
      }
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
    assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = State.get(LoopExitInstDef, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      // Redirect all users except the trunc we just made to the re-extended
      // value, so the wide value only feeds the trunc/ext pair.
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      State.reset(LoopExitInstDef, RdxParts[Part], Part);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
  unsigned Op = RecurrenceDescriptor::getOpcode(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks. See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis())
    if (any_of(LCSSAPhi.incoming_values(),
               [LoopExitInst](Value *V) { return V == LoopExitInst; }))
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}

// Strip nuw/nsw from the vectorized add/mul reduction chain: wrapping that
// was impossible in the scalar evaluation order may occur once the operations
// are reassociated across lanes/parts.
void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                                                 VPTransformState &State) {
  RecurKind RK = RdxDesc.getRecurrenceKind();
  // Only integer add/mul reductions carry wrap flags.
  if (RK != RecurKind::Add && RK != RecurKind::Mul)
    return;

  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
  assert(LoopExitInstr && "null loop exit instruction");
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  Worklist.push_back(LoopExitInstr);
  Visited.insert(LoopExitInstr);

  while (!Worklist.empty()) {
    Instruction *Cur = Worklist.pop_back_val();
    if (isa<OverflowingBinaryOperator>(Cur))
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = State.get(State.Plan->getVPValue(Cur), Part);
        cast<Instruction>(V)->dropPoisonGeneratingFlags();
      }

    // Follow users around the reduction cycle, but don't walk out of the
    // loop through the exit instruction's external users.
    for (User *U : Cur->users()) {
      Instruction *UI = cast<Instruction>(U);
      if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
          Visited.insert(UI).second)
        Worklist.push_back(UI);
    }
  }
}

void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
      // Some phis were already hand updated by the reduction and recurrence
      // code above, leave them alone.
      continue;

    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
    // Non-instruction incoming values will have only one value.

    // Uniform-after-vectorization values live in lane 0; otherwise the value
    // escaping the loop is in the last lane of the last unrolled part.
    VPLane Lane = VPLane::getFirstLane();
    if (isa<Instruction>(IncomingValue) &&
        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
                                           VF))
      Lane = VPLane::getLastLaneForVF(VF);

    // Can be a loop invariant incoming value or the last scalar value to be
    // extracted from the vectorized loop.
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    Value *lastIncomingValue =
        OrigLoop->isLoopInvariant(IncomingValue)
            ? IncomingValue
            : State.get(State.Plan->getVPValue(IncomingValue),
                        VPIteration(UF - 1, Lane));
    LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
  }
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
4576 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4577 auto *I = cast<Instruction>(U.getUser()); 4578 BasicBlock *BB = I->getParent(); 4579 if (auto *Phi = dyn_cast<PHINode>(I)) 4580 BB = Phi->getIncomingBlock( 4581 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4582 return BB == PredBB; 4583 }; 4584 4585 // Iteratively sink the scalarized operands of the predicated instruction 4586 // into the block we created for it. When an instruction is sunk, it's 4587 // operands are then added to the worklist. The algorithm ends after one pass 4588 // through the worklist doesn't sink a single instruction. 4589 bool Changed; 4590 do { 4591 // Add the instructions that need to be reanalyzed to the worklist, and 4592 // reset the changed indicator. 4593 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4594 InstsToReanalyze.clear(); 4595 Changed = false; 4596 4597 while (!Worklist.empty()) { 4598 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4599 4600 // We can't sink an instruction if it is a phi node, is already in the 4601 // predicated block, is not in the loop, or may have side effects. 4602 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4603 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4604 continue; 4605 4606 // It's legal to sink the instruction if all its uses occur in the 4607 // predicated block. Otherwise, there's nothing to do yet, and we may 4608 // need to reanalyze the instruction. 4609 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4610 InstsToReanalyze.push_back(I); 4611 continue; 4612 } 4613 4614 // Move the instruction to the beginning of the predicated block, and add 4615 // it's operands to the worklist. 4616 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4617 Worklist.insert(I->op_begin(), I->op_end()); 4618 4619 // The sinking may have enabled other instructions to be sunk, so we will 4620 // need to iterate. 
4621 Changed = true; 4622 } 4623 } while (Changed); 4624 } 4625 4626 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4627 for (PHINode *OrigPhi : OrigPHIsToFix) { 4628 VPWidenPHIRecipe *VPPhi = 4629 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4630 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4631 // Make sure the builder has a valid insert point. 4632 Builder.SetInsertPoint(NewPhi); 4633 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4634 VPValue *Inc = VPPhi->getIncomingValue(i); 4635 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4636 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4637 } 4638 } 4639 } 4640 4641 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4642 VPUser &Operands, unsigned UF, 4643 ElementCount VF, bool IsPtrLoopInvariant, 4644 SmallBitVector &IsIndexLoopInvariant, 4645 VPTransformState &State) { 4646 // Construct a vector GEP by widening the operands of the scalar GEP as 4647 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4648 // results in a vector of pointers when at least one operand of the GEP 4649 // is vector-typed. Thus, to keep the representation compact, we only use 4650 // vector-typed operands for loop-varying values. 4651 4652 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4653 // If we are vectorizing, but the GEP has only loop-invariant operands, 4654 // the GEP we build (by only using vector-typed operands for 4655 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4656 // produce a vector of pointers, we need to either arbitrarily pick an 4657 // operand to broadcast, or broadcast a clone of the original GEP. 4658 // Here, we broadcast a clone of the original. 4659 // 4660 // TODO: If at some point we decide to scalarize instructions having 4661 // loop-invariant operands, this special case will no longer be 4662 // required. 
We would add the scalarization decision to 4663 // collectLoopScalars() and teach getVectorValue() to broadcast 4664 // the lane-zero scalar value. 4665 auto *Clone = Builder.Insert(GEP->clone()); 4666 for (unsigned Part = 0; Part < UF; ++Part) { 4667 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4668 State.set(VPDef, EntryPart, Part); 4669 addMetadata(EntryPart, GEP); 4670 } 4671 } else { 4672 // If the GEP has at least one loop-varying operand, we are sure to 4673 // produce a vector of pointers. But if we are only unrolling, we want 4674 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4675 // produce with the code below will be scalar (if VF == 1) or vector 4676 // (otherwise). Note that for the unroll-only case, we still maintain 4677 // values in the vector mapping with initVector, as we do for other 4678 // instructions. 4679 for (unsigned Part = 0; Part < UF; ++Part) { 4680 // The pointer operand of the new GEP. If it's loop-invariant, we 4681 // won't broadcast it. 4682 auto *Ptr = IsPtrLoopInvariant 4683 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4684 : State.get(Operands.getOperand(0), Part); 4685 4686 // Collect all the indices for the new GEP. If any index is 4687 // loop-invariant, we won't broadcast it. 4688 SmallVector<Value *, 4> Indices; 4689 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4690 VPValue *Operand = Operands.getOperand(I); 4691 if (IsIndexLoopInvariant[I - 1]) 4692 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4693 else 4694 Indices.push_back(State.get(Operand, Part)); 4695 } 4696 4697 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4698 // but it should be a vector, otherwise. 4699 auto *NewGEP = 4700 GEP->isInBounds() 4701 ? 
Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4702 Indices) 4703 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4704 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4705 "NewGEP is not a pointer vector"); 4706 State.set(VPDef, NewGEP, Part); 4707 addMetadata(NewGEP, GEP); 4708 } 4709 } 4710 } 4711 4712 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4713 RecurrenceDescriptor *RdxDesc, 4714 VPWidenPHIRecipe *PhiR, 4715 VPTransformState &State) { 4716 PHINode *P = cast<PHINode>(PN); 4717 if (EnableVPlanNativePath) { 4718 // Currently we enter here in the VPlan-native path for non-induction 4719 // PHIs where all control flow is uniform. We simply widen these PHIs. 4720 // Create a vector phi with no operands - the vector phi operands will be 4721 // set at the end of vector code generation. 4722 Type *VecTy = (State.VF.isScalar()) 4723 ? PN->getType() 4724 : VectorType::get(PN->getType(), State.VF); 4725 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4726 State.set(PhiR, VecPhi, 0); 4727 OrigPHIsToFix.push_back(P); 4728 4729 return; 4730 } 4731 4732 assert(PN->getParent() == OrigLoop->getHeader() && 4733 "Non-header phis should have been handled elsewhere"); 4734 4735 VPValue *StartVPV = PhiR->getStartValue(); 4736 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr; 4737 // In order to support recurrences we need to be able to vectorize Phi nodes. 4738 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4739 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4740 // this value when we vectorize all of the instructions that use the PHI. 4741 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { 4742 Value *Iden = nullptr; 4743 bool ScalarPHI = 4744 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4745 Type *VecTy = 4746 ScalarPHI ? 
PN->getType() : VectorType::get(PN->getType(), State.VF); 4747 4748 if (RdxDesc) { 4749 assert(Legal->isReductionVariable(P) && StartV && 4750 "RdxDesc should only be set for reduction variables; in that case " 4751 "a StartV is also required"); 4752 RecurKind RK = RdxDesc->getRecurrenceKind(); 4753 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 4754 // MinMax reduction have the start value as their identify. 4755 if (ScalarPHI) { 4756 Iden = StartV; 4757 } else { 4758 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4759 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4760 StartV = Iden = 4761 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); 4762 } 4763 } else { 4764 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4765 RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags()); 4766 Iden = IdenC; 4767 4768 if (!ScalarPHI) { 4769 Iden = ConstantVector::getSplat(State.VF, IdenC); 4770 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4771 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4772 Constant *Zero = Builder.getInt32(0); 4773 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4774 } 4775 } 4776 } 4777 4778 bool IsOrdered = State.VF.isVector() && 4779 Cost->isInLoopReduction(cast<PHINode>(PN)) && 4780 useOrderedReductions(*RdxDesc); 4781 4782 for (unsigned Part = 0; Part < State.UF; ++Part) { 4783 // This is phase one of vectorizing PHIs. 4784 if (Part > 0 && IsOrdered) 4785 return; 4786 Value *EntryPart = PHINode::Create( 4787 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4788 State.set(PhiR, EntryPart, Part); 4789 if (StartV) { 4790 // Make sure to add the reduction start value only to the 4791 // first unroll part. 4792 Value *StartVal = (Part == 0) ? 
StartV : Iden; 4793 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4794 } 4795 } 4796 return; 4797 } 4798 4799 assert(!Legal->isReductionVariable(P) && 4800 "reductions should be handled above"); 4801 4802 setDebugLocFromInst(Builder, P); 4803 4804 // This PHINode must be an induction variable. 4805 // Make sure that we know about it. 4806 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4807 4808 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4809 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4810 4811 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4812 // which can be found from the original scalar operations. 4813 switch (II.getKind()) { 4814 case InductionDescriptor::IK_NoInduction: 4815 llvm_unreachable("Unknown induction"); 4816 case InductionDescriptor::IK_IntInduction: 4817 case InductionDescriptor::IK_FpInduction: 4818 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4819 case InductionDescriptor::IK_PtrInduction: { 4820 // Handle the pointer induction variable case. 4821 assert(P->getType()->isPointerTy() && "Unexpected type."); 4822 4823 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4824 // This is the normalized GEP that starts counting at zero. 4825 Value *PtrInd = 4826 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4827 // Determine the number of scalars we need to generate for each unroll 4828 // iteration. If the instruction is uniform, we only need to generate the 4829 // first lane. Otherwise, we generate all VF values. 4830 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4831 unsigned Lanes = IsUniform ? 
1 : State.VF.getKnownMinValue(); 4832 4833 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4834 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4835 if (NeedsVectorIndex) { 4836 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4837 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4838 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4839 } 4840 4841 for (unsigned Part = 0; Part < UF; ++Part) { 4842 Value *PartStart = createStepForVF( 4843 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4844 4845 if (NeedsVectorIndex) { 4846 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4847 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4848 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4849 Value *SclrGep = 4850 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4851 SclrGep->setName("next.gep"); 4852 State.set(PhiR, SclrGep, Part); 4853 // We've cached the whole vector, which means we can support the 4854 // extraction of any lane. 
4855 continue; 4856 } 4857 4858 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4859 Value *Idx = Builder.CreateAdd( 4860 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4861 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4862 Value *SclrGep = 4863 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4864 SclrGep->setName("next.gep"); 4865 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4866 } 4867 } 4868 return; 4869 } 4870 assert(isa<SCEVConstant>(II.getStep()) && 4871 "Induction step not a SCEV constant!"); 4872 Type *PhiType = II.getStep()->getType(); 4873 4874 // Build a pointer phi 4875 Value *ScalarStartValue = II.getStartValue(); 4876 Type *ScStValueType = ScalarStartValue->getType(); 4877 PHINode *NewPointerPhi = 4878 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4879 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4880 4881 // A pointer induction, performed by using a gep 4882 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4883 Instruction *InductionLoc = LoopLatch->getTerminator(); 4884 const SCEV *ScalarStep = II.getStep(); 4885 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4886 Value *ScalarStepValue = 4887 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4888 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4889 Value *NumUnrolledElems = 4890 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4891 Value *InductionGEP = GetElementPtrInst::Create( 4892 ScStValueType->getPointerElementType(), NewPointerPhi, 4893 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4894 InductionLoc); 4895 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4896 4897 // Create UF many actual address geps that use the pointer 4898 // phi as base and a vectorized version of the step value 4899 // (<step*0, ..., step*N>) as offset. 
4900 for (unsigned Part = 0; Part < State.UF; ++Part) { 4901 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4902 Value *StartOffsetScalar = 4903 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4904 Value *StartOffset = 4905 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4906 // Create a vector of consecutive numbers from zero to VF. 4907 StartOffset = 4908 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4909 4910 Value *GEP = Builder.CreateGEP( 4911 ScStValueType->getPointerElementType(), NewPointerPhi, 4912 Builder.CreateMul( 4913 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4914 "vector.gep")); 4915 State.set(PhiR, GEP, Part); 4916 } 4917 } 4918 } 4919 } 4920 4921 /// A helper function for checking whether an integer division-related 4922 /// instruction may divide by zero (in which case it must be predicated if 4923 /// executed conditionally in the scalar code). 4924 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4925 /// Non-zero divisors that are non compile-time constants will not be 4926 /// converted into multiplication, so we will still end up scalarizing 4927 /// the division, but can do so w/o predication. 
4928 static bool mayDivideByZero(Instruction &I) { 4929 assert((I.getOpcode() == Instruction::UDiv || 4930 I.getOpcode() == Instruction::SDiv || 4931 I.getOpcode() == Instruction::URem || 4932 I.getOpcode() == Instruction::SRem) && 4933 "Unexpected instruction"); 4934 Value *Divisor = I.getOperand(1); 4935 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4936 return !CInt || CInt->isZero(); 4937 } 4938 4939 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4940 VPUser &User, 4941 VPTransformState &State) { 4942 switch (I.getOpcode()) { 4943 case Instruction::Call: 4944 case Instruction::Br: 4945 case Instruction::PHI: 4946 case Instruction::GetElementPtr: 4947 case Instruction::Select: 4948 llvm_unreachable("This instruction is handled by a different recipe."); 4949 case Instruction::UDiv: 4950 case Instruction::SDiv: 4951 case Instruction::SRem: 4952 case Instruction::URem: 4953 case Instruction::Add: 4954 case Instruction::FAdd: 4955 case Instruction::Sub: 4956 case Instruction::FSub: 4957 case Instruction::FNeg: 4958 case Instruction::Mul: 4959 case Instruction::FMul: 4960 case Instruction::FDiv: 4961 case Instruction::FRem: 4962 case Instruction::Shl: 4963 case Instruction::LShr: 4964 case Instruction::AShr: 4965 case Instruction::And: 4966 case Instruction::Or: 4967 case Instruction::Xor: { 4968 // Just widen unops and binops. 4969 setDebugLocFromInst(Builder, &I); 4970 4971 for (unsigned Part = 0; Part < UF; ++Part) { 4972 SmallVector<Value *, 2> Ops; 4973 for (VPValue *VPOp : User.operands()) 4974 Ops.push_back(State.get(VPOp, Part)); 4975 4976 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4977 4978 if (auto *VecOp = dyn_cast<Instruction>(V)) 4979 VecOp->copyIRFlags(&I); 4980 4981 // Use this vector value for all users of the original instruction. 4982 State.set(Def, V, Part); 4983 addMetadata(V, &I); 4984 } 4985 4986 break; 4987 } 4988 case Instruction::ICmp: 4989 case Instruction::FCmp: { 4990 // Widen compares. 
Generate vector compares. 4991 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4992 auto *Cmp = cast<CmpInst>(&I); 4993 setDebugLocFromInst(Builder, Cmp); 4994 for (unsigned Part = 0; Part < UF; ++Part) { 4995 Value *A = State.get(User.getOperand(0), Part); 4996 Value *B = State.get(User.getOperand(1), Part); 4997 Value *C = nullptr; 4998 if (FCmp) { 4999 // Propagate fast math flags. 5000 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 5001 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 5002 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 5003 } else { 5004 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 5005 } 5006 State.set(Def, C, Part); 5007 addMetadata(C, &I); 5008 } 5009 5010 break; 5011 } 5012 5013 case Instruction::ZExt: 5014 case Instruction::SExt: 5015 case Instruction::FPToUI: 5016 case Instruction::FPToSI: 5017 case Instruction::FPExt: 5018 case Instruction::PtrToInt: 5019 case Instruction::IntToPtr: 5020 case Instruction::SIToFP: 5021 case Instruction::UIToFP: 5022 case Instruction::Trunc: 5023 case Instruction::FPTrunc: 5024 case Instruction::BitCast: { 5025 auto *CI = cast<CastInst>(&I); 5026 setDebugLocFromInst(Builder, CI); 5027 5028 /// Vectorize casts. 5029 Type *DestTy = 5030 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 5031 5032 for (unsigned Part = 0; Part < UF; ++Part) { 5033 Value *A = State.get(User.getOperand(0), Part); 5034 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 5035 State.set(Def, Cast, Part); 5036 addMetadata(Cast, &I); 5037 } 5038 break; 5039 } 5040 default: 5041 // This instruction is not vectorized by simple widening. 5042 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 5043 llvm_unreachable("Unhandled instruction!"); 5044 } // end of switch. 
5045 } 5046 5047 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 5048 VPUser &ArgOperands, 5049 VPTransformState &State) { 5050 assert(!isa<DbgInfoIntrinsic>(I) && 5051 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 5052 setDebugLocFromInst(Builder, &I); 5053 5054 Module *M = I.getParent()->getParent()->getParent(); 5055 auto *CI = cast<CallInst>(&I); 5056 5057 SmallVector<Type *, 4> Tys; 5058 for (Value *ArgOperand : CI->arg_operands()) 5059 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 5060 5061 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 5062 5063 // The flag shows whether we use Intrinsic or a usual Call for vectorized 5064 // version of the instruction. 5065 // Is it beneficial to perform intrinsic call compared to lib call? 5066 bool NeedToScalarize = false; 5067 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 5068 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 5069 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 5070 assert((UseVectorIntrinsic || !NeedToScalarize) && 5071 "Instruction should be scalarized elsewhere."); 5072 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 5073 "Either the intrinsic cost or vector call cost must be valid"); 5074 5075 for (unsigned Part = 0; Part < UF; ++Part) { 5076 SmallVector<Value *, 4> Args; 5077 for (auto &I : enumerate(ArgOperands.operands())) { 5078 // Some intrinsics have a scalar argument - don't replace it with a 5079 // vector. 5080 Value *Arg; 5081 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5082 Arg = State.get(I.value(), Part); 5083 else 5084 Arg = State.get(I.value(), VPIteration(0, 0)); 5085 Args.push_back(Arg); 5086 } 5087 5088 Function *VectorF; 5089 if (UseVectorIntrinsic) { 5090 // Use vector version of the intrinsic. 
5091 Type *TysForDecl[] = {CI->getType()}; 5092 if (VF.isVector()) 5093 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5094 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5095 assert(VectorF && "Can't retrieve vector intrinsic."); 5096 } else { 5097 // Use vector version of the function call. 5098 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5099 #ifndef NDEBUG 5100 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5101 "Can't create vector function."); 5102 #endif 5103 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5104 } 5105 SmallVector<OperandBundleDef, 1> OpBundles; 5106 CI->getOperandBundlesAsDefs(OpBundles); 5107 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5108 5109 if (isa<FPMathOperator>(V)) 5110 V->copyFastMathFlags(CI); 5111 5112 State.set(Def, V, Part); 5113 addMetadata(V, &I); 5114 } 5115 } 5116 5117 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5118 VPUser &Operands, 5119 bool InvariantCond, 5120 VPTransformState &State) { 5121 setDebugLocFromInst(Builder, &I); 5122 5123 // The condition can be loop invariant but still defined inside the 5124 // loop. This means that we can't just use the original 'cond' value. 5125 // We have to take the 'vectorized' value and pick the first lane. 5126 // Instcombine will make this a no-op. 5127 auto *InvarCond = InvariantCond 5128 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5129 : nullptr; 5130 5131 for (unsigned Part = 0; Part < UF; ++Part) { 5132 Value *Cond = 5133 InvarCond ? 
InvarCond : State.get(Operands.getOperand(0), Part); 5134 Value *Op0 = State.get(Operands.getOperand(1), Part); 5135 Value *Op1 = State.get(Operands.getOperand(2), Part); 5136 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5137 State.set(VPDef, Sel, Part); 5138 addMetadata(Sel, &I); 5139 } 5140 } 5141 5142 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5143 // We should not collect Scalars more than once per VF. Right now, this 5144 // function is called from collectUniformsAndScalars(), which already does 5145 // this check. Collecting Scalars for VF=1 does not make any sense. 5146 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5147 "This function should not be visited twice for the same VF"); 5148 5149 SmallSetVector<Instruction *, 8> Worklist; 5150 5151 // These sets are used to seed the analysis with pointers used by memory 5152 // accesses that will remain scalar. 5153 SmallSetVector<Instruction *, 8> ScalarPtrs; 5154 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5155 auto *Latch = TheLoop->getLoopLatch(); 5156 5157 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5158 // The pointer operands of loads and stores will be scalar as long as the 5159 // memory access is not a gather or scatter operation. The value operand of a 5160 // store will remain scalar if the store is scalarized. 
5161 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5162 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5163 assert(WideningDecision != CM_Unknown && 5164 "Widening decision should be ready at this moment"); 5165 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5166 if (Ptr == Store->getValueOperand()) 5167 return WideningDecision == CM_Scalarize; 5168 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5169 "Ptr is neither a value or pointer operand"); 5170 return WideningDecision != CM_GatherScatter; 5171 }; 5172 5173 // A helper that returns true if the given value is a bitcast or 5174 // getelementptr instruction contained in the loop. 5175 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5176 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5177 isa<GetElementPtrInst>(V)) && 5178 !TheLoop->isLoopInvariant(V); 5179 }; 5180 5181 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5182 if (!isa<PHINode>(Ptr) || 5183 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5184 return false; 5185 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5186 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5187 return false; 5188 return isScalarUse(MemAccess, Ptr); 5189 }; 5190 5191 // A helper that evaluates a memory access's use of a pointer. If the 5192 // pointer is actually the pointer induction of a loop, it is being 5193 // inserted into Worklist. If the use will be a scalar use, and the 5194 // pointer is only used by memory accesses, we place the pointer in 5195 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 
5196 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5197 if (isScalarPtrInduction(MemAccess, Ptr)) { 5198 Worklist.insert(cast<Instruction>(Ptr)); 5199 Instruction *Update = cast<Instruction>( 5200 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5201 Worklist.insert(Update); 5202 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5203 << "\n"); 5204 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5205 << "\n"); 5206 return; 5207 } 5208 // We only care about bitcast and getelementptr instructions contained in 5209 // the loop. 5210 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5211 return; 5212 5213 // If the pointer has already been identified as scalar (e.g., if it was 5214 // also identified as uniform), there's nothing to do. 5215 auto *I = cast<Instruction>(Ptr); 5216 if (Worklist.count(I)) 5217 return; 5218 5219 // If the use of the pointer will be a scalar use, and all users of the 5220 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5221 // place the pointer in PossibleNonScalarPtrs. 5222 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5223 return isa<LoadInst>(U) || isa<StoreInst>(U); 5224 })) 5225 ScalarPtrs.insert(I); 5226 else 5227 PossibleNonScalarPtrs.insert(I); 5228 }; 5229 5230 // We seed the scalars analysis with three classes of instructions: (1) 5231 // instructions marked uniform-after-vectorization and (2) bitcast, 5232 // getelementptr and (pointer) phi instructions used by memory accesses 5233 // requiring a scalar use. 5234 // 5235 // (1) Add to the worklist all instructions that have been identified as 5236 // uniform-after-vectorization. 5237 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5238 5239 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5240 // memory accesses requiring a scalar use. 
The pointer operands of loads and 5241 // stores will be scalar as long as the memory accesses is not a gather or 5242 // scatter operation. The value operand of a store will remain scalar if the 5243 // store is scalarized. 5244 for (auto *BB : TheLoop->blocks()) 5245 for (auto &I : *BB) { 5246 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5247 evaluatePtrUse(Load, Load->getPointerOperand()); 5248 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5249 evaluatePtrUse(Store, Store->getPointerOperand()); 5250 evaluatePtrUse(Store, Store->getValueOperand()); 5251 } 5252 } 5253 for (auto *I : ScalarPtrs) 5254 if (!PossibleNonScalarPtrs.count(I)) { 5255 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5256 Worklist.insert(I); 5257 } 5258 5259 // Insert the forced scalars. 5260 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5261 // induction variable when the PHI user is scalarized. 5262 auto ForcedScalar = ForcedScalars.find(VF); 5263 if (ForcedScalar != ForcedScalars.end()) 5264 for (auto *I : ForcedScalar->second) 5265 Worklist.insert(I); 5266 5267 // Expand the worklist by looking through any bitcasts and getelementptr 5268 // instructions we've already identified as scalar. This is similar to the 5269 // expansion step in collectLoopUniforms(); however, here we're only 5270 // expanding to include additional bitcasts and getelementptr instructions. 
5271 unsigned Idx = 0; 5272 while (Idx != Worklist.size()) { 5273 Instruction *Dst = Worklist[Idx++]; 5274 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5275 continue; 5276 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5277 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5278 auto *J = cast<Instruction>(U); 5279 return !TheLoop->contains(J) || Worklist.count(J) || 5280 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5281 isScalarUse(J, Src)); 5282 })) { 5283 Worklist.insert(Src); 5284 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5285 } 5286 } 5287 5288 // An induction variable will remain scalar if all users of the induction 5289 // variable and induction variable update remain scalar. 5290 for (auto &Induction : Legal->getInductionVars()) { 5291 auto *Ind = Induction.first; 5292 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5293 5294 // If tail-folding is applied, the primary induction variable will be used 5295 // to feed a vector compare. 5296 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5297 continue; 5298 5299 // Determine if all users of the induction variable are scalar after 5300 // vectorization. 5301 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5302 auto *I = cast<Instruction>(U); 5303 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5304 }); 5305 if (!ScalarInd) 5306 continue; 5307 5308 // Determine if all users of the induction variable update instruction are 5309 // scalar after vectorization. 5310 auto ScalarIndUpdate = 5311 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5312 auto *I = cast<Instruction>(U); 5313 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5314 }); 5315 if (!ScalarIndUpdate) 5316 continue; 5317 5318 // The induction variable and its update instruction will remain scalar. 
5319 Worklist.insert(Ind); 5320 Worklist.insert(IndUpdate); 5321 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5322 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5323 << "\n"); 5324 } 5325 5326 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5327 } 5328 5329 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5330 if (!blockNeedsPredication(I->getParent())) 5331 return false; 5332 switch(I->getOpcode()) { 5333 default: 5334 break; 5335 case Instruction::Load: 5336 case Instruction::Store: { 5337 if (!Legal->isMaskRequired(I)) 5338 return false; 5339 auto *Ptr = getLoadStorePointerOperand(I); 5340 auto *Ty = getMemInstValueType(I); 5341 const Align Alignment = getLoadStoreAlignment(I); 5342 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5343 isLegalMaskedGather(Ty, Alignment)) 5344 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5345 isLegalMaskedScatter(Ty, Alignment)); 5346 } 5347 case Instruction::UDiv: 5348 case Instruction::SDiv: 5349 case Instruction::SRem: 5350 case Instruction::URem: 5351 return mayDivideByZero(*I); 5352 } 5353 return false; 5354 } 5355 5356 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5357 Instruction *I, ElementCount VF) { 5358 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5359 assert(getWideningDecision(I, VF) == CM_Unknown && 5360 "Decision should not be set yet."); 5361 auto *Group = getInterleavedAccessGroup(I); 5362 assert(Group && "Must have a group."); 5363 5364 // If the instruction's allocated size doesn't equal it's type size, it 5365 // requires padding and will be scalarized. 5366 auto &DL = I->getModule()->getDataLayout(); 5367 auto *ScalarTy = getMemInstValueType(I); 5368 if (hasIrregularType(ScalarTy, DL)) 5369 return false; 5370 5371 // Check if masking is required. 
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  // The group needs a mask, so it is only widenable if the target has a legal
  // masked form of the wide load/store.
  auto *Ty = getMemInstValueType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}

// Returns true if the load/store \p I can be turned into a single wide
// consecutive vector access at factor \p VF: the pointer must be consecutive,
// the access must not need scalarization with predication, and the value type
// must have a regular (unpadded) in-memory layout.
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
    Instruction *I, ElementCount VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL))
    return false;

  return true;
}

// Collects, per VF, the instructions that only demand their first lane after
// vectorization ("uniform" in the lane-0 sense documented below) into
// Uniforms[VF].
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
5452 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5453 if (isOutOfScope(I)) { 5454 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5455 << *I << "\n"); 5456 return; 5457 } 5458 if (isScalarWithPredication(I)) { 5459 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5460 << *I << "\n"); 5461 return; 5462 } 5463 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5464 Worklist.insert(I); 5465 }; 5466 5467 // Start with the conditional branch. If the branch condition is an 5468 // instruction contained in the loop that is only used by the branch, it is 5469 // uniform. 5470 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5471 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5472 addToWorklistIfAllowed(Cmp); 5473 5474 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5475 InstWidening WideningDecision = getWideningDecision(I, VF); 5476 assert(WideningDecision != CM_Unknown && 5477 "Widening decision should be ready at this moment"); 5478 5479 // A uniform memory op is itself uniform. We exclude uniform stores 5480 // here as they demand the last lane, not the first one. 5481 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5482 assert(WideningDecision == CM_Scalarize); 5483 return true; 5484 } 5485 5486 return (WideningDecision == CM_Widen || 5487 WideningDecision == CM_Widen_Reverse || 5488 WideningDecision == CM_Interleave); 5489 }; 5490 5491 5492 // Returns true if Ptr is the pointer operand of a memory access instruction 5493 // I, and I is known to not require scalarization. 5494 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5495 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5496 }; 5497 5498 // Holds a list of values which are known to have at least one uniform use. 5499 // Note that there may be other uses which aren't uniform. 
A "uniform use" 5500 // here is something which only demands lane 0 of the unrolled iterations; 5501 // it does not imply that all lanes produce the same value (e.g. this is not 5502 // the usual meaning of uniform) 5503 SetVector<Value *> HasUniformUse; 5504 5505 // Scan the loop for instructions which are either a) known to have only 5506 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5507 for (auto *BB : TheLoop->blocks()) 5508 for (auto &I : *BB) { 5509 // If there's no pointer operand, there's nothing to do. 5510 auto *Ptr = getLoadStorePointerOperand(&I); 5511 if (!Ptr) 5512 continue; 5513 5514 // A uniform memory op is itself uniform. We exclude uniform stores 5515 // here as they demand the last lane, not the first one. 5516 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5517 addToWorklistIfAllowed(&I); 5518 5519 if (isUniformDecision(&I, VF)) { 5520 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5521 HasUniformUse.insert(Ptr); 5522 } 5523 } 5524 5525 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5526 // demanding) users. Since loops are assumed to be in LCSSA form, this 5527 // disallows uses outside the loop as well. 5528 for (auto *V : HasUniformUse) { 5529 if (isOutOfScope(V)) 5530 continue; 5531 auto *I = cast<Instruction>(V); 5532 auto UsersAreMemAccesses = 5533 llvm::all_of(I->users(), [&](User *U) -> bool { 5534 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5535 }); 5536 if (UsersAreMemAccesses) 5537 addToWorklistIfAllowed(I); 5538 } 5539 5540 // Expand Worklist in topological order: whenever a new instruction 5541 // is added , its users should be already inside Worklist. It ensures 5542 // a uniform instruction will only be used by uniform instructions. 
5543 unsigned idx = 0; 5544 while (idx != Worklist.size()) { 5545 Instruction *I = Worklist[idx++]; 5546 5547 for (auto OV : I->operand_values()) { 5548 // isOutOfScope operands cannot be uniform instructions. 5549 if (isOutOfScope(OV)) 5550 continue; 5551 // First order recurrence Phi's should typically be considered 5552 // non-uniform. 5553 auto *OP = dyn_cast<PHINode>(OV); 5554 if (OP && Legal->isFirstOrderRecurrence(OP)) 5555 continue; 5556 // If all the users of the operand are uniform, then add the 5557 // operand into the uniform worklist. 5558 auto *OI = cast<Instruction>(OV); 5559 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5560 auto *J = cast<Instruction>(U); 5561 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5562 })) 5563 addToWorklistIfAllowed(OI); 5564 } 5565 } 5566 5567 // For an instruction to be added into Worklist above, all its users inside 5568 // the loop should also be in Worklist. However, this condition cannot be 5569 // true for phi nodes that form a cyclic dependence. We must process phi 5570 // nodes separately. An induction variable will remain uniform if all users 5571 // of the induction variable and induction variable update remain uniform. 5572 // The code below handles both pointer and non-pointer induction variables. 5573 for (auto &Induction : Legal->getInductionVars()) { 5574 auto *Ind = Induction.first; 5575 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5576 5577 // Determine if all users of the induction variable are uniform after 5578 // vectorization. 5579 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5580 auto *I = cast<Instruction>(U); 5581 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5582 isVectorizedMemAccessUse(I, Ind); 5583 }); 5584 if (!UniformInd) 5585 continue; 5586 5587 // Determine if all users of the induction variable update instruction are 5588 // uniform after vectorization. 
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}

// Returns true if vectorizing this loop would require runtime guards (pointer
// aliasing checks, SCEV predicate checks, or stride==1 specialization) --
// used to reject vectorization when optimizing for size.
bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

// Computes the maximum scalable VF that is legal for this loop and target, or
// a zero scalable VF when scalable vectorization is not feasible at all
// (unsupported target, unsupported reductions, or a too-small safe distance).
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
    reportVectorizationInfo(
        "Disabling scalable vectorization, because target does not "
        "support scalable vectors.",
        "ScalableVectorsUnsupported", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // Start from "unbounded" and clamp below.
  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());

  // Disable scalable vectorization if the loop contains unsupported reductions.
  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.
  if (!canVectorizeReductions(MaxScalableVF)) {
    reportVectorizationInfo(
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  // Limit MaxScalableVF by the maximum safe dependence distance.
  Optional<unsigned> MaxVScale = TTI.getMaxVScale();
  MaxScalableVF = ElementCount::getScalable(
      MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
  if (!MaxScalableVF)
    reportVectorizationInfo(
        "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}

// Computes the maximum feasible fixed and scalable VFs from the dependence
// distance, the widest type in the loop and the target's register width,
// honoring (or diagnosing and ignoring/clamping) a user-specified VF.
FixedScalableVFPair
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                 ElementCount UserVF) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
      return UserVF;

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // Scalable user VF exceeding the safe bound: ignore the hint entirely.
    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                      << " is unsafe. Ignoring scalable UserVF.\n");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                        TheLoop->getStartLoc(),
                                        TheLoop->getHeader())
             << "User-specified vectorization factor "
             << ore::NV("UserVectorizationFactor", UserVF)
             << " is unsafe. Ignoring the hint to let the compiler pick a "
                "suitable VF.";
    });
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
                                           WidestType, MaxSafeFixedVF))
    Result.FixedVF = MaxVF;

  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
                                           WidestType, MaxSafeScalableVF))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}

// Top-level maximum-VF computation: handles bail-outs (divergent targets,
// single-iteration loops), the scalar-epilogue policy, and the decision
// whether the tail can be folded by masking or must be left as an epilogue.
FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue, are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    // With a user-forced interleave count, a vector iteration consumes
    // VF * IC scalar iterations, so check the remainder against that.
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return FixedScalableVFPair::getNone();
}

// Picks the largest VF (fixed or scalable, depending on MaxSafeVF) that fits
// the target's vector registers, clamped to the constant trip count when
// profitable, and optionally widened further when maximizing bandwidth.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
    const ElementCount &MaxSafeVF) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below. If
    // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
    // the TC is less than or equal to the known number of lanes.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    return TripCountEC;
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (TTI.shouldMaximizeVectorBandwidth() ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // The bandwidth-maximizing bound uses the *smallest* type so more lanes
    // of narrow elements fit in a register.
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    if (ElementCount MinVF =
            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}

// Compares two (VF, cost) candidates; returns true if A is the better choice.
bool LoopVectorizationCostModel::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {
  InstructionCost::CostType CostA = *A.Cost.getValue();
  InstructionCost::CostType CostB = *B.Cost.getValue();

  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);

  if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
      MaxTripCount) {
    // If we are folding the tail and the trip count is a known (possibly small)
    // constant, the trip count will be rounded up to an integer number of
    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
    // which we compare directly. When not folding the tail, the total cost will
    // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the tripcount
    // as here.
    int64_t RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
    int64_t RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
    return RTCostA < RTCostB;
  }

  // To avoid the need for FP division:
  //      (CostA / A.Width) < (CostB / B.Width)
  // <=>  (CostA * B.Width) < (CostB * A.Width)
  return (CostA * B.Width.getKnownMinValue()) <
         (CostB * A.Width.getKnownMinValue());
}

// Picks the most profitable fixed VF in [2, MaxVF] by comparing expected
// per-iteration costs against the scalar loop (also recording all profitable
// candidates in ProfitableVFs for later epilogue-VF selection).
VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
  // FIXME: This can be fixed for scalable vectors later, because at this stage
  // the LoopVectorizer will only consider vectorizing a loop with scalable
  // vectors when the loop has a hint to enable vectorization for a given VF.
  assert(!MaxVF.isScalable() && "scalable vectors not yet supported");

  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF.isVector()) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
  }

  for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
       i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(i);

    assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
    VectorizationFactor Candidate(i, C.first);
    LLVM_DEBUG(
        dbgs() << "LV: Vector loop of width " << i << " costs: "
               << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue())
               << ".\n");

    // C.second is false when the VF produces no vector instructions at all.
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable add it to ProfitableVF list.
    if (isMoreProfitable(Candidate, ScalarCost))
      ProfitableVFs.push_back(Candidate);

    if (isMoreProfitable(Candidate, ChosenFactor))
      ChosenFactor = Candidate;
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue())
                 dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}

// Returns true if loop \p L is free of the constructs the epilogue-vectorizer
// cannot yet handle (cross-iteration phis, loop-escaping induction uses,
// widened inductions).
bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
    const Loop &L, ElementCount VF) const {
  // Cross iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
        return Legal->isFirstOrderRecurrence(&Phi) ||
               Legal->isReductionVariable(&Phi);
      }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
6101 for (auto &Entry : Legal->getInductionVars()) { 6102 // Look for uses of the value of the induction at the last iteration. 6103 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6104 for (User *U : PostInc->users()) 6105 if (!L.contains(cast<Instruction>(U))) 6106 return false; 6107 // Look for uses of penultimate value of the induction. 6108 for (User *U : Entry.first->users()) 6109 if (!L.contains(cast<Instruction>(U))) 6110 return false; 6111 } 6112 6113 // Induction variables that are widened require special handling that is 6114 // currently not supported. 6115 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6116 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6117 this->isProfitableToScalarize(Entry.first, VF)); 6118 })) 6119 return false; 6120 6121 return true; 6122 } 6123 6124 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6125 const ElementCount VF) const { 6126 // FIXME: We need a much better cost-model to take different parameters such 6127 // as register pressure, code size increase and cost of extra branches into 6128 // account. For now we apply a very crude heuristic and only consider loops 6129 // with vectorization factors larger than a certain value. 6130 // We also consider epilogue vectorization unprofitable for targets that don't 6131 // consider interleaving beneficial (eg. MVE). 
  // Targets reporting a max interleave factor of 1 are assumed to get no
  // benefit from the extra loop structure an epilogue implies.
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  // Only consider epilogues once the main VF reaches the configured minimum.
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}

/// Pick a vectorization factor for the epilogue loop given the chosen
/// \p MainLoopVF and the planner \p LVP (used to confirm a VPlan exists for
/// each candidate pairing). Returns VectorizationFactor::Disabled() when no
/// epilogue vectorization should happen.
VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // FIXME: This can be fixed for scalable vectors later, because at this stage
  // the LoopVectorizer will only consider vectorizing a loop with scalable
  // vectors when the loop has a hint to enable vectorization for a given VF.
  if (MainLoopVF.isScalable()) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
                         "yet supported.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  // A user-forced epilogue VF bypasses the profitability heuristics below,
  // but still requires a matching VPlan to exist.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    if (LVP.hasPlanWithVFs(
            {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
      return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
    else {
      LLVM_DEBUG(
          dbgs()
          << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
        << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  if (!isEpilogueVectorizationProfitable(MainLoopVF))
    return Result;

  // Scan the VFs recorded as profitable for the main loop, keeping the most
  // profitable one that is strictly narrower than MainLoopVF and has a plan.
  for (auto &NextVF : ProfitableVFs)
    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
        (Result.Width.getFixedValue() == 1 ||
         isMoreProfitable(NextVF, Result)) &&
        LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
      Result = NextVF;

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width.getFixedValue() << "\n";);
  return Result;
}

/// Walk the loop and return the smallest and widest in-memory scalar type
/// widths (in bits) among the loads, stores and reduction phis considered
/// relevant for choosing a vectorization factor.
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  // MinWidth starts at the maximum unsigned value so any observed width
  // lowers it; MaxWidth starts at 8 (a one-byte floor).
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
        // In-loop reductions keep the scalar recurrence, so they don't
        // contribute a widened type here.
        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
            TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                      RdxDesc.getRecurrenceType(),
                                      TargetTransformInfo::ReductionFlags()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    }
  }

  return {MinWidth, MaxWidth};
}

/// Choose how many times to interleave (unroll with interleaved iterations)
/// the loop for vectorization factor \p VF. \p LoopCost is the precomputed
/// loop cost, or 0 if it must be computed here. Returns an interleave count
/// of at least 1 (1 meaning "do not interleave").
unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions(HasReductions && VF = 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto& pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  // Take the minimum fit over every register class the loop uses.
  for (auto& pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first) << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount =
      TTI.getMaxInterleaveFactor(VF.getKnownMinValue());

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If trip count is known or estimated compile time constant, limit the
  // interleave count to be less than the trip count divided by VF, provided it
  // is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iterations is enabled. However, for larger loops, there is likely to be a
  // similar benefit as for fixed-width vectors. For now, we choose to leave
  // the InterleaveCount as if vscale is '1', although if some information about
  // the vector is known (e.g. min vector size), we can make a better decision.
  if (BestKnownTC) {
    MaxInterleaveCount =
        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
    // Make sure MaxInterleaveCount is greater than 0.
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
    LoopCost = *expectedCost(VF).first.getValue();
  }

  assert(LoopCost && "Non-zero loop cost expected");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default to 2, so the
    // critical path only gets increased by one reduction operation.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

/// Estimate per-register-class register pressure for each candidate factor in
/// \p VFs, returning one RegisterUsage entry per factor (max simultaneous
/// in-loop live values plus loop-invariant usage).
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  // Token types and types that cannot be vector elements consume no registers
  // here.
  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
      return 0U;
    return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I))
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of live intervals.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j].isScalar()) {
        // Scalar VF: every open value occupies one register of its class.
        for (auto Inst : OpenIntervals) {
          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          if (RegUsage.find(ClassID) == RegUsage.end())
            RegUsage[ClassID] = 1;
          else
            RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            // Values that stay scalar count as one scalar-class register.
            unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = 1;
            else
              RegUsage[ClassID] += 1;
          } else {
            // Widened values count via the target's register-usage query.
            unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
            else
              RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      for (auto& pair : RegUsage) {
        if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
        else
          MaxUsages[j][pair.first] = pair.second;
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

/// Returns true when \p I is an emulated masked memory access whose cost
/// should be artificially inflated (see the TODO below).
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) &&
         "Expecting a scalar emulated instruction");
  // Loads always take the hack; stores only once enough predicated stores
  // accumulate to exceed the configured threshold.
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

/// Populate InstsToScalarize[\p VF] with predicated instructions (and their
/// feeding chains) that are cheaper scalarized than vectorized. Idempotent
/// per VF: presence of VF as a key marks the analysis as done.
void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

/// Compute the cost discount (vector cost minus scaled scalar cost) of
/// scalarizing \p PredInst and the single-use chain feeding it, for factor
/// \p VF. Per-instruction scalar costs of the chain are recorded into
/// \p ScalarCosts. A non-negative result means scalarizing is no worse than
/// vectorizing.
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    InstructionCost ScalarCost =
        VF.getKnownMinValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      ScalarCost +=
          VF.getKnownMinValue() *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          assert(!VF.isScalable() && "scalable vectors not yet supported.");
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return *Discount.getValue();
}

/// Estimate the total cost of the loop at factor \p VF by summing
/// per-instruction costs over every block. The returned pair's second member
/// records whether any instruction would produce vector code.
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(ElementCount VF) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.count(&I) ||
          (VF.isVector() && VecValuesToIgnore.count(&I)))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = InstructionCost(ForceTargetInstructionCost);

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as to not include all blocks in tail folded loops.
    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}

/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(
              Value *Ptr,
              LoopVectorizationLegality *Legal,
              PredicatedScalarEvolution &PSE,
              const Loop *TheLoop) {

  // Only GEP-based addresses are analyzed; anything else yields no SCEV.
  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}

/// Returns true if either operand of the binary instruction \p I is a known
/// symbolic stride tracked by the legality analysis.
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}

/// Cost of executing memory instruction \p I as VF scalar operations inside a
/// vectorized loop (address computation, per-lane memory ops, scalarization
/// overhead, and predication overhead where applicable). Invalid for
/// scalable VFs.
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getMemInstValueType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS, TTI::TCK_RecipThroughput);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch
    auto *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
    Cost += TTI.getScalarizationOverhead(
        Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true);
    Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}

/// Cost of a consecutive (stride +1 or -1) wide load/store at factor \p VF,
/// including the mask cost when a mask is required and a reverse-shuffle cost
/// for negative strides.
InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                    ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  const Align Alignment = getLoadStoreAlignment(I);
  InstructionCost Cost = 0;
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                      CostKind);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                CostKind, I);

  // Stride -1 accesses additionally pay for reversing the loaded/stored
  // vector.
  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost +=
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  return Cost;
}

/// Cost of a memory operation on a loop-invariant (uniform) address at factor
/// \p VF: a uniform load is costed as scalar load plus broadcast.
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                ElementCount VF) {
  assert(Legal->isUniformMemOp(*I));

  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                               CostKind) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return
TTI.getAddressComputationCost(ValTy) + 7023 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7024 CostKind) + 7025 (isLoopInvariantStoreValue 7026 ? 0 7027 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7028 VF.getKnownMinValue() - 1)); 7029 } 7030 7031 InstructionCost 7032 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7033 ElementCount VF) { 7034 Type *ValTy = getMemInstValueType(I); 7035 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7036 const Align Alignment = getLoadStoreAlignment(I); 7037 const Value *Ptr = getLoadStorePointerOperand(I); 7038 7039 return TTI.getAddressComputationCost(VectorTy) + 7040 TTI.getGatherScatterOpCost( 7041 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7042 TargetTransformInfo::TCK_RecipThroughput, I); 7043 } 7044 7045 InstructionCost 7046 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7047 ElementCount VF) { 7048 // TODO: Once we have support for interleaving with scalable vectors 7049 // we can calculate the cost properly here. 7050 if (VF.isScalable()) 7051 return InstructionCost::getInvalid(); 7052 7053 Type *ValTy = getMemInstValueType(I); 7054 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7055 unsigned AS = getLoadStoreAddressSpace(I); 7056 7057 auto Group = getInterleavedAccessGroup(I); 7058 assert(Group && "Fail to get an interleaved access group."); 7059 7060 unsigned InterleaveFactor = Group->getFactor(); 7061 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7062 7063 // Holds the indices of existing members in an interleaved load group. 7064 // An interleaved store group doesn't need this as it doesn't allow gaps. 7065 SmallVector<unsigned, 4> Indices; 7066 if (isa<LoadInst>(I)) { 7067 for (unsigned i = 0; i < InterleaveFactor; i++) 7068 if (Group->getMember(i)) 7069 Indices.push_back(i); 7070 } 7071 7072 // Calculate the cost of the whole interleaved group. 
7073 bool UseMaskForGaps = 7074 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 7075 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7076 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7077 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7078 7079 if (Group->isReverse()) { 7080 // TODO: Add support for reversed masked interleaved access. 7081 assert(!Legal->isMaskRequired(I) && 7082 "Reverse masked interleaved access not supported."); 7083 Cost += 7084 Group->getNumMembers() * 7085 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7086 } 7087 return Cost; 7088 } 7089 7090 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 7091 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7092 // Early exit for no inloop reductions 7093 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7094 return InstructionCost::getInvalid(); 7095 auto *VectorTy = cast<VectorType>(Ty); 7096 7097 // We are looking for a pattern of, and finding the minimal acceptable cost: 7098 // reduce(mul(ext(A), ext(B))) or 7099 // reduce(mul(A, B)) or 7100 // reduce(ext(A)) or 7101 // reduce(A). 7102 // The basic idea is that we walk down the tree to do that, finding the root 7103 // reduction instruction in InLoopReductionImmediateChains. From there we find 7104 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7105 // of the components. If the reduction cost is lower then we return it for the 7106 // reduction instruction and 0 for the other instructions in the pattern. If 7107 // it is not we return an invalid cost specifying the orignal cost method 7108 // should be used. 
7109 Instruction *RetI = I; 7110 if ((RetI->getOpcode() == Instruction::SExt || 7111 RetI->getOpcode() == Instruction::ZExt)) { 7112 if (!RetI->hasOneUser()) 7113 return InstructionCost::getInvalid(); 7114 RetI = RetI->user_back(); 7115 } 7116 if (RetI->getOpcode() == Instruction::Mul && 7117 RetI->user_back()->getOpcode() == Instruction::Add) { 7118 if (!RetI->hasOneUser()) 7119 return InstructionCost::getInvalid(); 7120 RetI = RetI->user_back(); 7121 } 7122 7123 // Test if the found instruction is a reduction, and if not return an invalid 7124 // cost specifying the parent to use the original cost modelling. 7125 if (!InLoopReductionImmediateChains.count(RetI)) 7126 return InstructionCost::getInvalid(); 7127 7128 // Find the reduction this chain is a part of and calculate the basic cost of 7129 // the reduction on its own. 7130 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7131 Instruction *ReductionPhi = LastChain; 7132 while (!isa<PHINode>(ReductionPhi)) 7133 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7134 7135 RecurrenceDescriptor RdxDesc = 7136 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7137 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7138 RdxDesc.getOpcode(), VectorTy, false, CostKind); 7139 7140 // Get the operand that was not the reduction chain and match it to one of the 7141 // patterns, returning the better cost if it is found. 7142 Instruction *RedOp = RetI->getOperand(1) == LastChain 7143 ? 
dyn_cast<Instruction>(RetI->getOperand(0)) 7144 : dyn_cast<Instruction>(RetI->getOperand(1)); 7145 7146 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7147 7148 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 7149 !TheLoop->isLoopInvariant(RedOp)) { 7150 bool IsUnsigned = isa<ZExtInst>(RedOp); 7151 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7152 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7153 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7154 CostKind); 7155 7156 InstructionCost ExtCost = 7157 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7158 TTI::CastContextHint::None, CostKind, RedOp); 7159 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7160 return I == RetI ? *RedCost.getValue() : 0; 7161 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7162 Instruction *Mul = RedOp; 7163 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7164 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7165 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7166 Op0->getOpcode() == Op1->getOpcode() && 7167 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7168 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7169 bool IsUnsigned = isa<ZExtInst>(Op0); 7170 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7171 // reduce(mul(ext, ext)) 7172 InstructionCost ExtCost = 7173 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7174 TTI::CastContextHint::None, CostKind, Op0); 7175 InstructionCost MulCost = 7176 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7177 7178 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7179 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7180 CostKind); 7181 7182 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7183 return I == RetI ? 
*RedCost.getValue() : 0; 7184 } else { 7185 InstructionCost MulCost = 7186 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7187 7188 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7189 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7190 CostKind); 7191 7192 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7193 return I == RetI ? *RedCost.getValue() : 0; 7194 } 7195 } 7196 7197 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7198 } 7199 7200 InstructionCost 7201 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7202 ElementCount VF) { 7203 // Calculate scalar cost only. Vectorization cost should be ready at this 7204 // moment. 7205 if (VF.isScalar()) { 7206 Type *ValTy = getMemInstValueType(I); 7207 const Align Alignment = getLoadStoreAlignment(I); 7208 unsigned AS = getLoadStoreAddressSpace(I); 7209 7210 return TTI.getAddressComputationCost(ValTy) + 7211 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7212 TTI::TCK_RecipThroughput, I); 7213 } 7214 return getWideningCost(I, VF); 7215 } 7216 7217 LoopVectorizationCostModel::VectorizationCostTy 7218 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7219 ElementCount VF) { 7220 // If we know that this instruction will remain uniform, check the cost of 7221 // the scalar version. 7222 if (isUniformAfterVectorization(I, VF)) 7223 VF = ElementCount::getFixed(1); 7224 7225 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7226 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7227 7228 // Forced scalars do not have any scalarization overhead. 
7229 auto ForcedScalar = ForcedScalars.find(VF); 7230 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7231 auto InstSet = ForcedScalar->second; 7232 if (InstSet.count(I)) 7233 return VectorizationCostTy( 7234 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7235 VF.getKnownMinValue()), 7236 false); 7237 } 7238 7239 Type *VectorTy; 7240 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7241 7242 bool TypeNotScalarized = 7243 VF.isVector() && VectorTy->isVectorTy() && 7244 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7245 return VectorizationCostTy(C, TypeNotScalarized); 7246 } 7247 7248 InstructionCost 7249 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7250 ElementCount VF) const { 7251 7252 if (VF.isScalable()) 7253 return InstructionCost::getInvalid(); 7254 7255 if (VF.isScalar()) 7256 return 0; 7257 7258 InstructionCost Cost = 0; 7259 Type *RetTy = ToVectorTy(I->getType(), VF); 7260 if (!RetTy->isVoidTy() && 7261 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7262 Cost += TTI.getScalarizationOverhead( 7263 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7264 true, false); 7265 7266 // Some targets keep addresses scalar. 7267 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7268 return Cost; 7269 7270 // Some targets support efficient element stores. 7271 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7272 return Cost; 7273 7274 // Collect operands to consider. 7275 CallInst *CI = dyn_cast<CallInst>(I); 7276 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7277 7278 // Skip operands that do not require extraction/scalarization and do not incur 7279 // any overhead. 
7280 SmallVector<Type *> Tys; 7281 for (auto *V : filterExtractingOperands(Ops, VF)) 7282 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7283 return Cost + TTI.getOperandsScalarizationOverhead( 7284 filterExtractingOperands(Ops, VF), Tys); 7285 } 7286 7287 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7288 if (VF.isScalar()) 7289 return; 7290 NumPredStores = 0; 7291 for (BasicBlock *BB : TheLoop->blocks()) { 7292 // For each instruction in the old loop. 7293 for (Instruction &I : *BB) { 7294 Value *Ptr = getLoadStorePointerOperand(&I); 7295 if (!Ptr) 7296 continue; 7297 7298 // TODO: We should generate better code and update the cost model for 7299 // predicated uniform stores. Today they are treated as any other 7300 // predicated store (see added test cases in 7301 // invariant-store-vectorization.ll). 7302 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7303 NumPredStores++; 7304 7305 if (Legal->isUniformMemOp(I)) { 7306 // TODO: Avoid replicating loads and stores instead of 7307 // relying on instcombine to remove them. 7308 // Load: Scalar load + broadcast 7309 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7310 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7311 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7312 continue; 7313 } 7314 7315 // We assume that widening is the best solution when possible. 7316 if (memoryInstructionCanBeWidened(&I, VF)) { 7317 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7318 int ConsecutiveStride = 7319 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7320 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7321 "Expected consecutive stride."); 7322 InstWidening Decision = 7323 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7324 setWideningDecision(&I, VF, Decision, Cost); 7325 continue; 7326 } 7327 7328 // Choose between Interleaving, Gather/Scatter or Scalarization. 
7329 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7330 unsigned NumAccesses = 1; 7331 if (isAccessInterleaved(&I)) { 7332 auto Group = getInterleavedAccessGroup(&I); 7333 assert(Group && "Fail to get an interleaved access group."); 7334 7335 // Make one decision for the whole group. 7336 if (getWideningDecision(&I, VF) != CM_Unknown) 7337 continue; 7338 7339 NumAccesses = Group->getNumMembers(); 7340 if (interleavedAccessCanBeWidened(&I, VF)) 7341 InterleaveCost = getInterleaveGroupCost(&I, VF); 7342 } 7343 7344 InstructionCost GatherScatterCost = 7345 isLegalGatherOrScatter(&I) 7346 ? getGatherScatterCost(&I, VF) * NumAccesses 7347 : InstructionCost::getInvalid(); 7348 7349 InstructionCost ScalarizationCost = 7350 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7351 7352 // Choose better solution for the current VF, 7353 // write down this decision and use it during vectorization. 7354 InstructionCost Cost; 7355 InstWidening Decision; 7356 if (InterleaveCost <= GatherScatterCost && 7357 InterleaveCost < ScalarizationCost) { 7358 Decision = CM_Interleave; 7359 Cost = InterleaveCost; 7360 } else if (GatherScatterCost < ScalarizationCost) { 7361 Decision = CM_GatherScatter; 7362 Cost = GatherScatterCost; 7363 } else { 7364 assert(!VF.isScalable() && 7365 "We cannot yet scalarise for scalable vectors"); 7366 Decision = CM_Scalarize; 7367 Cost = ScalarizationCost; 7368 } 7369 // If the instructions belongs to an interleave group, the whole group 7370 // receives the same decision. The whole group receives the cost, but 7371 // the cost will actually be assigned to one instruction. 7372 if (auto Group = getInterleavedAccessGroup(&I)) 7373 setWideningDecision(Group, VF, Decision, Cost); 7374 else 7375 setWideningDecision(&I, VF, Decision, Cost); 7376 } 7377 } 7378 7379 // Make sure that any load of address and any other address computation 7380 // remains scalar unless there is gather/scatter support. 
This avoids 7381 // inevitable extracts into address registers, and also has the benefit of 7382 // activating LSR more, since that pass can't optimize vectorized 7383 // addresses. 7384 if (TTI.prefersVectorizedAddressing()) 7385 return; 7386 7387 // Start with all scalar pointer uses. 7388 SmallPtrSet<Instruction *, 8> AddrDefs; 7389 for (BasicBlock *BB : TheLoop->blocks()) 7390 for (Instruction &I : *BB) { 7391 Instruction *PtrDef = 7392 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7393 if (PtrDef && TheLoop->contains(PtrDef) && 7394 getWideningDecision(&I, VF) != CM_GatherScatter) 7395 AddrDefs.insert(PtrDef); 7396 } 7397 7398 // Add all instructions used to generate the addresses. 7399 SmallVector<Instruction *, 4> Worklist; 7400 append_range(Worklist, AddrDefs); 7401 while (!Worklist.empty()) { 7402 Instruction *I = Worklist.pop_back_val(); 7403 for (auto &Op : I->operands()) 7404 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7405 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7406 AddrDefs.insert(InstOp).second) 7407 Worklist.push_back(InstOp); 7408 } 7409 7410 for (auto *I : AddrDefs) { 7411 if (isa<LoadInst>(I)) { 7412 // Setting the desired widening decision should ideally be handled in 7413 // by cost functions, but since this involves the task of finding out 7414 // if the loaded register is involved in an address computation, it is 7415 // instead changed here when we know this is the case. 7416 InstWidening Decision = getWideningDecision(I, VF); 7417 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7418 // Scalarize a widened load of address. 7419 setWideningDecision( 7420 I, VF, CM_Scalarize, 7421 (VF.getKnownMinValue() * 7422 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7423 else if (auto Group = getInterleavedAccessGroup(I)) { 7424 // Scalarize an interleave group of address loads. 
7425 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7426 if (Instruction *Member = Group->getMember(I)) 7427 setWideningDecision( 7428 Member, VF, CM_Scalarize, 7429 (VF.getKnownMinValue() * 7430 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7431 } 7432 } 7433 } else 7434 // Make sure I gets scalarized and a cost estimate without 7435 // scalarization overhead. 7436 ForcedScalars[VF].insert(I); 7437 } 7438 } 7439 7440 InstructionCost 7441 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7442 Type *&VectorTy) { 7443 Type *RetTy = I->getType(); 7444 if (canTruncateToMinimalBitwidth(I, VF)) 7445 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7446 auto SE = PSE.getSE(); 7447 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7448 7449 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7450 ElementCount VF) -> bool { 7451 if (VF.isScalar()) 7452 return true; 7453 7454 auto Scalarized = InstsToScalarize.find(VF); 7455 assert(Scalarized != InstsToScalarize.end() && 7456 "VF not yet analyzed for scalarization profitability"); 7457 return !Scalarized->second.count(I) && 7458 llvm::all_of(I->users(), [&](User *U) { 7459 auto *UI = cast<Instruction>(U); 7460 return !Scalarized->second.count(UI); 7461 }); 7462 }; 7463 (void) hasSingleCopyAfterVectorization; 7464 7465 if (isScalarAfterVectorization(I, VF)) { 7466 // With the exception of GEPs and PHIs, after scalarization there should 7467 // only be one copy of the instruction generated in the loop. This is 7468 // because the VF is either 1, or any instructions that need scalarizing 7469 // have already been dealt with by the the time we get here. As a result, 7470 // it means we don't have to multiply the instruction cost by VF. 
7471 assert(I->getOpcode() == Instruction::GetElementPtr || 7472 I->getOpcode() == Instruction::PHI || 7473 (I->getOpcode() == Instruction::BitCast && 7474 I->getType()->isPointerTy()) || 7475 hasSingleCopyAfterVectorization(I, VF)); 7476 VectorTy = RetTy; 7477 } else 7478 VectorTy = ToVectorTy(RetTy, VF); 7479 7480 // TODO: We need to estimate the cost of intrinsic calls. 7481 switch (I->getOpcode()) { 7482 case Instruction::GetElementPtr: 7483 // We mark this instruction as zero-cost because the cost of GEPs in 7484 // vectorized code depends on whether the corresponding memory instruction 7485 // is scalarized or not. Therefore, we handle GEPs with the memory 7486 // instruction cost. 7487 return 0; 7488 case Instruction::Br: { 7489 // In cases of scalarized and predicated instructions, there will be VF 7490 // predicated blocks in the vectorized loop. Each branch around these 7491 // blocks requires also an extract of its vector compare i1 element. 7492 bool ScalarPredicatedBB = false; 7493 BranchInst *BI = cast<BranchInst>(I); 7494 if (VF.isVector() && BI->isConditional() && 7495 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7496 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7497 ScalarPredicatedBB = true; 7498 7499 if (ScalarPredicatedBB) { 7500 // Return cost for branches around scalarized and predicated blocks. 7501 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7502 auto *Vec_i1Ty = 7503 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7504 return (TTI.getScalarizationOverhead( 7505 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7506 false, true) + 7507 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7508 VF.getKnownMinValue())); 7509 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7510 // The back-edge branch will remain, as will all scalar branches. 
7511 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7512 else 7513 // This branch will be eliminated by if-conversion. 7514 return 0; 7515 // Note: We currently assume zero cost for an unconditional branch inside 7516 // a predicated block since it will become a fall-through, although we 7517 // may decide in the future to call TTI for all branches. 7518 } 7519 case Instruction::PHI: { 7520 auto *Phi = cast<PHINode>(I); 7521 7522 // First-order recurrences are replaced by vector shuffles inside the loop. 7523 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7524 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7525 return TTI.getShuffleCost( 7526 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7527 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7528 7529 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7530 // converted into select instructions. We require N - 1 selects per phi 7531 // node, where N is the number of incoming values. 7532 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7533 return (Phi->getNumIncomingValues() - 1) * 7534 TTI.getCmpSelInstrCost( 7535 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7536 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7537 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7538 7539 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7540 } 7541 case Instruction::UDiv: 7542 case Instruction::SDiv: 7543 case Instruction::URem: 7544 case Instruction::SRem: 7545 // If we have a predicated instruction, it may not be executed for each 7546 // vector lane. Get the scalarization cost and scale this amount by the 7547 // probability of executing the predicated block. If the instruction is not 7548 // predicated, we fall through to the next case. 
7549 if (VF.isVector() && isScalarWithPredication(I)) { 7550 InstructionCost Cost = 0; 7551 7552 // These instructions have a non-void type, so account for the phi nodes 7553 // that we will create. This cost is likely to be zero. The phi node 7554 // cost, if any, should be scaled by the block probability because it 7555 // models a copy at the end of each predicated block. 7556 Cost += VF.getKnownMinValue() * 7557 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7558 7559 // The cost of the non-predicated instruction. 7560 Cost += VF.getKnownMinValue() * 7561 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7562 7563 // The cost of insertelement and extractelement instructions needed for 7564 // scalarization. 7565 Cost += getScalarizationOverhead(I, VF); 7566 7567 // Scale the cost by the probability of executing the predicated blocks. 7568 // This assumes the predicated block for each vector lane is equally 7569 // likely. 7570 return Cost / getReciprocalPredBlockProb(); 7571 } 7572 LLVM_FALLTHROUGH; 7573 case Instruction::Add: 7574 case Instruction::FAdd: 7575 case Instruction::Sub: 7576 case Instruction::FSub: 7577 case Instruction::Mul: 7578 case Instruction::FMul: 7579 case Instruction::FDiv: 7580 case Instruction::FRem: 7581 case Instruction::Shl: 7582 case Instruction::LShr: 7583 case Instruction::AShr: 7584 case Instruction::And: 7585 case Instruction::Or: 7586 case Instruction::Xor: { 7587 // Since we will replace the stride by 1 the multiplication should go away. 7588 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7589 return 0; 7590 7591 // Detect reduction patterns 7592 InstructionCost RedCost; 7593 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7594 .isValid()) 7595 return RedCost; 7596 7597 // Certain instructions can be cheaper to vectorize if they have a constant 7598 // second vector operand. One example of this are shifts on x86. 
7599 Value *Op2 = I->getOperand(1); 7600 TargetTransformInfo::OperandValueProperties Op2VP; 7601 TargetTransformInfo::OperandValueKind Op2VK = 7602 TTI.getOperandInfo(Op2, Op2VP); 7603 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7604 Op2VK = TargetTransformInfo::OK_UniformValue; 7605 7606 SmallVector<const Value *, 4> Operands(I->operand_values()); 7607 return TTI.getArithmeticInstrCost( 7608 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7609 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7610 } 7611 case Instruction::FNeg: { 7612 return TTI.getArithmeticInstrCost( 7613 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7614 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7615 TargetTransformInfo::OP_None, I->getOperand(0), I); 7616 } 7617 case Instruction::Select: { 7618 SelectInst *SI = cast<SelectInst>(I); 7619 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7620 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7621 7622 const Value *Op0, *Op1; 7623 using namespace llvm::PatternMatch; 7624 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7625 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7626 // select x, y, false --> x & y 7627 // select x, true, y --> x | y 7628 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7629 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7630 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7631 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7632 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7633 Op1->getType()->getScalarSizeInBits() == 1); 7634 7635 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7636 return TTI.getArithmeticInstrCost( 7637 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7638 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7639 } 7640 7641 Type *CondTy = SI->getCondition()->getType(); 7642 if (!ScalarCond) 7643 CondTy = VectorType::get(CondTy, VF); 7644 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7645 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7646 } 7647 case Instruction::ICmp: 7648 case Instruction::FCmp: { 7649 Type *ValTy = I->getOperand(0)->getType(); 7650 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7651 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7652 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7653 VectorTy = ToVectorTy(ValTy, VF); 7654 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7655 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7656 } 7657 case Instruction::Store: 7658 case Instruction::Load: { 7659 ElementCount Width = VF; 7660 if (Width.isVector()) { 7661 InstWidening Decision = getWideningDecision(I, Width); 7662 assert(Decision != CM_Unknown && 7663 "CM decision should be taken at this point"); 7664 if (Decision == CM_Scalarize) 7665 Width = ElementCount::getFixed(1); 7666 } 7667 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7668 return getMemoryInstructionCost(I, VF); 7669 } 7670 case Instruction::BitCast: 7671 if (I->getType()->isPointerTy()) 7672 return 0; 7673 LLVM_FALLTHROUGH; 7674 case Instruction::ZExt: 7675 case Instruction::SExt: 7676 case Instruction::FPToUI: 7677 case Instruction::FPToSI: 7678 case Instruction::FPExt: 7679 case Instruction::PtrToInt: 7680 case Instruction::IntToPtr: 7681 case Instruction::SIToFP: 7682 case Instruction::UIToFP: 7683 case Instruction::Trunc: 7684 case Instruction::FPTrunc: { 7685 // Computes the CastContextHint from a Load/Store instruction. 
7686 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7687 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7688 "Expected a load or a store!"); 7689 7690 if (VF.isScalar() || !TheLoop->contains(I)) 7691 return TTI::CastContextHint::Normal; 7692 7693 switch (getWideningDecision(I, VF)) { 7694 case LoopVectorizationCostModel::CM_GatherScatter: 7695 return TTI::CastContextHint::GatherScatter; 7696 case LoopVectorizationCostModel::CM_Interleave: 7697 return TTI::CastContextHint::Interleave; 7698 case LoopVectorizationCostModel::CM_Scalarize: 7699 case LoopVectorizationCostModel::CM_Widen: 7700 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7701 : TTI::CastContextHint::Normal; 7702 case LoopVectorizationCostModel::CM_Widen_Reverse: 7703 return TTI::CastContextHint::Reversed; 7704 case LoopVectorizationCostModel::CM_Unknown: 7705 llvm_unreachable("Instr did not go through cost modelling?"); 7706 } 7707 7708 llvm_unreachable("Unhandled case!"); 7709 }; 7710 7711 unsigned Opcode = I->getOpcode(); 7712 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7713 // For Trunc, the context is the only user, which must be a StoreInst. 7714 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7715 if (I->hasOneUse()) 7716 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7717 CCH = ComputeCCH(Store); 7718 } 7719 // For Z/Sext, the context is the operand, which must be a LoadInst. 7720 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7721 Opcode == Instruction::FPExt) { 7722 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7723 CCH = ComputeCCH(Load); 7724 } 7725 7726 // We optimize the truncation of induction variables having constant 7727 // integer steps. The cost of these truncations is the same as the scalar 7728 // operation. 
7729 if (isOptimizableIVTruncate(I, VF)) { 7730 auto *Trunc = cast<TruncInst>(I); 7731 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7732 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7733 } 7734 7735 // Detect reduction patterns 7736 InstructionCost RedCost; 7737 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7738 .isValid()) 7739 return RedCost; 7740 7741 Type *SrcScalarTy = I->getOperand(0)->getType(); 7742 Type *SrcVecTy = 7743 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7744 if (canTruncateToMinimalBitwidth(I, VF)) { 7745 // This cast is going to be shrunk. This may remove the cast or it might 7746 // turn it into slightly different cast. For example, if MinBW == 16, 7747 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7748 // 7749 // Calculate the modified src and dest types. 7750 Type *MinVecTy = VectorTy; 7751 if (Opcode == Instruction::Trunc) { 7752 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7753 VectorTy = 7754 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7755 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7756 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7757 VectorTy = 7758 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7759 } 7760 } 7761 7762 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7763 } 7764 case Instruction::Call: { 7765 bool NeedToScalarize; 7766 CallInst *CI = cast<CallInst>(I); 7767 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7768 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7769 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7770 return std::min(CallCost, IntrinsicCost); 7771 } 7772 return CallCost; 7773 } 7774 case Instruction::ExtractValue: 7775 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7776 default: 7777 // This opcode is unknown. Assume that it is the same as 'mul'. 
    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  } // end of switch.
}

char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

// Legacy pass-manager registration: declare the analyses the pass depends on
// so the PM schedules them before LoopVectorize runs.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

/// Factory for the legacy-PM loop vectorizer pass with default flags.
Pass *createLoopVectorizePass() { return new LoopVectorize(); }

/// Factory overload allowing callers to restrict interleaving and/or
/// vectorization to loops where it was explicitly requested (forced).
Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm

/// Returns true if \p Inst is a load or store whose pointer operand the
/// legality analysis proved to be consecutive; false for non-memory
/// instructions or non-consecutive pointers.
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a load or store instruction is
  // consecutive.
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
    return Legal->isConsecutivePtr(Ptr);
  return false;
}

/// Populate ValuesToIgnore / VecValuesToIgnore with instructions whose cost
/// must not be counted: ephemeral values (only feed assumes) and the cast
/// chains that reduction/induction analysis already accounts for.
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : Legal->getReductionVars()) {
    RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (auto &Induction : Legal->getInductionVars()) {
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}

/// Decide, per reduction phi, whether the reduction should be performed
/// "in-loop" (one reduction op per iteration) rather than as a wide reduction
/// finalized after the loop, and record the op chains for cost modelling.
void LoopVectorizationCostModel::collectInLoopReductions() {
  for (auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such. Ordered (strict FP) reductions are always
    // taken, as are those forced by the PreferInLoopReductions option.
    unsigned Opcode = RdxDesc.getOpcode();
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Opcode, Phi->getType(),
                                   TargetTransformInfo::ReductionFlags()))
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop) {
      InLoopReductionChains[Phi] = ReductionOperations;
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      // Each chain member maps to its immediate predecessor in the chain,
      // starting from the phi itself.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
/// Pick a VF for the VPlan-native path: widest vector register width divided
/// by the widest scalar type the cost model observed in the loop.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    // A single VF is used: VPlan has no cost model yet to pick among several.
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

/// Top-level planning entry for inner loops: compute the feasible VF range,
/// collect per-VF cost-model state, build VPlans, and select the best
/// vectorization factor (or Disabled/None when vectorization is off the
/// table).
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    // NOTE(review): UserVFIsLegal is necessarily true on this path, so the
    // "max" arm of the message below is dead; kept to preserve output.
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
                      << " VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    CM.collectInLoopReductions();
    buildVPlansWithVPRecipes({UserVF}, {UserVF});
    LLVM_DEBUG(printPlans(dbgs()));
    return {{UserVF, 0}};
  }

  ElementCount MaxVF = MaxFactors.FixedVF;
  assert(!MaxVF.isScalable() &&
         "Scalable vectors not yet supported beyond this point");

  // Populate per-VF cost-model state for every candidate power-of-two VF.
  for (ElementCount VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();

  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  auto SelectedVF = CM.selectVectorizationFactor(MaxVF);

  // Check if it is profitable to vectorize with runtime checks.
  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
                   OrigLoop->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return VectorizationFactor::Disabled();
    }
  }
  return SelectedVF;
}

/// Record the chosen VF/UF and discard every VPlan that does not cover the
/// chosen VF; exactly one plan must remain.
void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
}

void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  assert(BestVF.hasValue() && "Vectorization Factor is missing");
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");

  VPTransformState State{
      *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print every retained VPlan, either in DOT or plain-text form depending on
/// the PrintVPlansInDotFormat option.
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

/// Collect instructions of the original loop that become trivially dead once
/// the loop is vectorized: exit-condition compares (and their one-use trunc
/// operands), induction updates whose only live user is the phi, and the
/// induction cast chains.
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if it's only used by the
  // terminator
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

// With VF == 1 there is nothing to reverse: a "vector" is a single scalar.
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

// With VF == 1 broadcasting is the identity.
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

/// Scalar (VF == 1) form of the step vector: Val + StartIdx * Step, using
/// FP ops when Val is floating point and \p BinOp to combine.
Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *Ty = Val->getType();
  assert(!Ty->isVectorTy() && "Val must be a scalar");

  if (Ty->isFloatingPointTy()) {
    Constant *C = ConstantFP::get(Ty, (double)StartIdx);

    // Floating-point operations inherit FMF via the builder's flags.
    Value *MulOp = Builder.CreateFMul(C, Step);
    return Builder.CreateBinOp(BinOp, Val, MulOp);
  }
  Constant *C = ConstantInt::get(Ty, StartIdx);
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}

/// Attach "llvm.loop.unroll.runtime.disable" loop metadata to \p L unless
/// some "llvm.loop.unroll.disable"-prefixed metadata is already present,
/// preserving all existing loop-ID operands.
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata =
            S && S->getString().startswith("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}

//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// First pass of epilogue vectorization: build the main vector loop skeleton
/// and stash the guard blocks / trip counts needed by the second pass in EPI.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  // Saved so the second pass can compute the remaining iteration count.
  EPI.VectorTripCount = CountRoundDown;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Skip induction resume value creation here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the vplan in the second pass still contains the inductions from the
  // original loop.

  return completeLoopSkeleton(Lp, OrigLoopID);
}

void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
           << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
           << ", Main Loop UF:" << EPI.MainLoopUF
           << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
  });
}

/// Emit a guard that bypasses the vector loop when the trip count is below
/// VF * UF; \p ForEpilogue selects whether the epilogue's factors (and the
/// extra dominator/bypass bookkeeping) are used instead of the main loop's.
BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
    Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
  assert(L && "Expected valid Loop.");
  assert(Bypass && "Expected valid bypass basic block.");
  unsigned VFactor =
      ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // main vector loop. ULE (not ULT) when a scalar epilogue is required, so at
  // least one scalar iteration is left over.
  auto P =
      Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
      "min.iters.check");

  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                   DT, LI, nullptr, "vector.ph");

  if (ForEpilogue) {
    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                                 DT->getNode(Bypass)->getIDom()) &&
           "TC check is expected to dominate Bypass");

    // Update dominator for Bypass & LoopExit.
    DT->changeImmediateDominator(Bypass, TCCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

    LoopBypassBlocks.push_back(TCCheckBlock);

    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  // Replace the unconditional fallthrough with the guarded branch:
  // too few iterations -> Bypass, otherwise -> vector preheader.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  return TCCheckBlock;
}

//===--------------------------------------------------------------------===//
// EpilogueVectorizerEpilogueLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// Second pass of epilogue vectorization: build the epilogue vector loop
/// skeleton and rewire the guard blocks saved in EPI by the first pass.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
  LoopVectorPreHeader =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, "vec.epilog.ph");
  emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
                                          VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopVectorPreHeader);

  DT->changeImmediateDominator(LoopVectorPreHeader,
                               EPI.MainLoopIterationCountCheck);

  // All first-pass bypass checks now jump straight to the scalar preheader
  // instead of the (repurposed) epilogue iteration-count check block.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  DT->changeImmediateDominator(
      VecEpilogueIterationCountCheck,
      VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(LoopScalarPreHeader,
                               EPI.EpilogueIterationCountCheck);
  DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction
  // phis in the scalar loop preheader.
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader: the main vector loop's trip count when it ran,
  // zero when it was skipped.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
8395 createInductionResumeValues(Lp, CountRoundDown, 8396 {VecEpilogueIterationCountCheck, 8397 EPI.VectorTripCount} /* AdditionalBypass */); 8398 8399 AddRuntimeUnrollDisableMetaData(Lp); 8400 return completeLoopSkeleton(Lp, OrigLoopID); 8401 } 8402 8403 BasicBlock * 8404 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8405 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8406 8407 assert(EPI.TripCount && 8408 "Expected trip count to have been safed in the first pass."); 8409 assert( 8410 (!isa<Instruction>(EPI.TripCount) || 8411 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8412 "saved trip count does not dominate insertion point."); 8413 Value *TC = EPI.TripCount; 8414 IRBuilder<> Builder(Insert->getTerminator()); 8415 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8416 8417 // Generate code to check if the loop's trip count is less than VF * UF of the 8418 // vector epilogue loop. 8419 auto P = 8420 Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8421 8422 Value *CheckMinIters = Builder.CreateICmp( 8423 P, Count, 8424 ConstantInt::get(Count->getType(), 8425 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8426 "min.epilog.iters.check"); 8427 8428 ReplaceInstWithInst( 8429 Insert->getTerminator(), 8430 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8431 8432 LoopBypassBlocks.push_back(Insert); 8433 return Insert; 8434 } 8435 8436 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8437 LLVM_DEBUG({ 8438 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8439 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8440 << ", Main Loop UF:" << EPI.MainLoopUF 8441 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8442 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8443 }); 8444 } 8445 8446 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8447 DEBUG_WITH_TYPE(VerboseDebug, { 8448 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8449 }); 8450 } 8451 8452 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8453 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8454 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8455 bool PredicateAtRangeStart = Predicate(Range.Start); 8456 8457 for (ElementCount TmpVF = Range.Start * 2; 8458 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8459 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8460 Range.End = TmpVF; 8461 break; 8462 } 8463 8464 return PredicateAtRangeStart; 8465 } 8466 8467 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8468 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8469 /// of VF's starting at a given VF and extending it as much as possible. Each 8470 /// vectorization decision can potentially shorten this sub-range during 8471 /// buildVPlan(). 
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
                                           ElementCount MaxVF) {
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(buildVPlan(SubRange));
    // buildVPlan may have clamped SubRange.End; resume from there so every VF
    // in [MinVF, MaxVF] is covered by exactly one plan.
    VF = SubRange.End;
  }
}

/// Compute (and cache) the mask for the edge Src->Dst. A nullptr mask means
/// "all-one": the edge is always taken whenever Src executes.
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask. Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  // Edge to the false successor is guarded by the negated condition.
  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask);

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
    VPValue *False = Plan->getOrAddVPValue(
        ConstantInt::getFalse(BI->getCondition()->getType()));
    EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}

/// Compute (and cache) the mask under which block \p BB executes in the
/// vector loop. nullptr encodes an all-one mask.
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Create the block in mask as the first non-phi instruction in the block.
    VPBuilder::InsertPointGuard Guard(Builder);
    auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
    Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    // Start by constructing the desired canonical IV.
    VPValue *IV = nullptr;
    if (Legal->getPrimaryInduction())
      IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
    else {
      // No primary induction available: synthesize a canonical IV recipe.
      auto IVRecipe = new VPWidenCanonicalIVRecipe();
      Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
      IV = IVRecipe->getVPSingleValue();
    }
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    bool TailFolded = !CM.isScalarEpilogueAllowed();

    if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP instructions
      // happen.
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
    } else {
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}

/// Try to build a recipe widening the memory access \p I for the VFs in
/// \p Range, or return nullptr if the cost model decided against widening
/// (the range is clamped so the decision holds for all of it).
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  auto willWiden = [&](ElementCount VF) -> bool {
    if (VF.isScalar())
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  // Operands: load -> {addr}; store -> {value, addr}.
  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                            Mask);
}

VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
                                           ArrayRef<VPValue *> Operands) const {
  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
  if (II.getKind() == InductionDescriptor::IK_IntInduction ||
      II.getKind() == InductionDescriptor::IK_FpInduction) {
    assert(II.getStartValue() ==
           Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
    const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
    return new VPWidenIntOrFpInductionRecipe(
        Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
  }

  return nullptr;
}

VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
    VPlan &Plan) const {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          isOptimizableIVTruncate(I), Range)) {

    InductionDescriptor II =
        Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
    VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                             Start, nullptr, I);
  }
  return nullptr;
}

/// Turn a non-header phi into either a re-used VPValue (when all incoming
/// values agree) or a VPBlendRecipe over the edge-masked incoming values.
VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
                                                ArrayRef<VPValue *> Operands,
                                                VPlanPtr &Plan) {
  // If all incoming values are equal, the incoming VPValue can be used directly
  // instead of creating a new VPBlendRecipe.
  VPValue *FirstIncoming = Operands[0];
  if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
        return FirstIncoming == Inc;
      })) {
    return Operands[0];
  }

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.
  SmallVector<VPValue *, 2> OperandsWithMask;
  unsigned NumIncoming = Phi->getNumIncomingValues();

  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    OperandsWithMask.push_back(Operands[In]);
    if (EdgeMask)
      OperandsWithMask.push_back(EdgeMask);
  }
  return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
}

VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) const {

  // NOTE(review): the predicate ignores its VF argument -
  // isScalarWithPredication(CI) is queried without a VF, so the answer is
  // uniform across the range by construction.
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
      Range);

  if (IsPredicated)
    return nullptr;

  // These intrinsics are dropped rather than widened.
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  auto willWiden = [&](ElementCount VF) -> bool {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use Intrinsic or a usual Call for vectorized
    // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
    bool NeedToScalarize = false;
    InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
    InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
    bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
    assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
           "Either the intrinsic cost or vector call cost must be valid");
    return UseVectorIntrinsic || !NeedToScalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  // Only the call arguments become recipe operands; drop any extra operands
  // (e.g. the called function) beyond the argument list.
  ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
  return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
}

bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // Instruction should be widened, unless it is scalar after vectorization,
  // scalarization is profitable or it is predicated.
  auto WillScalarize = [this, I](ElementCount VF) -> bool {
    return CM.isScalarAfterVectorization(I, VF) ||
           CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
  };
  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
                                                             Range);
}

/// Widen \p I with a VPWidenRecipe if its opcode is vectorizable; otherwise
/// return nullptr so the caller falls back to replication.
VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands) const {
  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return nullptr;

  // Success: widen this instruction.
  return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
}

/// After all recipes exist, add the backedge incoming value to each header
/// phi recipe recorded in PhisToFix.
void VPRecipeBuilder::fixHeaderPhis() {
  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
  for (VPWidenPHIRecipe *R : PhisToFix) {
    auto *PN = cast<PHINode>(R->getUnderlyingValue());
    VPRecipeBase *IncR =
        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
    R->addOperand(IncR->getVPSingleValue());
  }
}

/// Replicate \p I per lane. Returns the VPBasicBlock to continue appending to:
/// VPBB itself for an unpredicated replicate, or a fresh successor block when
/// the recipe had to be wrapped in an if-then replicate region.
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    // NOTE(review): cast_or_null may yield nullptr, yet RepR is dereferenced
    // unconditionally in the assert below - confirm the operand of a
    // VPPredInstPHIRecipe is always a VPReplicateRecipe here.
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}

VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Void instructions produce no value, hence need no phi to merge the
  // predicated result back into the main flow.
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ? nullptr
                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
  if (PHIRecipe) {
    Plan->removeVPValueFor(Instr);
    Plan->addVPValue(Instr, PHIRecipe);
  }
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}

/// Dispatch \p Instr to the specialized recipe builders (call, memory,
/// induction, phi/blend, trunc-of-induction, GEP, select) and fall back to
/// generic widening; returns an empty result if the instruction should be
/// replicated instead.
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
      return toVPRecipeResult(Recipe);

    if (Legal->isReductionVariable(Phi)) {
      RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
      assert(RdxDesc.getRecurrenceStartValue() ==
             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
      VPValue *StartV = Operands[0];

      auto *PhiRecipe = new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
      PhisToFix.push_back(PhiRecipe);
      // Record the incoming value from the backedge, so we can add the incoming
      // value from the backedge after all recipes have been created.
      recordRecipeOf(cast<Instruction>(
          Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
      return toVPRecipeResult(PhiRecipe);
    }

    return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattend. If the
// control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    // buildVPlanWithVPRecipes may clamp SubRange.End; continue from there.
    VF = SubRange.End;
  }
}

/// Build a VPlan, translating the loop body instruction-by-instruction into
/// recipes, then apply sink-after, interleave-group and in-loop-reduction
/// transforms to bring the plan to its final state for the given VF range.
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  };

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      // Header phis take only their loop-incoming (preheader) value as
      // operand; the backedge value is added later by fixHeaderPhis().
      SmallVector<VPValue *, 4> Operands;
      auto *Phi = dyn_cast<PHINode>(Instr);
      if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
        Operands.push_back(Plan->getOrAddVPValue(
            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
      } else {
        auto OpRange = Plan->mapToVPValues(Instr->operands());
        Operands = {OpRange.begin(), OpRange.end()};
      }
      if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
              Instr, Operands, Range, Plan)) {
        // If Instr can be simplified to an existing VPValue, use it.
        if (RecipeOrValue.is<VPValue *>()) {
          auto *VPV = RecipeOrValue.get<VPValue *>();
          Plan->addVPValue(Instr, VPV);
          // If the re-used value is a recipe, register the recipe for the
          // instruction, in case the recipe for Instr needs to be recorded.
          if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
            RecipeBuilder.setRecipe(Instr, R);
          continue;
        }
        // Otherwise, add the new recipe.
        VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB =
          RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  RecipeBuilder.fixHeaderPhis();

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);

    auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
      auto *Region =
          dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
      if (Region && Region->isReplicator())
        return Region;
      return nullptr;
    };

    // If the target is in a replication region, make sure to move Sink to the
    // block after it, not into the replication region itself.
    if (auto *TargetRegion = GetReplicateRegion(Target)) {
      assert(TargetRegion->getNumSuccessors() == 1 && "Expected SESE region!");
      assert(!GetReplicateRegion(Sink) &&
             "cannot sink a region into another region yet");
      VPBasicBlock *NextBlock =
          cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
      Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
      continue;
    }

    auto *SinkRegion = GetReplicateRegion(Sink);
    // Unless the sink source is in a replicate region, sink the recipe
    // directly.
    if (!SinkRegion) {
      Sink->moveAfter(Target);
      continue;
    }

    // If the sink source is in a replicate region, we need to move the whole
    // replicate region, which should only contain a single recipe in the main
    // block.
    assert(Sink->getParent()->size() == 1 &&
           "parent must be a replicator with a single recipe");
    auto *SplitBlock =
        Target->getParent()->splitAt(std::next(Target->getIterator()));

    // Unhook the sink region from its current position...
    auto *Pred = SinkRegion->getSinglePredecessor();
    auto *Succ = SinkRegion->getSingleSuccessor();
    VPBlockUtils::disconnectBlocks(Pred, SinkRegion);
    VPBlockUtils::disconnectBlocks(SinkRegion, Succ);
    VPBlockUtils::connectBlocks(Pred, Succ);

    // ...and re-insert it between the split halves, right after Target.
    auto *SplitPred = SplitBlock->getSinglePredecessor();

    VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
    VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
    VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
    if (VPBB == SplitPred)
      VPBB = SplitBlock;
  }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  for (auto IG : InterleaveGroups) {
    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
        RecipeBuilder.getRecipe(IG->getInsertPos()));
    SmallVector<VPValue *, 4> StoredValues;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
        StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));

    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
                                        Recipe->getMask());
    VPIG->insertBefore(Recipe);
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        if (!Member->getType()->isVoidTy()) {
          // Re-route users of the member's old VPValue to the corresponding
          // result of the interleave recipe.
          VPValue *OriginalV = Plan->getVPValue(Member);
          Plan->removeVPValueFor(Member);
          Plan->addVPValue(Member, VPIG->getVPValue(J));
          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
          J++;
        }
        RecipeBuilder.getRecipe(Member)->eraseFromParent();
      }
  }

  // Adjust the recipes for any inloop reductions.
  if (Range.Start.isVector())
    adjustRecipesForInLoopReductions(Plan, RecipeBuilder);

  // Finally, if tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      if (CM.isInLoopReduction(Reduction.first))
        continue;
      VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
      VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  // Name the plan after the VF range it covers, e.g. "Initial VPlan for
  // VF={4,8},UF>=1".
  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
                                             Legal->getInductionVars(),
                                             DeadInstructions, *PSE.getSE());
  return Plan;
}

// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi need to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep a track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        // For min/max, operand 0 is the compare; reduction operands start at 1.
        FirstOpId = 1;
      } else {
        assert(isa<VPWidenRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      // Whichever operand is not the scalar chain is the vector operand.
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      // Under tail folding, predicate the reduction on the block mask so
      // masked-off lanes do not contribute.
      auto *CondOp = CM.foldTailByMasking()
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;
      VPReductionRecipe *RedRecipe = new VPReductionRecipe(
          &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
      WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
      Plan->removeVPValueFor(R);
      Plan->addVPValue(R, RedRecipe);
      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
      // NOTE(review): this second replaceAllUsesWith is a no-op - the call
      // above already re-routed every user of WidenRecipe's value.
      WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
      WidenRecipe->eraseFromParent();

      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        // The compare feeding the min/max select is now dead; remove its
        // recipe as well.
        VPRecipeBase *CompareRecipe =
            RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
        assert(isa<VPWidenRecipe>(CompareRecipe) &&
               "Expected to replace a VPWidenSC");
        assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
               "Expected no remaining users");
        CompareRecipe->eraseFromParent();
      }
      Chain = R;
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9356 IG->getInsertPos()->printAsOperand(O, false); 9357 O << ", "; 9358 getAddr()->printAsOperand(O, SlotTracker); 9359 VPValue *Mask = getMask(); 9360 if (Mask) { 9361 O << ", "; 9362 Mask->printAsOperand(O, SlotTracker); 9363 } 9364 for (unsigned i = 0; i < IG->getFactor(); ++i) 9365 if (Instruction *I = IG->getMember(i)) 9366 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9367 } 9368 #endif 9369 9370 void VPWidenCallRecipe::execute(VPTransformState &State) { 9371 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9372 *this, State); 9373 } 9374 9375 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9376 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9377 this, *this, InvariantCond, State); 9378 } 9379 9380 void VPWidenRecipe::execute(VPTransformState &State) { 9381 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9382 } 9383 9384 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9385 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9386 *this, State.UF, State.VF, IsPtrLoopInvariant, 9387 IsIndexLoopInvariant, State); 9388 } 9389 9390 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9391 assert(!State.Instance && "Int or FP induction being replicated."); 9392 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9393 getTruncInst(), getVPValue(0), 9394 getCastValue(), State); 9395 } 9396 9397 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9398 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9399 this, State); 9400 } 9401 9402 void VPBlendRecipe::execute(VPTransformState &State) { 9403 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9404 // We know that all PHIs in non-header blocks are converted into 9405 // selects, so we don't have to worry about the insertion order and we 9406 // can just use the builder. 
9407 // At this point we generate the predication tree. There may be 9408 // duplications since this is a simple recursive scan, but future 9409 // optimizations will clean it up. 9410 9411 unsigned NumIncoming = getNumIncomingValues(); 9412 9413 // Generate a sequence of selects of the form: 9414 // SELECT(Mask3, In3, 9415 // SELECT(Mask2, In2, 9416 // SELECT(Mask1, In1, 9417 // In0))) 9418 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9419 // are essentially undef are taken from In0. 9420 InnerLoopVectorizer::VectorParts Entry(State.UF); 9421 for (unsigned In = 0; In < NumIncoming; ++In) { 9422 for (unsigned Part = 0; Part < State.UF; ++Part) { 9423 // We might have single edge PHIs (blocks) - use an identity 9424 // 'select' for the first PHI operand. 9425 Value *In0 = State.get(getIncomingValue(In), Part); 9426 if (In == 0) 9427 Entry[Part] = In0; // Initialize with the first incoming value. 9428 else { 9429 // Select between the current value and the previous incoming edge 9430 // based on the incoming mask. 
9431 Value *Cond = State.get(getMask(In), Part); 9432 Entry[Part] = 9433 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9434 } 9435 } 9436 } 9437 for (unsigned Part = 0; Part < State.UF; ++Part) 9438 State.set(this, Entry[Part], Part); 9439 } 9440 9441 void VPInterleaveRecipe::execute(VPTransformState &State) { 9442 assert(!State.Instance && "Interleave group being replicated."); 9443 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9444 getStoredValues(), getMask()); 9445 } 9446 9447 void VPReductionRecipe::execute(VPTransformState &State) { 9448 assert(!State.Instance && "Reduction being replicated."); 9449 Value *PrevInChain = State.get(getChainOp(), 0); 9450 for (unsigned Part = 0; Part < State.UF; ++Part) { 9451 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9452 bool IsOrdered = useOrderedReductions(*RdxDesc); 9453 Value *NewVecOp = State.get(getVecOp(), Part); 9454 if (VPValue *Cond = getCondOp()) { 9455 Value *NewCond = State.get(Cond, Part); 9456 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9457 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9458 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9459 Constant *IdenVec = 9460 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9461 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9462 NewVecOp = Select; 9463 } 9464 Value *NewRed; 9465 Value *NextInChain; 9466 if (IsOrdered) { 9467 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9468 PrevInChain); 9469 PrevInChain = NewRed; 9470 } else { 9471 PrevInChain = State.get(getChainOp(), Part); 9472 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9473 } 9474 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9475 NextInChain = 9476 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9477 NewRed, PrevInChain); 9478 } else if (IsOrdered) 9479 NextInChain = NewRed; 9480 else { 9481 NextInChain = 
State.Builder.CreateBinOp( 9482 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9483 PrevInChain); 9484 } 9485 State.set(this, NextInChain, Part); 9486 } 9487 } 9488 9489 void VPReplicateRecipe::execute(VPTransformState &State) { 9490 if (State.Instance) { // Generate a single instance. 9491 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9492 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9493 *State.Instance, IsPredicated, State); 9494 // Insert scalar instance packing it into a vector. 9495 if (AlsoPack && State.VF.isVector()) { 9496 // If we're constructing lane 0, initialize to start from poison. 9497 if (State.Instance->Lane.isFirstLane()) { 9498 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9499 Value *Poison = PoisonValue::get( 9500 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9501 State.set(this, Poison, State.Instance->Part); 9502 } 9503 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9504 } 9505 return; 9506 } 9507 9508 // Generate scalar instances for all VF lanes of all UF parts, unless the 9509 // instruction is uniform inwhich case generate only the first lane for each 9510 // of the UF parts. 9511 unsigned EndLane = IsUniform ? 
1 : State.VF.getKnownMinValue(); 9512 assert((!State.VF.isScalable() || IsUniform) && 9513 "Can't scalarize a scalable vector"); 9514 for (unsigned Part = 0; Part < State.UF; ++Part) 9515 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9516 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9517 VPIteration(Part, Lane), IsPredicated, 9518 State); 9519 } 9520 9521 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9522 assert(State.Instance && "Branch on Mask works only on single instance."); 9523 9524 unsigned Part = State.Instance->Part; 9525 unsigned Lane = State.Instance->Lane.getKnownLane(); 9526 9527 Value *ConditionBit = nullptr; 9528 VPValue *BlockInMask = getMask(); 9529 if (BlockInMask) { 9530 ConditionBit = State.get(BlockInMask, Part); 9531 if (ConditionBit->getType()->isVectorTy()) 9532 ConditionBit = State.Builder.CreateExtractElement( 9533 ConditionBit, State.Builder.getInt32(Lane)); 9534 } else // Block in mask is all-one. 9535 ConditionBit = State.Builder.getTrue(); 9536 9537 // Replace the temporary unreachable terminator with a new conditional branch, 9538 // whose two destinations will be set later when they are created. 
9539 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9540 assert(isa<UnreachableInst>(CurrentTerminator) && 9541 "Expected to replace unreachable terminator with conditional branch."); 9542 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9543 CondBr->setSuccessor(0, nullptr); 9544 ReplaceInstWithInst(CurrentTerminator, CondBr); 9545 } 9546 9547 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9548 assert(State.Instance && "Predicated instruction PHI works per instance."); 9549 Instruction *ScalarPredInst = 9550 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9551 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9552 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9553 assert(PredicatingBB && "Predicated block has no single predecessor."); 9554 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9555 "operand must be VPReplicateRecipe"); 9556 9557 // By current pack/unpack logic we need to generate only a single phi node: if 9558 // a vector value for the predicated instruction exists at this point it means 9559 // the instruction has vector users only, and a phi for the vector value is 9560 // needed. In this case the recipe of the predicated instruction is marked to 9561 // also do that packing, thereby "hoisting" the insert-element sequence. 9562 // Otherwise, a phi node for the scalar value is needed. 9563 unsigned Part = State.Instance->Part; 9564 if (State.hasVectorValue(getOperand(0), Part)) { 9565 Value *VectorValue = State.get(getOperand(0), Part); 9566 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9567 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9568 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9569 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 
9570 if (State.hasVectorValue(this, Part)) 9571 State.reset(this, VPhi, Part); 9572 else 9573 State.set(this, VPhi, Part); 9574 // NOTE: Currently we need to update the value of the operand, so the next 9575 // predicated iteration inserts its generated value in the correct vector. 9576 State.reset(getOperand(0), VPhi, Part); 9577 } else { 9578 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9579 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9580 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9581 PredicatingBB); 9582 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9583 if (State.hasScalarValue(this, *State.Instance)) 9584 State.reset(this, Phi, *State.Instance); 9585 else 9586 State.set(this, Phi, *State.Instance); 9587 // NOTE: Currently we need to update the value of the operand, so the next 9588 // predicated iteration inserts its generated value in the correct vector. 9589 State.reset(getOperand(0), Phi, *State.Instance); 9590 } 9591 } 9592 9593 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9594 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9595 State.ILV->vectorizeMemoryInstruction( 9596 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9597 StoredValue, getMask()); 9598 } 9599 9600 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9601 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9602 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9603 // for predication. 9604 static ScalarEpilogueLowering getScalarEpilogueLowering( 9605 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9606 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9607 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9608 LoopVectorizationLegality &LVL) { 9609 // 1) OptSize takes precedence over all other options, i.e. 
if this is set, 9610 // don't look at hints or options, and don't request a scalar epilogue. 9611 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9612 // LoopAccessInfo (due to code dependency and not being able to reliably get 9613 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9614 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9615 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9616 // back to the old way and vectorize with versioning when forced. See D81345.) 9617 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9618 PGSOQueryType::IRPass) && 9619 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9620 return CM_ScalarEpilogueNotAllowedOptSize; 9621 9622 // 2) If set, obey the directives 9623 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9624 switch (PreferPredicateOverEpilogue) { 9625 case PreferPredicateTy::ScalarEpilogue: 9626 return CM_ScalarEpilogueAllowed; 9627 case PreferPredicateTy::PredicateElseScalarEpilogue: 9628 return CM_ScalarEpilogueNotNeededUsePredicate; 9629 case PreferPredicateTy::PredicateOrDontVectorize: 9630 return CM_ScalarEpilogueNotAllowedUsePredicate; 9631 }; 9632 } 9633 9634 // 3) If set, obey the hints 9635 switch (Hints.getPredicate()) { 9636 case LoopVectorizeHints::FK_Enabled: 9637 return CM_ScalarEpilogueNotNeededUsePredicate; 9638 case LoopVectorizeHints::FK_Disabled: 9639 return CM_ScalarEpilogueAllowed; 9640 }; 9641 9642 // 4) if the TTI hook indicates this is profitable, request predication. 9643 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9644 LVL.getLAI())) 9645 return CM_ScalarEpilogueNotNeededUsePredicate; 9646 9647 return CM_ScalarEpilogueAllowed; 9648 } 9649 9650 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9651 // If Values have been set for this Def return the one relevant for \p Part. 
9652 if (hasVectorValue(Def, Part)) 9653 return Data.PerPartOutput[Def][Part]; 9654 9655 if (!hasScalarValue(Def, {Part, 0})) { 9656 Value *IRV = Def->getLiveInIRValue(); 9657 Value *B = ILV->getBroadcastInstrs(IRV); 9658 set(Def, B, Part); 9659 return B; 9660 } 9661 9662 Value *ScalarValue = get(Def, {Part, 0}); 9663 // If we aren't vectorizing, we can just copy the scalar map values over 9664 // to the vector map. 9665 if (VF.isScalar()) { 9666 set(Def, ScalarValue, Part); 9667 return ScalarValue; 9668 } 9669 9670 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9671 bool IsUniform = RepR && RepR->isUniform(); 9672 9673 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9674 // Check if there is a scalar value for the selected lane. 9675 if (!hasScalarValue(Def, {Part, LastLane})) { 9676 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9677 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9678 "unexpected recipe found to be invariant"); 9679 IsUniform = true; 9680 LastLane = 0; 9681 } 9682 9683 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9684 9685 // Set the insert point after the last scalarized instruction. This 9686 // ensures the insertelement sequence will directly follow the scalar 9687 // definitions. 9688 auto OldIP = Builder.saveIP(); 9689 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9690 Builder.SetInsertPoint(&*NewIP); 9691 9692 // However, if we are vectorizing, we need to construct the vector values. 9693 // If the value is known to be uniform after vectorization, we can just 9694 // broadcast the scalar value corresponding to lane zero for each unroll 9695 // iteration. Otherwise, we construct the vector values using 9696 // insertelement instructions. Since the resulting vectors are stored in 9697 // State, we will only generate the insertelements once. 
9698 Value *VectorValue = nullptr; 9699 if (IsUniform) { 9700 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9701 set(Def, VectorValue, Part); 9702 } else { 9703 // Initialize packing with insertelements to start from undef. 9704 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9705 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9706 set(Def, Undef, Part); 9707 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9708 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9709 VectorValue = get(Def, Part); 9710 } 9711 Builder.restoreIP(OldIP); 9712 return VectorValue; 9713 } 9714 9715 // Process the loop in the VPlan-native vectorization path. This path builds 9716 // VPlan upfront in the vectorization pipeline, which allows to apply 9717 // VPlan-to-VPlan transformations from the very beginning without modifying the 9718 // input LLVM IR. 9719 static bool processLoopInVPlanNativePath( 9720 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9721 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9722 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9723 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9724 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9725 LoopVectorizationRequirements &Requirements) { 9726 9727 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9728 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9729 return false; 9730 } 9731 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9732 Function *F = L->getHeader()->getParent(); 9733 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9734 9735 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9736 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9737 9738 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9739 &Hints, IAI); 9740 // Use the planner for outer loop vectorization. 
9741 // TODO: CM is not used at this point inside the planner. Turn CM into an 9742 // optional argument if we don't need it in the future. 9743 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, 9744 Requirements, ORE); 9745 9746 // Get user vectorization factor. 9747 ElementCount UserVF = Hints.getWidth(); 9748 9749 // Plan how to best vectorize, return the best VF and its cost. 9750 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9751 9752 // If we are stress testing VPlan builds, do not attempt to generate vector 9753 // code. Masked vector code generation support will follow soon. 9754 // Also, do not attempt to vectorize if no vector code will be produced. 9755 if (VPlanBuildStressTest || EnableVPlanPredication || 9756 VectorizationFactor::Disabled() == VF) 9757 return false; 9758 9759 LVP.setBestPlan(VF.Width, 1); 9760 9761 { 9762 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 9763 F->getParent()->getDataLayout()); 9764 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9765 &CM, BFI, PSI, Checks); 9766 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9767 << L->getHeader()->getParent()->getName() << "\"\n"); 9768 LVP.executePlan(LB, DT); 9769 } 9770 9771 // Mark the loop as already vectorized to avoid vectorizing again. 9772 Hints.setAlreadyVectorized(); 9773 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9774 return true; 9775 } 9776 9777 // Emit a remark if there are stores to floats that required a floating point 9778 // extension. If the vectorized loop was generated with floating point there 9779 // will be a performance penalty from the conversion overhead and the change in 9780 // the vector width. 
9781 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9782 SmallVector<Instruction *, 4> Worklist; 9783 for (BasicBlock *BB : L->getBlocks()) { 9784 for (Instruction &Inst : *BB) { 9785 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9786 if (S->getValueOperand()->getType()->isFloatTy()) 9787 Worklist.push_back(S); 9788 } 9789 } 9790 } 9791 9792 // Traverse the floating point stores upwards searching, for floating point 9793 // conversions. 9794 SmallPtrSet<const Instruction *, 4> Visited; 9795 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9796 while (!Worklist.empty()) { 9797 auto *I = Worklist.pop_back_val(); 9798 if (!L->contains(I)) 9799 continue; 9800 if (!Visited.insert(I).second) 9801 continue; 9802 9803 // Emit a remark if the floating point store required a floating 9804 // point conversion. 9805 // TODO: More work could be done to identify the root cause such as a 9806 // constant or a function return type and point the user to it. 9807 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9808 ORE->emit([&]() { 9809 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9810 I->getDebugLoc(), L->getHeader()) 9811 << "floating point conversion changes vector width. " 9812 << "Mixed floating point precision requires an up/down " 9813 << "cast that will negatively impact performance."; 9814 }); 9815 9816 for (Use &Op : I->operands()) 9817 if (auto *OpI = dyn_cast<Instruction>(Op)) 9818 Worklist.push_back(OpI); 9819 } 9820 } 9821 9822 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9823 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9824 !EnableLoopInterleaving), 9825 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9826 !EnableLoopVectorization) {} 9827 9828 bool LoopVectorizePass::processLoop(Loop *L) { 9829 assert((EnableVPlanNativePath || L->isInnermost()) && 9830 "VPlan-native path is not enabled. 
Only process inner loops."); 9831 9832 #ifndef NDEBUG 9833 const std::string DebugLocStr = getDebugLocString(L); 9834 #endif /* NDEBUG */ 9835 9836 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9837 << L->getHeader()->getParent()->getName() << "\" from " 9838 << DebugLocStr << "\n"); 9839 9840 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9841 9842 LLVM_DEBUG( 9843 dbgs() << "LV: Loop hints:" 9844 << " force=" 9845 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9846 ? "disabled" 9847 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9848 ? "enabled" 9849 : "?")) 9850 << " width=" << Hints.getWidth() 9851 << " interleave=" << Hints.getInterleave() << "\n"); 9852 9853 // Function containing loop 9854 Function *F = L->getHeader()->getParent(); 9855 9856 // Looking at the diagnostic output is the only way to determine if a loop 9857 // was vectorized (other than looking at the IR or machine code), so it 9858 // is important to generate an optimization remark for each loop. Most of 9859 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9860 // generated as OptimizationRemark and OptimizationRemarkMissed are 9861 // less verbose reporting vectorized loops and unvectorized loops that may 9862 // benefit from vectorization, respectively. 9863 9864 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9865 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9866 return false; 9867 } 9868 9869 PredicatedScalarEvolution PSE(*SE, *L); 9870 9871 // Check if it is legal to vectorize the loop. 
9872 LoopVectorizationRequirements Requirements; 9873 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9874 &Requirements, &Hints, DB, AC, BFI, PSI); 9875 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9876 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9877 Hints.emitRemarkWithHints(); 9878 return false; 9879 } 9880 9881 // Check the function attributes and profiles to find out if this function 9882 // should be optimized for size. 9883 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9884 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9885 9886 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9887 // here. They may require CFG and instruction level transformations before 9888 // even evaluating whether vectorization is profitable. Since we cannot modify 9889 // the incoming IR, we need to build VPlan upfront in the vectorization 9890 // pipeline. 9891 if (!L->isInnermost()) 9892 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9893 ORE, BFI, PSI, Hints, Requirements); 9894 9895 assert(L->isInnermost() && "Inner loop expected."); 9896 9897 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9898 // count by optimizing for size, to minimize overheads. 9899 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9900 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9901 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9902 << "This loop is worth vectorizing only if no scalar " 9903 << "iteration overheads are incurred."); 9904 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9905 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9906 else { 9907 LLVM_DEBUG(dbgs() << "\n"); 9908 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9909 } 9910 } 9911 9912 // Check the function attributes to see if implicit floats are allowed. 
9913 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9914 // an integer loop and the vector instructions selected are purely integer 9915 // vector instructions? 9916 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9917 reportVectorizationFailure( 9918 "Can't vectorize when the NoImplicitFloat attribute is used", 9919 "loop not vectorized due to NoImplicitFloat attribute", 9920 "NoImplicitFloat", ORE, L); 9921 Hints.emitRemarkWithHints(); 9922 return false; 9923 } 9924 9925 // Check if the target supports potentially unsafe FP vectorization. 9926 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9927 // for the target we're vectorizing for, to make sure none of the 9928 // additional fp-math flags can help. 9929 if (Hints.isPotentiallyUnsafe() && 9930 TTI->isFPVectorizationPotentiallyUnsafe()) { 9931 reportVectorizationFailure( 9932 "Potentially unsafe FP op prevents vectorization", 9933 "loop not vectorized due to unsafe FP support.", 9934 "UnsafeFP", ORE, L); 9935 Hints.emitRemarkWithHints(); 9936 return false; 9937 } 9938 9939 if (!Requirements.canVectorizeFPMath(Hints)) { 9940 ORE->emit([&]() { 9941 auto *ExactFPMathInst = Requirements.getExactFPInst(); 9942 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 9943 ExactFPMathInst->getDebugLoc(), 9944 ExactFPMathInst->getParent()) 9945 << "loop not vectorized: cannot prove it is safe to reorder " 9946 "floating-point operations"; 9947 }); 9948 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 9949 "reorder floating-point operations\n"); 9950 Hints.emitRemarkWithHints(); 9951 return false; 9952 } 9953 9954 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9955 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9956 9957 // If an override option has been passed in for interleaved accesses, use it. 
9958 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9959 UseInterleaved = EnableInterleavedMemAccesses; 9960 9961 // Analyze interleaved memory accesses. 9962 if (UseInterleaved) { 9963 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9964 } 9965 9966 // Use the cost model. 9967 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9968 F, &Hints, IAI); 9969 CM.collectValuesToIgnore(); 9970 9971 // Use the planner for vectorization. 9972 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, 9973 Requirements, ORE); 9974 9975 // Get user vectorization factor and interleave count. 9976 ElementCount UserVF = Hints.getWidth(); 9977 unsigned UserIC = Hints.getInterleave(); 9978 9979 // Plan how to best vectorize, return the best VF and its cost. 9980 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9981 9982 VectorizationFactor VF = VectorizationFactor::Disabled(); 9983 unsigned IC = 1; 9984 9985 if (MaybeVF) { 9986 VF = *MaybeVF; 9987 // Select the interleave count. 9988 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue()); 9989 } 9990 9991 // Identify the diagnostic messages that should be produced. 9992 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 9993 bool VectorizeLoop = true, InterleaveLoop = true; 9994 if (VF.Width.isScalar()) { 9995 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 9996 VecDiagMsg = std::make_pair( 9997 "VectorizationNotBeneficial", 9998 "the cost-model indicates that vectorization is not beneficial"); 9999 VectorizeLoop = false; 10000 } 10001 10002 if (!MaybeVF && UserIC > 1) { 10003 // Tell the user interleaving was avoided up-front, despite being explicitly 10004 // requested. 
10005 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10006 "interleaving should be avoided up front\n"); 10007 IntDiagMsg = std::make_pair( 10008 "InterleavingAvoided", 10009 "Ignoring UserIC, because interleaving was avoided up front"); 10010 InterleaveLoop = false; 10011 } else if (IC == 1 && UserIC <= 1) { 10012 // Tell the user interleaving is not beneficial. 10013 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10014 IntDiagMsg = std::make_pair( 10015 "InterleavingNotBeneficial", 10016 "the cost-model indicates that interleaving is not beneficial"); 10017 InterleaveLoop = false; 10018 if (UserIC == 1) { 10019 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10020 IntDiagMsg.second += 10021 " and is explicitly disabled or interleave count is set to 1"; 10022 } 10023 } else if (IC > 1 && UserIC == 1) { 10024 // Tell the user interleaving is beneficial, but it explicitly disabled. 10025 LLVM_DEBUG( 10026 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10027 IntDiagMsg = std::make_pair( 10028 "InterleavingBeneficialButDisabled", 10029 "the cost-model indicates that interleaving is beneficial " 10030 "but is explicitly disabled or interleave count is set to 1"); 10031 InterleaveLoop = false; 10032 } 10033 10034 // Override IC if user provided an interleave count. 10035 IC = UserIC > 0 ? UserIC : IC; 10036 10037 // Emit diagnostic messages, if any. 10038 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10039 if (!VectorizeLoop && !InterleaveLoop) { 10040 // Do not vectorize or interleaving the loop. 
10041 ORE->emit([&]() { 10042 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10043 L->getStartLoc(), L->getHeader()) 10044 << VecDiagMsg.second; 10045 }); 10046 ORE->emit([&]() { 10047 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10048 L->getStartLoc(), L->getHeader()) 10049 << IntDiagMsg.second; 10050 }); 10051 return false; 10052 } else if (!VectorizeLoop && InterleaveLoop) { 10053 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10054 ORE->emit([&]() { 10055 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10056 L->getStartLoc(), L->getHeader()) 10057 << VecDiagMsg.second; 10058 }); 10059 } else if (VectorizeLoop && !InterleaveLoop) { 10060 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10061 << ") in " << DebugLocStr << '\n'); 10062 ORE->emit([&]() { 10063 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10064 L->getStartLoc(), L->getHeader()) 10065 << IntDiagMsg.second; 10066 }); 10067 } else if (VectorizeLoop && InterleaveLoop) { 10068 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10069 << ") in " << DebugLocStr << '\n'); 10070 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10071 } 10072 10073 bool DisableRuntimeUnroll = false; 10074 MDNode *OrigLoopID = L->getLoopID(); 10075 { 10076 // Optimistically generate runtime checks. Drop them if they turn out to not 10077 // be profitable. Limit the scope of Checks, so the cleanup happens 10078 // immediately after vector codegeneration is done. 
10079 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, 10080 F->getParent()->getDataLayout()); 10081 if (!VF.Width.isScalar() || IC > 1) 10082 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); 10083 LVP.setBestPlan(VF.Width, IC); 10084 10085 using namespace ore; 10086 if (!VectorizeLoop) { 10087 assert(IC > 1 && "interleave count should not be 1 or 0"); 10088 // If we decided that it is not legal to vectorize the loop, then 10089 // interleave it. 10090 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10091 &CM, BFI, PSI, Checks); 10092 LVP.executePlan(Unroller, DT); 10093 10094 ORE->emit([&]() { 10095 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10096 L->getHeader()) 10097 << "interleaved loop (interleaved count: " 10098 << NV("InterleaveCount", IC) << ")"; 10099 }); 10100 } else { 10101 // If we decided that it is *legal* to vectorize the loop, then do it. 10102 10103 // Consider vectorizing the epilogue too if it's profitable. 10104 VectorizationFactor EpilogueVF = 10105 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 10106 if (EpilogueVF.Width.isVector()) { 10107 10108 // The first pass vectorizes the main loop and creates a scalar epilogue 10109 // to be vectorized by executing the plan (potentially with a different 10110 // factor) again shortly afterwards. 10111 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 10112 EpilogueVF.Width.getKnownMinValue(), 10113 1); 10114 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10115 EPI, &LVL, &CM, BFI, PSI, Checks); 10116 10117 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 10118 LVP.executePlan(MainILV, DT); 10119 ++LoopsVectorized; 10120 10121 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10122 formLCSSARecursively(*L, *DT, LI, SE); 10123 10124 // Second pass vectorizes the epilogue and adjusts the control flow 10125 // edges from the first pass. 
10126 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 10127 EPI.MainLoopVF = EPI.EpilogueVF; 10128 EPI.MainLoopUF = EPI.EpilogueUF; 10129 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10130 ORE, EPI, &LVL, &CM, BFI, PSI, 10131 Checks); 10132 LVP.executePlan(EpilogILV, DT); 10133 ++LoopsEpilogueVectorized; 10134 10135 if (!MainILV.areSafetyChecksAdded()) 10136 DisableRuntimeUnroll = true; 10137 } else { 10138 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 10139 &LVL, &CM, BFI, PSI, Checks); 10140 LVP.executePlan(LB, DT); 10141 ++LoopsVectorized; 10142 10143 // Add metadata to disable runtime unrolling a scalar loop when there 10144 // are no runtime checks about strides and memory. A scalar loop that is 10145 // rarely used is not worth unrolling. 10146 if (!LB.areSafetyChecksAdded()) 10147 DisableRuntimeUnroll = true; 10148 } 10149 // Report the vectorization decision. 10150 ORE->emit([&]() { 10151 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 10152 L->getHeader()) 10153 << "vectorized loop (vectorization width: " 10154 << NV("VectorizationFactor", VF.Width) 10155 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 10156 }); 10157 } 10158 10159 if (ORE->allowExtraAnalysis(LV_NAME)) 10160 checkMixedPrecision(L, ORE); 10161 } 10162 10163 Optional<MDNode *> RemainderLoopID = 10164 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10165 LLVMLoopVectorizeFollowupEpilogue}); 10166 if (RemainderLoopID.hasValue()) { 10167 L->setLoopID(RemainderLoopID.getValue()); 10168 } else { 10169 if (DisableRuntimeUnroll) 10170 AddRuntimeUnrollDisableMetaData(L); 10171 10172 // Mark the loop as already vectorized to avoid vectorizing again. 
    Hints.setAlreadyVectorized();
  }

  // Sanity-check the resulting IR before claiming success.
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

/// Shared driver used by both pass-manager entry points: caches the per-run
/// analysis pointers into members, loop-simplifies all loops in \p F, collects
/// the supported inner loops into a worklist and runs processLoop() on each.
/// Returns whether any IR change and, separately, any CFG change was made.
LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  // Stash the analyses in members so processLoop() and its helpers can use
  // them without threading every pointer through each call.
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Report whether this run changed any IR and whether it changed the CFG;
  // the caller uses the latter to decide which analyses are preserved.
  return LoopVectorizeResult(Changed, CFGChanged);
}

/// New pass manager entry point: fetch the analyses runImpl() needs, delegate
/// to it, and translate its result into the set of preserved analyses.
PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  // MemorySSA is only requested when the MSSA-based loop dependency path is
  // enabled by the corresponding flag.
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  // Compute LoopAccessInfo lazily, per loop, via the inner loop analysis
  // manager; LoopStandardAnalysisResults bundles the analyses it requires.
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI, SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  // ProfileSummaryInfo is a module-level analysis; only a cached result is
  // available from a function pass, so PSI may legitimately be null here.
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  // If no control flow was modified, all CFG-dependent analyses remain valid.
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}