//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one. (An illustrative sketch of this
// rewrite follows the reference list at the end of this header.)
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
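//
// As a rough illustration of the 'wide iteration' rewrite described above
// (a C-level sketch for exposition only; the pass itself operates on LLVM-IR
// and the slice notation below is pseudocode), a loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually turned, for a vectorization factor of 4, into
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)          // one 'wide' SIMD iteration per step
//     a[i:i+4] = b[i:i+4] + c[i:i+4];   // vector load, add and store
//   for (; i < n; ++i)                  // scalar epilogue for the remainder
//     a[i] = b[i] + c[i];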
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include "llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/MemorySSA.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include 
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

// FIXME: When loop hints are passed which allow reordering of FP operations,
// we still choose to use strict reductions with this flag. We should instead
// use the default behaviour of vectorizing with unordered reductions if
// reordering is allowed.
cl::opt<bool> EnableStrictReductions(
    "enable-strict-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
498 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 499 bool InvariantCond, VPTransformState &State); 500 501 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 502 void fixVectorizedLoop(VPTransformState &State); 503 504 // Return true if any runtime check is added. 505 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 506 507 /// A type for vectorized values in the new loop. Each value from the 508 /// original loop, when vectorized, is represented by UF vector values in the 509 /// new unrolled loop, where UF is the unroll factor. 510 using VectorParts = SmallVector<Value *, 2>; 511 512 /// Vectorize a single GetElementPtrInst based on information gathered and 513 /// decisions taken during planning. 514 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 515 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 516 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 517 518 /// Vectorize a single PHINode in a block. This method handles the induction 519 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 520 /// arbitrary length vectors. 521 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, 522 VPWidenPHIRecipe *PhiR, VPTransformState &State); 523 524 /// A helper function to scalarize a single Instruction in the innermost loop. 525 /// Generates a sequence of scalar instances for each lane between \p MinLane 526 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 527 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 528 /// Instr's operands. 529 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, 530 const VPIteration &Instance, bool IfPredicateInstr, 531 VPTransformState &State); 532 533 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 534 /// is provided, the integer induction variable will first be truncated to 535 /// the corresponding type. 536 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, 537 VPValue *Def, VPValue *CastDef, 538 VPTransformState &State); 539 540 /// Construct the vector value of a scalarized value \p V one lane at a time. 541 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 542 VPTransformState &State); 543 544 /// Try to vectorize interleaved access group \p Group with the base address 545 /// given in \p Addr, optionally masking the vector operations if \p 546 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 547 /// values in the vectorized loop. 548 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 549 ArrayRef<VPValue *> VPDefs, 550 VPTransformState &State, VPValue *Addr, 551 ArrayRef<VPValue *> StoredValues, 552 VPValue *BlockInMask = nullptr); 553 554 /// Vectorize Load and Store instructions with the base address given in \p 555 /// Addr, optionally masking the vector operations if \p BlockInMask is 556 /// non-null. Use \p State to translate given VPValues to IR values in the 557 /// vectorized loop. 558 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 559 VPValue *Def, VPValue *Addr, 560 VPValue *StoredValue, VPValue *BlockInMask); 561 562 /// Set the debug location in the builder using the debug location in 563 /// the instruction. 564 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 565 566 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 
567 void fixNonInductionPHIs(VPTransformState &State); 568 569 /// Create a broadcast instruction. This method generates a broadcast 570 /// instruction (shuffle) for loop invariant values and for the induction 571 /// value. If this is the induction variable then we extend it to N, N+1, ... 572 /// this is needed because each iteration in the loop corresponds to a SIMD 573 /// element. 574 virtual Value *getBroadcastInstrs(Value *V); 575 576 protected: 577 friend class LoopVectorizationPlanner; 578 579 /// A small list of PHINodes. 580 using PhiVector = SmallVector<PHINode *, 4>; 581 582 /// A type for scalarized values in the new loop. Each value from the 583 /// original loop, when scalarized, is represented by UF x VF scalar values 584 /// in the new unrolled loop, where UF is the unroll factor and VF is the 585 /// vectorization factor. 586 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 587 588 /// Set up the values of the IVs correctly when exiting the vector loop. 589 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 590 Value *CountRoundDown, Value *EndValue, 591 BasicBlock *MiddleBlock); 592 593 /// Create a new induction variable inside L. 594 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 595 Value *Step, Instruction *DL); 596 597 /// Handle all cross-iteration phis in the header. 598 void fixCrossIterationPHIs(VPTransformState &State); 599 600 /// Fix a first-order recurrence. This is the second phase of vectorizing 601 /// this phi node. 602 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State); 603 604 /// Fix a reduction cross-iteration phi. This is the second phase of 605 /// vectorizing this phi node. 606 void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State); 607 608 /// Clear NSW/NUW flags from reduction instructions if necessary. 609 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 610 VPTransformState &State); 611 612 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 613 /// means we need to add the appropriate incoming value from the middle 614 /// block as exiting edges from the scalar epilogue loop (if present) are 615 /// already in place, and we exit the vector loop exclusively to the middle 616 /// block. 617 void fixLCSSAPHIs(VPTransformState &State); 618 619 /// Iteratively sink the scalarized operands of a predicated instruction into 620 /// the block that was created for it. 621 void sinkScalarOperands(Instruction *PredInst); 622 623 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 624 /// represented as. 625 void truncateToMinimalBitwidths(VPTransformState &State); 626 627 /// This function adds 628 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 629 /// to each vector element of Val. The sequence starts at StartIndex. 630 /// \p Opcode is relevant for FP induction variable. 631 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 632 Instruction::BinaryOps Opcode = 633 Instruction::BinaryOpsEnd); 634 635 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 636 /// variable on which to base the steps, \p Step is the size of the step, and 637 /// \p EntryVal is the value from the original loop that maps to the steps. 638 /// Note that \p EntryVal doesn't have to be an induction variable - it 639 /// can also be a truncate instruction. 
640 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 641 const InductionDescriptor &ID, VPValue *Def, 642 VPValue *CastDef, VPTransformState &State); 643 644 /// Create a vector induction phi node based on an existing scalar one. \p 645 /// EntryVal is the value from the original loop that maps to the vector phi 646 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 647 /// truncate instruction, instead of widening the original IV, we widen a 648 /// version of the IV truncated to \p EntryVal's type. 649 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 650 Value *Step, Value *Start, 651 Instruction *EntryVal, VPValue *Def, 652 VPValue *CastDef, 653 VPTransformState &State); 654 655 /// Returns true if an instruction \p I should be scalarized instead of 656 /// vectorized for the chosen vectorization factor. 657 bool shouldScalarizeInstruction(Instruction *I) const; 658 659 /// Returns true if we should generate a scalar version of \p IV. 660 bool needsScalarInduction(Instruction *IV) const; 661 662 /// If there is a cast involved in the induction variable \p ID, which should 663 /// be ignored in the vectorized loop body, this function records the 664 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 665 /// cast. We had already proved that the casted Phi is equal to the uncasted 666 /// Phi in the vectorized loop (under a runtime guard), and therefore 667 /// there is no need to vectorize the cast - the same value can be used in the 668 /// vector loop for both the Phi and the cast. 669 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 670 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 671 /// 672 /// \p EntryVal is the value from the original loop that maps to the vector 673 /// phi node and is used to distinguish what is the IV currently being 674 /// processed - original one (if \p EntryVal is a phi corresponding to the 675 /// original IV) or the "newly-created" one based on the proof mentioned above 676 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 677 /// latter case \p EntryVal is a TruncInst and we must not record anything for 678 /// that IV, but it's error-prone to expect callers of this routine to care 679 /// about that, hence this explicit parameter. 680 void recordVectorLoopValueForInductionCast( 681 const InductionDescriptor &ID, const Instruction *EntryVal, 682 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, 683 unsigned Part, unsigned Lane = UINT_MAX); 684 685 /// Generate a shuffle sequence that will reverse the vector Vec. 686 virtual Value *reverseVector(Value *Vec); 687 688 /// Returns (and creates if needed) the original loop trip count. 689 Value *getOrCreateTripCount(Loop *NewLoop); 690 691 /// Returns (and creates if needed) the trip count of the widened loop. 692 Value *getOrCreateVectorTripCount(Loop *NewLoop); 693 694 /// Returns a bitcasted value to the requested vector type. 695 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 696 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 697 const DataLayout &DL); 698 699 /// Emit a bypass check to see if the vector trip count is zero, including if 700 /// it overflows. 701 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 702 703 /// Emit a bypass check to see if all of the SCEV assumptions we've 704 /// had to make are correct. 
Returns the block containing the checks or 705 /// nullptr if no checks have been added. 706 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); 707 708 /// Emit bypass checks to check any memory assumptions we may have made. 709 /// Returns the block containing the checks or nullptr if no checks have been 710 /// added. 711 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 712 713 /// Compute the transformed value of Index at offset StartValue using step 714 /// StepValue. 715 /// For integer induction, returns StartValue + Index * StepValue. 716 /// For pointer induction, returns StartValue[Index * StepValue]. 717 /// FIXME: The newly created binary instructions should contain nsw/nuw 718 /// flags, which can be found from the original scalar operations. 719 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 720 const DataLayout &DL, 721 const InductionDescriptor &ID) const; 722 723 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 724 /// vector loop preheader, middle block and scalar preheader. Also 725 /// allocate a loop object for the new vector loop and return it. 726 Loop *createVectorLoopSkeleton(StringRef Prefix); 727 728 /// Create new phi nodes for the induction variables to resume iteration count 729 /// in the scalar epilogue, from where the vectorized loop left off (given by 730 /// \p VectorTripCount). 731 /// In cases where the loop skeleton is more complicated (eg. epilogue 732 /// vectorization) and the resume values can come from an additional bypass 733 /// block, the \p AdditionalBypass pair provides information about the bypass 734 /// block and the end value on the edge from bypass to this loop. 735 void createInductionResumeValues( 736 Loop *L, Value *VectorTripCount, 737 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 738 739 /// Complete the loop skeleton by adding debug MDs, creating appropriate 740 /// conditional branches in the middle block, preparing the builder and 741 /// running the verifier. Take in the vector loop \p L as argument, and return 742 /// the preheader of the completed vector loop. 743 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 744 745 /// Add additional metadata to \p To that was not present on \p Orig. 746 /// 747 /// Currently this is used to add the noalias annotations based on the 748 /// inserted memchecks. Use this for instructions that are *cloned* into the 749 /// vector loop. 750 void addNewMetadata(Instruction *To, const Instruction *Orig); 751 752 /// Add metadata from one instruction to another. 753 /// 754 /// This includes both the original MDs from \p From and additional ones (\see 755 /// addNewMetadata). Use this for *newly created* instructions in the vector 756 /// loop. 757 void addMetadata(Instruction *To, Instruction *From); 758 759 /// Similar to the previous function but it adds the metadata to a 760 /// vector of instructions. 761 void addMetadata(ArrayRef<Value *> To, Instruction *From); 762 763 /// Allow subclasses to override and print debug traces before/after vplan 764 /// execution, when trace information is requested. 765 virtual void printDebugTracesAtStart(){}; 766 virtual void printDebugTracesAtEnd(){}; 767 768 /// The original loop. 769 Loop *OrigLoop; 770 771 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 772 /// dynamic knowledge to simplify SCEV expressions and converts them to a 773 /// more usable form. 
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar loops.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
881 GeneratedRTChecks &RTChecks; 882 }; 883 884 class InnerLoopUnroller : public InnerLoopVectorizer { 885 public: 886 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 887 LoopInfo *LI, DominatorTree *DT, 888 const TargetLibraryInfo *TLI, 889 const TargetTransformInfo *TTI, AssumptionCache *AC, 890 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 891 LoopVectorizationLegality *LVL, 892 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 893 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 894 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 895 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 896 BFI, PSI, Check) {} 897 898 private: 899 Value *getBroadcastInstrs(Value *V) override; 900 Value *getStepVector(Value *Val, int StartIdx, Value *Step, 901 Instruction::BinaryOps Opcode = 902 Instruction::BinaryOpsEnd) override; 903 Value *reverseVector(Value *Vec) override; 904 }; 905 906 /// Encapsulate information regarding vectorization of a loop and its epilogue. 907 /// This information is meant to be updated and used across two stages of 908 /// epilogue vectorization. 909 struct EpilogueLoopVectorizationInfo { 910 ElementCount MainLoopVF = ElementCount::getFixed(0); 911 unsigned MainLoopUF = 0; 912 ElementCount EpilogueVF = ElementCount::getFixed(0); 913 unsigned EpilogueUF = 0; 914 BasicBlock *MainLoopIterationCountCheck = nullptr; 915 BasicBlock *EpilogueIterationCountCheck = nullptr; 916 BasicBlock *SCEVSafetyCheck = nullptr; 917 BasicBlock *MemSafetyCheck = nullptr; 918 Value *TripCount = nullptr; 919 Value *VectorTripCount = nullptr; 920 921 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, 922 unsigned EUF) 923 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), 924 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { 925 assert(EUF == 1 && 926 "A high UF for the epilogue loop is likely not beneficial."); 927 } 928 }; 929 930 /// An extension of the inner loop vectorizer that creates a skeleton for a 931 /// vectorized loop that has its epilogue (residual) also vectorized. 932 /// The idea is to run the vplan on a given loop twice, firstly to setup the 933 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 934 /// from the first step and vectorize the epilogue. This is achieved by 935 /// deriving two concrete strategy classes from this base class and invoking 936 /// them in succession from the loop vectorizer planner. 937 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 938 public: 939 InnerLoopAndEpilogueVectorizer( 940 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 941 DominatorTree *DT, const TargetLibraryInfo *TLI, 942 const TargetTransformInfo *TTI, AssumptionCache *AC, 943 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 944 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 945 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 946 GeneratedRTChecks &Checks) 947 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 948 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 949 Checks), 950 EPI(EPI) {} 951 952 // Override this function to handle the more complex control flow around the 953 // three loops. 
954 BasicBlock *createVectorizedLoopSkeleton() final override { 955 return createEpilogueVectorizedLoopSkeleton(); 956 } 957 958 /// The interface for creating a vectorized skeleton using one of two 959 /// different strategies, each corresponding to one execution of the vplan 960 /// as described above. 961 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; 962 963 /// Holds and updates state information required to vectorize the main loop 964 /// and its epilogue in two separate passes. This setup helps us avoid 965 /// regenerating and recomputing runtime safety checks. It also helps us to 966 /// shorten the iteration-count-check path length for the cases where the 967 /// iteration count of the loop is so small that the main vector loop is 968 /// completely skipped. 969 EpilogueLoopVectorizationInfo &EPI; 970 }; 971 972 /// A specialized derived class of inner loop vectorizer that performs 973 /// vectorization of *main* loops in the process of vectorizing loops and their 974 /// epilogues. 975 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 976 public: 977 EpilogueVectorizerMainLoop( 978 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 979 DominatorTree *DT, const TargetLibraryInfo *TLI, 980 const TargetTransformInfo *TTI, AssumptionCache *AC, 981 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 982 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 983 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 984 GeneratedRTChecks &Check) 985 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 986 EPI, LVL, CM, BFI, PSI, Check) {} 987 /// Implements the interface for creating a vectorized skeleton using the 988 /// *main loop* strategy (ie the first pass of vplan execution). 989 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 990 991 protected: 992 /// Emits an iteration count bypass check once for the main loop (when \p 993 /// ForEpilogue is false) and once for the epilogue loop (when \p 994 /// ForEpilogue is true). 995 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, 996 bool ForEpilogue); 997 void printDebugTracesAtStart() override; 998 void printDebugTracesAtEnd() override; 999 }; 1000 1001 // A specialized derived class of inner loop vectorizer that performs 1002 // vectorization of *epilogue* loops in the process of vectorizing loops and 1003 // their epilogues. 1004 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 1005 public: 1006 EpilogueVectorizerEpilogueLoop( 1007 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 1008 DominatorTree *DT, const TargetLibraryInfo *TLI, 1009 const TargetTransformInfo *TTI, AssumptionCache *AC, 1010 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 1011 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 1012 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 1013 GeneratedRTChecks &Checks) 1014 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1015 EPI, LVL, CM, BFI, PSI, Checks) {} 1016 /// Implements the interface for creating a vectorized skeleton using the 1017 /// *epilogue loop* strategy (ie the second pass of vplan execution). 
1018 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 1019 1020 protected: 1021 /// Emits an iteration count bypass check after the main vector loop has 1022 /// finished to see if there are any iterations left to execute by either 1023 /// the vector epilogue or the scalar epilogue. 1024 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, 1025 BasicBlock *Bypass, 1026 BasicBlock *Insert); 1027 void printDebugTracesAtStart() override; 1028 void printDebugTracesAtEnd() override; 1029 }; 1030 } // end namespace llvm 1031 1032 /// Look for a meaningful debug location on the instruction or it's 1033 /// operands. 1034 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 1035 if (!I) 1036 return I; 1037 1038 DebugLoc Empty; 1039 if (I->getDebugLoc() != Empty) 1040 return I; 1041 1042 for (Use &Op : I->operands()) { 1043 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 1044 if (OpInst->getDebugLoc() != Empty) 1045 return OpInst; 1046 } 1047 1048 return I; 1049 } 1050 1051 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 1052 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 1053 const DILocation *DIL = Inst->getDebugLoc(); 1054 1055 // When a FSDiscriminator is enabled, we don't need to add the multiply 1056 // factors to the discriminators. 1057 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 1058 !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) { 1059 // FIXME: For scalable vectors, assume vscale=1. 1060 auto NewDIL = 1061 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 1062 if (NewDIL) 1063 B.SetCurrentDebugLocation(NewDIL.getValue()); 1064 else 1065 LLVM_DEBUG(dbgs() 1066 << "Failed to create new discriminator: " 1067 << DIL->getFilename() << " Line: " << DIL->getLine()); 1068 } else 1069 B.SetCurrentDebugLocation(DIL); 1070 } else 1071 B.SetCurrentDebugLocation(DebugLoc()); 1072 } 1073 1074 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 1075 /// is passed, the message relates to that particular instruction. 1076 #ifndef NDEBUG 1077 static void debugVectorizationMessage(const StringRef Prefix, 1078 const StringRef DebugMsg, 1079 Instruction *I) { 1080 dbgs() << "LV: " << Prefix << DebugMsg; 1081 if (I != nullptr) 1082 dbgs() << " " << *I; 1083 else 1084 dbgs() << '.'; 1085 dbgs() << '\n'; 1086 } 1087 #endif 1088 1089 /// Create an analysis remark that explains why vectorization failed 1090 /// 1091 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1092 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1093 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 1094 /// the location of the remark. \return the remark object that can be 1095 /// streamed to. 1096 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1097 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1098 Value *CodeRegion = TheLoop->getHeader(); 1099 DebugLoc DL = TheLoop->getStartLoc(); 1100 1101 if (I) { 1102 CodeRegion = I->getParent(); 1103 // If there is no debug location attached to the instruction, revert back to 1104 // using the loop's. 1105 if (I->getDebugLoc()) 1106 DL = I->getDebugLoc(); 1107 } 1108 1109 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 1110 } 1111 1112 /// Return a value for Step multiplied by VF. 
1113 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1114 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1115 Constant *StepVal = ConstantInt::get( 1116 Step->getType(), 1117 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1118 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1119 } 1120 1121 namespace llvm { 1122 1123 /// Return the runtime value for VF. 1124 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { 1125 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1126 return VF.isScalable() ? B.CreateVScale(EC) : EC; 1127 } 1128 1129 void reportVectorizationFailure(const StringRef DebugMsg, 1130 const StringRef OREMsg, const StringRef ORETag, 1131 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1132 Instruction *I) { 1133 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1134 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1135 ORE->emit( 1136 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1137 << "loop not vectorized: " << OREMsg); 1138 } 1139 1140 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1141 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1142 Instruction *I) { 1143 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1144 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1145 ORE->emit( 1146 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1147 << Msg); 1148 } 1149 1150 } // end namespace llvm 1151 1152 #ifndef NDEBUG 1153 /// \return string containing a file name and a line # for the given loop. 1154 static std::string getDebugLocString(const Loop *L) { 1155 std::string Result; 1156 if (L) { 1157 raw_string_ostream OS(Result); 1158 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1159 LoopDbgLoc.print(OS); 1160 else 1161 // Just print the module name. 1162 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1163 OS.flush(); 1164 } 1165 return Result; 1166 } 1167 #endif 1168 1169 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1170 const Instruction *Orig) { 1171 // If the loop was versioned with memchecks, add the corresponding no-alias 1172 // metadata. 1173 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1174 LVer->annotateInstWithNoAlias(To, Orig); 1175 } 1176 1177 void InnerLoopVectorizer::addMetadata(Instruction *To, 1178 Instruction *From) { 1179 propagateMetadata(To, From); 1180 addNewMetadata(To, From); 1181 } 1182 1183 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1184 Instruction *From) { 1185 for (Value *V : To) { 1186 if (Instruction *I = dyn_cast<Instruction>(V)) 1187 addMetadata(I, From); 1188 } 1189 } 1190 1191 namespace llvm { 1192 1193 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1194 // lowered. 1195 enum ScalarEpilogueLowering { 1196 1197 // The default: allowing scalar epilogues. 1198 CM_ScalarEpilogueAllowed, 1199 1200 // Vectorization with OptForSize: don't allow epilogues. 1201 CM_ScalarEpilogueNotAllowedOptSize, 1202 1203 // A special case of vectorisation with OptForSize: loops with a very small 1204 // trip count are considered for vectorization under OptForSize, thereby 1205 // making sure the cost of their loop body is dominant, free of runtime 1206 // guards and scalar iteration overheads. 1207 CM_ScalarEpilogueNotAllowedLowTripLoop, 1208 1209 // Loop hint predicate indicating an epilogue is undesired. 
1210 CM_ScalarEpilogueNotNeededUsePredicate, 1211 1212 // Directive indicating we must either tail fold or not vectorize 1213 CM_ScalarEpilogueNotAllowedUsePredicate 1214 }; 1215 1216 /// LoopVectorizationCostModel - estimates the expected speedups due to 1217 /// vectorization. 1218 /// In many cases vectorization is not profitable. This can happen because of 1219 /// a number of reasons. In this class we mainly attempt to predict the 1220 /// expected speedup/slowdowns due to the supported instruction set. We use the 1221 /// TargetTransformInfo to query the different backends for the cost of 1222 /// different operations. 1223 class LoopVectorizationCostModel { 1224 public: 1225 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1226 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1227 LoopVectorizationLegality *Legal, 1228 const TargetTransformInfo &TTI, 1229 const TargetLibraryInfo *TLI, DemandedBits *DB, 1230 AssumptionCache *AC, 1231 OptimizationRemarkEmitter *ORE, const Function *F, 1232 const LoopVectorizeHints *Hints, 1233 InterleavedAccessInfo &IAI) 1234 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1235 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1236 Hints(Hints), InterleaveInfo(IAI) {} 1237 1238 /// \return An upper bound for the vectorization factors (both fixed and 1239 /// scalable). If the factors are 0, vectorization and interleaving should be 1240 /// avoided up front. 1241 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); 1242 1243 /// \return True if runtime checks are required for vectorization, and false 1244 /// otherwise. 1245 bool runtimeChecksRequired(); 1246 1247 /// \return The most profitable vectorization factor and the cost of that VF. 1248 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO 1249 /// then this vectorization factor will be selected if vectorization is 1250 /// possible. 1251 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); 1252 VectorizationFactor 1253 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1254 const LoopVectorizationPlanner &LVP); 1255 1256 /// Setup cost-based decisions for user vectorization factor. 1257 void selectUserVectorizationFactor(ElementCount UserVF) { 1258 collectUniformsAndScalars(UserVF); 1259 collectInstsToScalarize(UserVF); 1260 } 1261 1262 /// \return The size (in bits) of the smallest and widest types in the code 1263 /// that needs to be vectorized. We ignore values that remain scalar such as 1264 /// 64 bit loop indices. 1265 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1266 1267 /// \return The desired interleave count. 1268 /// If interleave count has been specified by metadata it will be returned. 1269 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1270 /// are the selected vectorization factor and the cost of the selected VF. 1271 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1272 1273 /// Memory access instruction may be vectorized in more than one way. 1274 /// Form of instruction after vectorization depends on cost. 1275 /// This function takes cost-based decisions for Load/Store instructions 1276 /// and collects them in a map. This decisions map is used for building 1277 /// the lists of loop-uniform and loop-scalar instructions. 1278 /// The calculated cost is saved with widening decision in order to 1279 /// avoid redundant calculations. 
1280 void setCostBasedWideningDecision(ElementCount VF); 1281 1282 /// A struct that represents some properties of the register usage 1283 /// of a loop. 1284 struct RegisterUsage { 1285 /// Holds the number of loop invariant values that are used in the loop. 1286 /// The key is ClassID of target-provided register class. 1287 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1288 /// Holds the maximum number of concurrent live intervals in the loop. 1289 /// The key is ClassID of target-provided register class. 1290 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1291 }; 1292 1293 /// \return Returns information about the register usages of the loop for the 1294 /// given vectorization factors. 1295 SmallVector<RegisterUsage, 8> 1296 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1297 1298 /// Collect values we want to ignore in the cost model. 1299 void collectValuesToIgnore(); 1300 1301 /// Split reductions into those that happen in the loop, and those that happen 1302 /// outside. In loop reductions are collected into InLoopReductionChains. 1303 void collectInLoopReductions(); 1304 1305 /// \returns The smallest bitwidth each instruction can be represented with. 1306 /// The vector equivalents of these instructions should be truncated to this 1307 /// type. 1308 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1309 return MinBWs; 1310 } 1311 1312 /// \returns True if it is more profitable to scalarize instruction \p I for 1313 /// vectorization factor \p VF. 1314 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1315 assert(VF.isVector() && 1316 "Profitable to scalarize relevant only for VF > 1."); 1317 1318 // Cost model is not run in the VPlan-native path - return conservative 1319 // result until this changes. 1320 if (EnableVPlanNativePath) 1321 return false; 1322 1323 auto Scalars = InstsToScalarize.find(VF); 1324 assert(Scalars != InstsToScalarize.end() && 1325 "VF not yet analyzed for scalarization profitability"); 1326 return Scalars->second.find(I) != Scalars->second.end(); 1327 } 1328 1329 /// Returns true if \p I is known to be uniform after vectorization. 1330 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1331 if (VF.isScalar()) 1332 return true; 1333 1334 // Cost model is not run in the VPlan-native path - return conservative 1335 // result until this changes. 1336 if (EnableVPlanNativePath) 1337 return false; 1338 1339 auto UniformsPerVF = Uniforms.find(VF); 1340 assert(UniformsPerVF != Uniforms.end() && 1341 "VF not yet analyzed for uniformity"); 1342 return UniformsPerVF->second.count(I); 1343 } 1344 1345 /// Returns true if \p I is known to be scalar after vectorization. 1346 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1347 if (VF.isScalar()) 1348 return true; 1349 1350 // Cost model is not run in the VPlan-native path - return conservative 1351 // result until this changes. 1352 if (EnableVPlanNativePath) 1353 return false; 1354 1355 auto ScalarsPerVF = Scalars.find(VF); 1356 assert(ScalarsPerVF != Scalars.end() && 1357 "Scalar values are not calculated for VF"); 1358 return ScalarsPerVF->second.count(I); 1359 } 1360 1361 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1362 /// for vectorization factor \p VF. 
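/// Illustrative example (not from the original comment): if the minimal
/// bitwidth analysis shows an i32 addition only needs its low 8 bits, its
/// widened form can be computed on <VF x i8> and extended afterwards,
/// provided the instruction is neither scalarized nor more profitable to
/// scalarize for this VF.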
1363 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1364 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1365 !isProfitableToScalarize(I, VF) &&
1366 !isScalarAfterVectorization(I, VF);
1367 }
1368
1369 /// Decision that was taken during cost calculation for memory instruction.
1370 enum InstWidening {
1371 CM_Unknown,
1372 CM_Widen, // For consecutive accesses with stride +1.
1373 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1374 CM_Interleave,
1375 CM_GatherScatter,
1376 CM_Scalarize
1377 };
1378
1379 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1380 /// instruction \p I and vector width \p VF.
1381 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1382 InstructionCost Cost) {
1383 assert(VF.isVector() && "Expected VF >=2");
1384 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1385 }
1386
1387 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1388 /// interleaving group \p Grp and vector width \p VF.
1389 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1390 ElementCount VF, InstWidening W,
1391 InstructionCost Cost) {
1392 assert(VF.isVector() && "Expected VF >=2");
1393 // Broadcast this decision to all instructions inside the group.
1394 // But the cost will be assigned to one instruction only.
1395 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1396 if (auto *I = Grp->getMember(i)) {
1397 if (Grp->getInsertPos() == I)
1398 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1399 else
1400 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1401 }
1402 }
1403 }
1404
1405 /// Return the cost model decision for the given instruction \p I and vector
1406 /// width \p VF. Return CM_Unknown if this instruction did not pass
1407 /// through the cost modeling.
1408 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1409 assert(VF.isVector() && "Expected VF to be a vector VF");
1410 // Cost model is not run in the VPlan-native path - return conservative
1411 // result until this changes.
1412 if (EnableVPlanNativePath)
1413 return CM_GatherScatter;
1414
1415 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1416 auto Itr = WideningDecisions.find(InstOnVF);
1417 if (Itr == WideningDecisions.end())
1418 return CM_Unknown;
1419 return Itr->second.first;
1420 }
1421
1422 /// Return the vectorization cost for the given instruction \p I and vector
1423 /// width \p VF.
1424 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1425 assert(VF.isVector() && "Expected VF >=2");
1426 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1427 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1428 "The cost is not calculated");
1429 return WideningDecisions[InstOnVF].second;
1430 }
1431
1432 /// Return true if instruction \p I is an optimizable truncate whose operand
1433 /// is an induction variable. Such a truncate will be removed by adding a new
1434 /// induction variable with the destination type.
1435 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1436 // If the instruction is not a truncate, return false.
1437 auto *Trunc = dyn_cast<TruncInst>(I);
1438 if (!Trunc)
1439 return false;
1440
1441 // Get the source and destination types of the truncate.
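// (Illustrative, not from the original source: for a trunc from i64 to i32
// with a fixed VF of 4 these become <4 x i64> and <4 x i32>; for a scalable
// VF of 4 they become <vscale x 4 x i64> and <vscale x 4 x i32>.)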
1442 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1443 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1444 1445 // If the truncate is free for the given types, return false. Replacing a 1446 // free truncate with an induction variable would add an induction variable 1447 // update instruction to each iteration of the loop. We exclude from this 1448 // check the primary induction variable since it will need an update 1449 // instruction regardless. 1450 Value *Op = Trunc->getOperand(0); 1451 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1452 return false; 1453 1454 // If the truncated value is not an induction variable, return false. 1455 return Legal->isInductionPhi(Op); 1456 } 1457 1458 /// Collects the instructions to scalarize for each predicated instruction in 1459 /// the loop. 1460 void collectInstsToScalarize(ElementCount VF); 1461 1462 /// Collect Uniform and Scalar values for the given \p VF. 1463 /// The sets depend on CM decision for Load/Store instructions 1464 /// that may be vectorized as interleave, gather-scatter or scalarized. 1465 void collectUniformsAndScalars(ElementCount VF) { 1466 // Do the analysis once. 1467 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1468 return; 1469 setCostBasedWideningDecision(VF); 1470 collectLoopUniforms(VF); 1471 collectLoopScalars(VF); 1472 } 1473 1474 /// Returns true if the target machine supports masked store operation 1475 /// for the given \p DataType and kind of access to \p Ptr. 1476 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1477 return Legal->isConsecutivePtr(Ptr) && 1478 TTI.isLegalMaskedStore(DataType, Alignment); 1479 } 1480 1481 /// Returns true if the target machine supports masked load operation 1482 /// for the given \p DataType and kind of access to \p Ptr. 1483 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1484 return Legal->isConsecutivePtr(Ptr) && 1485 TTI.isLegalMaskedLoad(DataType, Alignment); 1486 } 1487 1488 /// Returns true if the target machine supports masked scatter operation 1489 /// for the given \p DataType. 1490 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { 1491 return TTI.isLegalMaskedScatter(DataType, Alignment); 1492 } 1493 1494 /// Returns true if the target machine supports masked gather operation 1495 /// for the given \p DataType. 1496 bool isLegalMaskedGather(Type *DataType, Align Alignment) const { 1497 return TTI.isLegalMaskedGather(DataType, Alignment); 1498 } 1499 1500 /// Returns true if the target machine can represent \p V as a masked gather 1501 /// or scatter operation. 1502 bool isLegalGatherOrScatter(Value *V) { 1503 bool LI = isa<LoadInst>(V); 1504 bool SI = isa<StoreInst>(V); 1505 if (!LI && !SI) 1506 return false; 1507 auto *Ty = getMemInstValueType(V); 1508 Align Align = getLoadStoreAlignment(V); 1509 return (LI && isLegalMaskedGather(Ty, Align)) || 1510 (SI && isLegalMaskedScatter(Ty, Align)); 1511 } 1512 1513 /// Returns true if the target machine supports all of the reduction 1514 /// variables found for the given VF. 1515 bool canVectorizeReductions(ElementCount VF) { 1516 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1517 RecurrenceDescriptor RdxDesc = Reduction.second; 1518 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1519 })); 1520 } 1521 1522 /// Returns true if \p I is an instruction that will be scalarized with 1523 /// predication. 
Such instructions include conditional stores and
1524 /// instructions that may divide by zero.
1525 /// If a non-zero VF has been calculated, we check if I will be scalarized
1526 /// with predication for that VF.
1527 bool isScalarWithPredication(Instruction *I) const;
1528
1529 // Returns true if \p I is an instruction that will be predicated either
1530 // through scalar predication or masked load/store or masked gather/scatter.
1531 // Superset of instructions that return true for isScalarWithPredication.
1532 bool isPredicatedInst(Instruction *I) {
1533 if (!blockNeedsPredication(I->getParent()))
1534 return false;
1535 // Loads and stores that need some form of masked operation are predicated
1536 // instructions.
1537 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1538 return Legal->isMaskRequired(I);
1539 return isScalarWithPredication(I);
1540 }
1541
1542 /// Returns true if \p I is a memory instruction with consecutive memory
1543 /// access that can be widened.
1544 bool
1545 memoryInstructionCanBeWidened(Instruction *I,
1546 ElementCount VF = ElementCount::getFixed(1));
1547
1548 /// Returns true if \p I is a memory instruction in an interleaved-group
1549 /// of memory accesses that can be vectorized with wide vector loads/stores
1550 /// and shuffles.
1551 bool
1552 interleavedAccessCanBeWidened(Instruction *I,
1553 ElementCount VF = ElementCount::getFixed(1));
1554
1555 /// Check if \p Instr belongs to any interleaved access group.
1556 bool isAccessInterleaved(Instruction *Instr) {
1557 return InterleaveInfo.isInterleaved(Instr);
1558 }
1559
1560 /// Get the interleaved access group that \p Instr belongs to.
1561 const InterleaveGroup<Instruction> *
1562 getInterleavedAccessGroup(Instruction *Instr) {
1563 return InterleaveInfo.getInterleaveGroup(Instr);
1564 }
1565
1566 /// Returns true if we're required to use a scalar epilogue for at least
1567 /// the final iteration of the original loop.
1568 bool requiresScalarEpilogue() const {
1569 if (!isScalarEpilogueAllowed())
1570 return false;
1571 // If we might exit from anywhere but the latch, we must run the exiting
1572 // iteration in scalar form.
1573 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1574 return true;
1575 return InterleaveInfo.requiresScalarEpilogue();
1576 }
1577
1578 /// Returns true if a scalar epilogue is allowed, i.e. has not been disabled
1579 /// due to optsize or a loop hint annotation.
1580 bool isScalarEpilogueAllowed() const {
1581 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1582 }
1583
1584 /// Returns true if all loop blocks should be masked to fold the tail of the loop.
1585 bool foldTailByMasking() const { return FoldTailByMasking; }
1586
1587 bool blockNeedsPredication(BasicBlock *BB) const {
1588 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1589 }
1590
1591 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1592 /// nodes to the chain of instructions representing the reductions. Uses a
1593 /// MapVector to ensure deterministic iteration order.
1594 using ReductionChainMap =
1595 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1596
1597 /// Return the chain of instructions representing an inloop reduction.
1598 const ReductionChainMap &getInLoopReductionChains() const {
1599 return InLoopReductionChains;
1600 }
1601
1602 /// Returns true if the Phi is part of an inloop reduction.
1603 bool isInLoopReduction(PHINode *Phi) const {
1604 return InLoopReductionChains.count(Phi);
1605 }
1606
1607 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1608 /// with factor VF. Return the cost of the instruction, including
1609 /// scalarization overhead if it's needed.
1610 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1611
1612 /// Estimate cost of a call instruction CI if it were vectorized with factor
1613 /// VF. Return the cost of the instruction, including scalarization overhead
1614 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1615 /// scalarized, i.e. either a vector version isn't available or it is too
1616 /// expensive.
1617 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1618 bool &NeedToScalarize) const;
1619
1620 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1621 /// that of B.
1622 bool isMoreProfitable(const VectorizationFactor &A,
1623 const VectorizationFactor &B) const;
1624
1625 /// Invalidates decisions already taken by the cost model.
1626 void invalidateCostModelingDecisions() {
1627 WideningDecisions.clear();
1628 Uniforms.clear();
1629 Scalars.clear();
1630 }
1631
1632 private:
1633 unsigned NumPredStores = 0;
1634
1635 /// \return An upper bound for the vectorization factors for both
1636 /// fixed and scalable vectorization, where the minimum-known number of
1637 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1638 /// disabled or unsupported, then the scalable part will be equal to
1639 /// ElementCount::getScalable(0).
1640 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1641 ElementCount UserVF);
1642
1643 /// \return the maximized element count based on the target's vector
1644 /// registers and the loop trip-count, but limited to a maximum safe VF.
1645 /// This is a helper function of computeFeasibleMaxVF.
1646 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1647 /// issue that occurred on one of the buildbots which cannot be reproduced
1648 /// without having access to the proprietary compiler (see comments on
1649 /// D98509). The issue is currently under investigation and this workaround
1650 /// will be removed as soon as possible.
1651 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1652 unsigned SmallestType,
1653 unsigned WidestType,
1654 const ElementCount &MaxSafeVF);
1655
1656 /// \return the maximum legal scalable VF, based on the safe max number
1657 /// of elements.
1658 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1659
1660 /// The vectorization cost is a combination of the cost itself and a boolean
1661 /// indicating whether any of the contributing operations will actually
1662 /// operate on vector values after type legalization in the backend. If this
1663 /// latter value
1664 /// is
1665 /// false, then all operations will be scalarized (i.e. no vectorization has
1666 /// actually taken place).
1667 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1668
1669 /// Returns the expected execution cost. The unit of the cost does
1670 /// not matter because we use the 'cost' units to compare different
1671 /// vector widths. The cost that is returned is *not* normalized by
1672 /// the factor width.
1673 VectorizationCostTy expectedCost(ElementCount VF);
1674
1675 /// Returns the execution time cost of an instruction for a given vector
1676 /// width. Vector width of one means scalar.
1677 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1678
1679 /// The cost-computation logic from getInstructionCost which provides
1680 /// the vector type as an output parameter.
1681 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1682 Type *&VectorTy);
1683
1684 /// Return the cost of instructions in an inloop reduction pattern, if I is
1685 /// part of that pattern.
1686 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1687 Type *VectorTy,
1688 TTI::TargetCostKind CostKind);
1689
1690 /// Calculate vectorization cost of memory instruction \p I.
1691 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1692
1693 /// The cost computation for a scalarized memory instruction.
1694 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1695
1696 /// The cost computation for an interleaving group of memory instructions.
1697 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1698
1699 /// The cost computation for a Gather/Scatter instruction.
1700 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1701
1702 /// The cost computation for widening instruction \p I with consecutive
1703 /// memory access.
1704 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1705
1706 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1707 /// Load: scalar load + broadcast.
1708 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1709 /// element)
1710 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1711
1712 /// Estimate the overhead of scalarizing an instruction. This is a
1713 /// convenience wrapper for the type-based getScalarizationOverhead API.
1714 InstructionCost getScalarizationOverhead(Instruction *I,
1715 ElementCount VF) const;
1716
1717 /// Returns whether the instruction is a load or store and will be emitted
1718 /// as a vector operation.
1719 bool isConsecutiveLoadOrStore(Instruction *I);
1720
1721 /// Returns true if an artificially high cost for emulated masked memrefs
1722 /// should be used.
1723 bool useEmulatedMaskMemRefHack(Instruction *I);
1724
1725 /// Map of scalar integer values to the smallest bitwidth they can be legally
1726 /// represented as. The vector equivalents of these values should be truncated
1727 /// to this type.
1728 MapVector<Instruction *, uint64_t> MinBWs;
1729
1730 /// A type representing the costs for instructions if they were to be
1731 /// scalarized rather than vectorized. The entries are Instruction-Cost
1732 /// pairs.
1733 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1734
1735 /// A set containing all BasicBlocks that are known to be present after
1736 /// vectorization as predicated blocks.
1737 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1738
1739 /// Records whether it is allowed to have the original scalar loop execute at
1740 /// least once. This may be needed as a fallback loop in case runtime
1741 /// aliasing/dependence checks fail, or to handle the tail/remainder
1742 /// iterations when the trip count is unknown or doesn't divide by the VF,
1743 /// or as a peel-loop to handle gaps in interleave-groups.
1744 /// Under optsize and when the trip count is very small we don't allow any
1745 /// iterations to execute in the scalar loop.
1746 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1747 1748 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1749 bool FoldTailByMasking = false; 1750 1751 /// A map holding scalar costs for different vectorization factors. The 1752 /// presence of a cost for an instruction in the mapping indicates that the 1753 /// instruction will be scalarized when vectorizing with the associated 1754 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1755 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1756 1757 /// Holds the instructions known to be uniform after vectorization. 1758 /// The data is collected per VF. 1759 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1760 1761 /// Holds the instructions known to be scalar after vectorization. 1762 /// The data is collected per VF. 1763 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1764 1765 /// Holds the instructions (address computations) that are forced to be 1766 /// scalarized. 1767 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1768 1769 /// PHINodes of the reductions that should be expanded in-loop along with 1770 /// their associated chains of reduction operations, in program order from top 1771 /// (PHI) to bottom 1772 ReductionChainMap InLoopReductionChains; 1773 1774 /// A Map of inloop reduction operations and their immediate chain operand. 1775 /// FIXME: This can be removed once reductions can be costed correctly in 1776 /// vplan. This was added to allow quick lookup to the inloop operations, 1777 /// without having to loop through InLoopReductionChains. 1778 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1779 1780 /// Returns the expected difference in cost from scalarizing the expression 1781 /// feeding a predicated instruction \p PredInst. The instructions to 1782 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1783 /// non-negative return value implies the expression will be scalarized. 1784 /// Currently, only single-use chains are considered for scalarization. 1785 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1786 ElementCount VF); 1787 1788 /// Collect the instructions that are uniform after vectorization. An 1789 /// instruction is uniform if we represent it with a single scalar value in 1790 /// the vectorized loop corresponding to each vector iteration. Examples of 1791 /// uniform instructions include pointer operands of consecutive or 1792 /// interleaved memory accesses. Note that although uniformity implies an 1793 /// instruction will be scalar, the reverse is not true. In general, a 1794 /// scalarized instruction will be represented by VF scalar values in the 1795 /// vectorized loop, each corresponding to an iteration of the original 1796 /// scalar loop. 1797 void collectLoopUniforms(ElementCount VF); 1798 1799 /// Collect the instructions that are scalar after vectorization. An 1800 /// instruction is scalar if it is known to be uniform or will be scalarized 1801 /// during vectorization. Non-uniform scalarized instructions will be 1802 /// represented by VF values in the vectorized loop, each corresponding to an 1803 /// iteration of the original scalar loop. 1804 void collectLoopScalars(ElementCount VF); 1805 1806 /// Keeps cost model vectorization decision and cost for instructions. 1807 /// Right now it is used for memory instructions only. 
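/// Illustrative entry (hypothetical, not from the original source): a
/// consecutive store analysed at a fixed VF of 4 could map the key
/// (StoreInst, ElementCount::getFixed(4)) to the value (CM_Widen, cost 1).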
1808 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1809 std::pair<InstWidening, InstructionCost>>; 1810 1811 DecisionList WideningDecisions; 1812 1813 /// Returns true if \p V is expected to be vectorized and it needs to be 1814 /// extracted. 1815 bool needsExtract(Value *V, ElementCount VF) const { 1816 Instruction *I = dyn_cast<Instruction>(V); 1817 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1818 TheLoop->isLoopInvariant(I)) 1819 return false; 1820 1821 // Assume we can vectorize V (and hence we need extraction) if the 1822 // scalars are not computed yet. This can happen, because it is called 1823 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1824 // the scalars are collected. That should be a safe assumption in most 1825 // cases, because we check if the operands have vectorizable types 1826 // beforehand in LoopVectorizationLegality. 1827 return Scalars.find(VF) == Scalars.end() || 1828 !isScalarAfterVectorization(I, VF); 1829 }; 1830 1831 /// Returns a range containing only operands needing to be extracted. 1832 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1833 ElementCount VF) const { 1834 return SmallVector<Value *, 4>(make_filter_range( 1835 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1836 } 1837 1838 /// Determines if we have the infrastructure to vectorize loop \p L and its 1839 /// epilogue, assuming the main loop is vectorized by \p VF. 1840 bool isCandidateForEpilogueVectorization(const Loop &L, 1841 const ElementCount VF) const; 1842 1843 /// Returns true if epilogue vectorization is considered profitable, and 1844 /// false otherwise. 1845 /// \p VF is the vectorization factor chosen for the original loop. 1846 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1847 1848 public: 1849 /// The loop that we evaluate. 1850 Loop *TheLoop; 1851 1852 /// Predicated scalar evolution analysis. 1853 PredicatedScalarEvolution &PSE; 1854 1855 /// Loop Info analysis. 1856 LoopInfo *LI; 1857 1858 /// Vectorization legality. 1859 LoopVectorizationLegality *Legal; 1860 1861 /// Vector target information. 1862 const TargetTransformInfo &TTI; 1863 1864 /// Target Library Info. 1865 const TargetLibraryInfo *TLI; 1866 1867 /// Demanded bits analysis. 1868 DemandedBits *DB; 1869 1870 /// Assumption cache. 1871 AssumptionCache *AC; 1872 1873 /// Interface to emit optimization remarks. 1874 OptimizationRemarkEmitter *ORE; 1875 1876 const Function *TheFunction; 1877 1878 /// Loop Vectorize Hint. 1879 const LoopVectorizeHints *Hints; 1880 1881 /// The interleave access information contains groups of interleaved accesses 1882 /// with the same stride and close to each other. 1883 InterleavedAccessInfo &InterleaveInfo; 1884 1885 /// Values to ignore in the cost model. 1886 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1887 1888 /// Values to ignore in the cost model when VF > 1. 1889 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1890 1891 /// Profitable vector factors. 1892 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1893 }; 1894 } // end namespace llvm 1895 1896 /// Helper struct to manage generating runtime checks for vectorization. 1897 /// 1898 /// The runtime checks are created up-front in temporary blocks to allow better 1899 /// estimating the cost and un-linked from the existing IR. After deciding to 1900 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1901 /// temporary blocks are completely removed. 
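/// A sketch of the intended usage, derived from the members below; the
/// surrounding variable names are assumptions, not code from this file:
///   GeneratedRTChecks Checks(SE, DT, LI, DL);
///   Checks.Create(L, LAI, UnionPred);   // build the check blocks up front
///   // ... estimate their cost and decide whether to vectorize ...
///   Checks.emitSCEVChecks(...);         // splice SCEV checks back in, if any
///   Checks.emitMemRuntimeChecks(...);   // splice memory checks back in
///   // If neither emit* call is reached, ~GeneratedRTChecks() erases the
///   // unused blocks again.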
1902 class GeneratedRTChecks {
1903 /// Basic block which contains the generated SCEV checks, if any.
1904 BasicBlock *SCEVCheckBlock = nullptr;
1905
1906 /// The value representing the result of the generated SCEV checks. If it is
1907 /// nullptr, either no SCEV checks have been generated or they have been used.
1908 Value *SCEVCheckCond = nullptr;
1909
1910 /// Basic block which contains the generated memory runtime checks, if any.
1911 BasicBlock *MemCheckBlock = nullptr;
1912
1913 /// The value representing the result of the generated memory runtime checks.
1914 /// If it is nullptr, either no memory runtime checks have been generated or
1915 /// they have been used.
1916 Instruction *MemRuntimeCheckCond = nullptr;
1917
1918 DominatorTree *DT;
1919 LoopInfo *LI;
1920
1921 SCEVExpander SCEVExp;
1922 SCEVExpander MemCheckExp;
1923
1924 public:
1925 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1926 const DataLayout &DL)
1927 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1928 MemCheckExp(SE, DL, "scev.check") {}
1929
1930 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1931 /// accurately estimate the cost of the runtime checks. The blocks are
1932 /// un-linked from the IR and are added back during vector code generation. If
1933 /// there is no vector code generation, the check blocks are removed
1934 /// completely.
1935 void Create(Loop *L, const LoopAccessInfo &LAI,
1936 const SCEVUnionPredicate &UnionPred) {
1937
1938 BasicBlock *LoopHeader = L->getHeader();
1939 BasicBlock *Preheader = L->getLoopPreheader();
1940
1941 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1942 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1943 // may be used by SCEVExpander. The blocks will be un-linked from their
1944 // predecessors and removed from LI & DT at the end of the function.
1945 if (!UnionPred.isAlwaysTrue()) {
1946 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1947 nullptr, "vector.scevcheck");
1948
1949 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1950 &UnionPred, SCEVCheckBlock->getTerminator());
1951 }
1952
1953 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1954 if (RtPtrChecking.Need) {
1955 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1956 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1957 "vector.memcheck");
1958
1959 std::tie(std::ignore, MemRuntimeCheckCond) =
1960 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1961 RtPtrChecking.getChecks(), MemCheckExp);
1962 assert(MemRuntimeCheckCond &&
1963 "no RT checks generated although RtPtrChecking "
1964 "claimed checks are required");
1965 }
1966
1967 if (!MemCheckBlock && !SCEVCheckBlock)
1968 return;
1969
1970 // Unhook the temporary blocks with the checks and update various places
1971 // accordingly.
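// (Sketch of the intended state after this point, inferred from the code
// below: the check blocks keep their expanded instructions but become
// unreachable, and they are dropped from DT and LI until emitSCEVChecks /
// emitMemRuntimeChecks splice them back into the CFG.)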
1972 if (SCEVCheckBlock) 1973 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1974 if (MemCheckBlock) 1975 MemCheckBlock->replaceAllUsesWith(Preheader); 1976 1977 if (SCEVCheckBlock) { 1978 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1979 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1980 Preheader->getTerminator()->eraseFromParent(); 1981 } 1982 if (MemCheckBlock) { 1983 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1984 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1985 Preheader->getTerminator()->eraseFromParent(); 1986 } 1987 1988 DT->changeImmediateDominator(LoopHeader, Preheader); 1989 if (MemCheckBlock) { 1990 DT->eraseNode(MemCheckBlock); 1991 LI->removeBlock(MemCheckBlock); 1992 } 1993 if (SCEVCheckBlock) { 1994 DT->eraseNode(SCEVCheckBlock); 1995 LI->removeBlock(SCEVCheckBlock); 1996 } 1997 } 1998 1999 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2000 /// unused. 2001 ~GeneratedRTChecks() { 2002 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2003 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2004 if (!SCEVCheckCond) 2005 SCEVCleaner.markResultUsed(); 2006 2007 if (!MemRuntimeCheckCond) 2008 MemCheckCleaner.markResultUsed(); 2009 2010 if (MemRuntimeCheckCond) { 2011 auto &SE = *MemCheckExp.getSE(); 2012 // Memory runtime check generation creates compares that use expanded 2013 // values. Remove them before running the SCEVExpanderCleaners. 2014 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2015 if (MemCheckExp.isInsertedInstruction(&I)) 2016 continue; 2017 SE.forgetValue(&I); 2018 SE.eraseValueFromMap(&I); 2019 I.eraseFromParent(); 2020 } 2021 } 2022 MemCheckCleaner.cleanup(); 2023 SCEVCleaner.cleanup(); 2024 2025 if (SCEVCheckCond) 2026 SCEVCheckBlock->eraseFromParent(); 2027 if (MemRuntimeCheckCond) 2028 MemCheckBlock->eraseFromParent(); 2029 } 2030 2031 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2032 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2033 /// depending on the generated condition. 2034 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2035 BasicBlock *LoopVectorPreHeader, 2036 BasicBlock *LoopExitBlock) { 2037 if (!SCEVCheckCond) 2038 return nullptr; 2039 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2040 if (C->isZero()) 2041 return nullptr; 2042 2043 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2044 2045 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2046 // Create new preheader for vector loop. 2047 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2048 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2049 2050 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2051 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2052 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2053 SCEVCheckBlock); 2054 2055 DT->addNewBlock(SCEVCheckBlock, Pred); 2056 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2057 2058 ReplaceInstWithInst( 2059 SCEVCheckBlock->getTerminator(), 2060 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2061 // Mark the check as used, to prevent it from being removed during cleanup. 2062 SCEVCheckCond = nullptr; 2063 return SCEVCheckBlock; 2064 } 2065 2066 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2067 /// the branches to branch to the vector preheader or \p Bypass, depending on 2068 /// the generated condition. 
2069 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2070 BasicBlock *LoopVectorPreHeader) { 2071 // Check if we generated code that checks in runtime if arrays overlap. 2072 if (!MemRuntimeCheckCond) 2073 return nullptr; 2074 2075 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2076 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2077 MemCheckBlock); 2078 2079 DT->addNewBlock(MemCheckBlock, Pred); 2080 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2081 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2082 2083 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2084 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2085 2086 ReplaceInstWithInst( 2087 MemCheckBlock->getTerminator(), 2088 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2089 MemCheckBlock->getTerminator()->setDebugLoc( 2090 Pred->getTerminator()->getDebugLoc()); 2091 2092 // Mark the check as used, to prevent it from being removed during cleanup. 2093 MemRuntimeCheckCond = nullptr; 2094 return MemCheckBlock; 2095 } 2096 }; 2097 2098 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2099 // vectorization. The loop needs to be annotated with #pragma omp simd 2100 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2101 // vector length information is not provided, vectorization is not considered 2102 // explicit. Interleave hints are not allowed either. These limitations will be 2103 // relaxed in the future. 2104 // Please, note that we are currently forced to abuse the pragma 'clang 2105 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2106 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2107 // provides *explicit vectorization hints* (LV can bypass legal checks and 2108 // assume that vectorization is legal). However, both hints are implemented 2109 // using the same metadata (llvm.loop.vectorize, processed by 2110 // LoopVectorizeHints). This will be fixed in the future when the native IR 2111 // representation for pragma 'omp simd' is introduced. 2112 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2113 OptimizationRemarkEmitter *ORE) { 2114 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2115 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2116 2117 // Only outer loops with an explicit vectorization hint are supported. 2118 // Unannotated outer loops are ignored. 2119 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2120 return false; 2121 2122 Function *Fn = OuterLp->getHeader()->getParent(); 2123 if (!Hints.allowVectorization(Fn, OuterLp, 2124 true /*VectorizeOnlyWhenForced*/)) { 2125 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2126 return false; 2127 } 2128 2129 if (Hints.getInterleave() > 1) { 2130 // TODO: Interleave support is future work. 2131 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2132 "outer loops.\n"); 2133 Hints.emitRemarkWithHints(); 2134 return false; 2135 } 2136 2137 return true; 2138 } 2139 2140 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2141 OptimizationRemarkEmitter *ORE, 2142 SmallVectorImpl<Loop *> &V) { 2143 // Collect inner loops and outer loops without irreducible control flow. For 2144 // now, only collect outer loops that have explicit vectorization hints. If we 2145 // are stress testing the VPlan H-CFG construction, we collect the outermost 2146 // loop of every loop nest. 
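// For example (illustrative): in a two-deep nest whose outer loop carries an
// explicit vectorization hint while the VPlan-native path is enabled, only
// the outer loop is collected and the recursion stops there; otherwise the
// walk below descends and collects the innermost loop.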
2147 if (L.isInnermost() || VPlanBuildStressTest || 2148 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2149 LoopBlocksRPO RPOT(&L); 2150 RPOT.perform(LI); 2151 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2152 V.push_back(&L); 2153 // TODO: Collect inner loops inside marked outer loops in case 2154 // vectorization fails for the outer loop. Do not invoke 2155 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2156 // already known to be reducible. We can use an inherited attribute for 2157 // that. 2158 return; 2159 } 2160 } 2161 for (Loop *InnerL : L) 2162 collectSupportedLoops(*InnerL, LI, ORE, V); 2163 } 2164 2165 namespace { 2166 2167 /// The LoopVectorize Pass. 2168 struct LoopVectorize : public FunctionPass { 2169 /// Pass identification, replacement for typeid 2170 static char ID; 2171 2172 LoopVectorizePass Impl; 2173 2174 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2175 bool VectorizeOnlyWhenForced = false) 2176 : FunctionPass(ID), 2177 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2178 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2179 } 2180 2181 bool runOnFunction(Function &F) override { 2182 if (skipFunction(F)) 2183 return false; 2184 2185 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2186 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2187 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2188 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2189 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2190 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2191 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2192 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2193 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2194 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2195 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2196 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2197 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2198 2199 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2200 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2201 2202 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2203 GetLAA, *ORE, PSI).MadeAnyChange; 2204 } 2205 2206 void getAnalysisUsage(AnalysisUsage &AU) const override { 2207 AU.addRequired<AssumptionCacheTracker>(); 2208 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2209 AU.addRequired<DominatorTreeWrapperPass>(); 2210 AU.addRequired<LoopInfoWrapperPass>(); 2211 AU.addRequired<ScalarEvolutionWrapperPass>(); 2212 AU.addRequired<TargetTransformInfoWrapperPass>(); 2213 AU.addRequired<AAResultsWrapperPass>(); 2214 AU.addRequired<LoopAccessLegacyAnalysis>(); 2215 AU.addRequired<DemandedBitsWrapperPass>(); 2216 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2217 AU.addRequired<InjectTLIMappingsLegacy>(); 2218 2219 // We currently do not preserve loopinfo/dominator analyses with outer loop 2220 // vectorization. Until this is addressed, mark these analyses as preserved 2221 // only for non-VPlan-native path. 2222 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2223 if (!EnableVPlanNativePath) { 2224 AU.addPreserved<LoopInfoWrapperPass>(); 2225 AU.addPreserved<DominatorTreeWrapperPass>(); 2226 } 2227 2228 AU.addPreserved<BasicAAWrapperPass>(); 2229 AU.addPreserved<GlobalsAAWrapperPass>(); 2230 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2231 } 2232 }; 2233 2234 } // end anonymous namespace 2235 2236 //===----------------------------------------------------------------------===// 2237 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2238 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2239 //===----------------------------------------------------------------------===// 2240 2241 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2242 // We need to place the broadcast of invariant variables outside the loop, 2243 // but only if it's proven safe to do so. Else, broadcast will be inside 2244 // vector loop body. 2245 Instruction *Instr = dyn_cast<Instruction>(V); 2246 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2247 (!Instr || 2248 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2249 // Place the code for broadcasting invariant variables in the new preheader. 2250 IRBuilder<>::InsertPointGuard Guard(Builder); 2251 if (SafeToHoist) 2252 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2253 2254 // Broadcast the scalar into all locations in the vector. 2255 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2256 2257 return Shuf; 2258 } 2259 2260 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2261 const InductionDescriptor &II, Value *Step, Value *Start, 2262 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2263 VPTransformState &State) { 2264 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2265 "Expected either an induction phi-node or a truncate of it!"); 2266 2267 // Construct the initial value of the vector IV in the vector loop preheader 2268 auto CurrIP = Builder.saveIP(); 2269 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2270 if (isa<TruncInst>(EntryVal)) { 2271 assert(Start->getType()->isIntegerTy() && 2272 "Truncation requires an integer type"); 2273 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2274 Step = Builder.CreateTrunc(Step, TruncType); 2275 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2276 } 2277 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2278 Value *SteppedStart = 2279 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2280 2281 // We create vector phi nodes for both integer and floating-point induction 2282 // variables. Here, we determine the kind of arithmetic we will perform. 2283 Instruction::BinaryOps AddOp; 2284 Instruction::BinaryOps MulOp; 2285 if (Step->getType()->isIntegerTy()) { 2286 AddOp = Instruction::Add; 2287 MulOp = Instruction::Mul; 2288 } else { 2289 AddOp = II.getInductionOpcode(); 2290 MulOp = Instruction::FMul; 2291 } 2292 2293 // Multiply the vectorization factor by the step using integer or 2294 // floating-point arithmetic as appropriate. 2295 Type *StepType = Step->getType(); 2296 if (Step->getType()->isFloatingPointTy()) 2297 StepType = IntegerType::get(StepType->getContext(), 2298 StepType->getScalarSizeInBits()); 2299 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2300 if (Step->getType()->isFloatingPointTy()) 2301 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2302 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2303 2304 // Create a vector splat to use in the induction update. 
2305 // 2306 // FIXME: If the step is non-constant, we create the vector splat with 2307 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2308 // handle a constant vector splat. 2309 Value *SplatVF = isa<Constant>(Mul) 2310 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2311 : Builder.CreateVectorSplat(VF, Mul); 2312 Builder.restoreIP(CurrIP); 2313 2314 // We may need to add the step a number of times, depending on the unroll 2315 // factor. The last of those goes into the PHI. 2316 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2317 &*LoopVectorBody->getFirstInsertionPt()); 2318 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2319 Instruction *LastInduction = VecInd; 2320 for (unsigned Part = 0; Part < UF; ++Part) { 2321 State.set(Def, LastInduction, Part); 2322 2323 if (isa<TruncInst>(EntryVal)) 2324 addMetadata(LastInduction, EntryVal); 2325 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2326 State, Part); 2327 2328 LastInduction = cast<Instruction>( 2329 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2330 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2331 } 2332 2333 // Move the last step to the end of the latch block. This ensures consistent 2334 // placement of all induction updates. 2335 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2336 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2337 auto *ICmp = cast<Instruction>(Br->getCondition()); 2338 LastInduction->moveBefore(ICmp); 2339 LastInduction->setName("vec.ind.next"); 2340 2341 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2342 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2343 } 2344 2345 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2346 return Cost->isScalarAfterVectorization(I, VF) || 2347 Cost->isProfitableToScalarize(I, VF); 2348 } 2349 2350 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2351 if (shouldScalarizeInstruction(IV)) 2352 return true; 2353 auto isScalarInst = [&](User *U) -> bool { 2354 auto *I = cast<Instruction>(U); 2355 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2356 }; 2357 return llvm::any_of(IV->users(), isScalarInst); 2358 } 2359 2360 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2361 const InductionDescriptor &ID, const Instruction *EntryVal, 2362 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2363 unsigned Part, unsigned Lane) { 2364 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2365 "Expected either an induction phi-node or a truncate of it!"); 2366 2367 // This induction variable is not the phi from the original loop but the 2368 // newly-created IV based on the proof that casted Phi is equal to the 2369 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2370 // re-uses the same InductionDescriptor that original IV uses but we don't 2371 // have to do any recording in this case - that is done when original IV is 2372 // processed. 2373 if (isa<TruncInst>(EntryVal)) 2374 return; 2375 2376 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2377 if (Casts.empty()) 2378 return; 2379 // Only the first Cast instruction in the Casts vector is of interest. 2380 // The rest of the Casts (if exist) have no uses outside the 2381 // induction update chain itself. 
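// (Illustrative note, inferred from the code below: UINT_MAX acts as a
// sentinel meaning "no particular lane", in which case the whole per-part
// value is recorded for the cast; otherwise only the given lane is.)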
2382 if (Lane < UINT_MAX) 2383 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2384 else 2385 State.set(CastDef, VectorLoopVal, Part); 2386 } 2387 2388 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2389 TruncInst *Trunc, VPValue *Def, 2390 VPValue *CastDef, 2391 VPTransformState &State) { 2392 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2393 "Primary induction variable must have an integer type"); 2394 2395 auto II = Legal->getInductionVars().find(IV); 2396 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2397 2398 auto ID = II->second; 2399 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2400 2401 // The value from the original loop to which we are mapping the new induction 2402 // variable. 2403 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2404 2405 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2406 2407 // Generate code for the induction step. Note that induction steps are 2408 // required to be loop-invariant 2409 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2410 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2411 "Induction step should be loop invariant"); 2412 if (PSE.getSE()->isSCEVable(IV->getType())) { 2413 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2414 return Exp.expandCodeFor(Step, Step->getType(), 2415 LoopVectorPreHeader->getTerminator()); 2416 } 2417 return cast<SCEVUnknown>(Step)->getValue(); 2418 }; 2419 2420 // The scalar value to broadcast. This is derived from the canonical 2421 // induction variable. If a truncation type is given, truncate the canonical 2422 // induction variable and step. Otherwise, derive these values from the 2423 // induction descriptor. 2424 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2425 Value *ScalarIV = Induction; 2426 if (IV != OldInduction) { 2427 ScalarIV = IV->getType()->isIntegerTy() 2428 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2429 : Builder.CreateCast(Instruction::SIToFP, Induction, 2430 IV->getType()); 2431 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2432 ScalarIV->setName("offset.idx"); 2433 } 2434 if (Trunc) { 2435 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2436 assert(Step->getType()->isIntegerTy() && 2437 "Truncation requires an integer step"); 2438 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2439 Step = Builder.CreateTrunc(Step, TruncType); 2440 } 2441 return ScalarIV; 2442 }; 2443 2444 // Create the vector values from the scalar IV, in the absence of creating a 2445 // vector IV. 2446 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2447 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2448 for (unsigned Part = 0; Part < UF; ++Part) { 2449 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2450 Value *EntryPart = 2451 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2452 ID.getInductionOpcode()); 2453 State.set(Def, EntryPart, Part); 2454 if (Trunc) 2455 addMetadata(EntryPart, Trunc); 2456 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2457 State, Part); 2458 } 2459 }; 2460 2461 // Fast-math-flags propagate from the original induction instruction. 
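// (Illustrative: if the original floating-point induction update was, say,
// 'fadd fast', the guard below makes the widened step computations created
// in this function inherit those fast-math flags.)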
2462 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2463 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2464 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2465 2466 // Now do the actual transformations, and start with creating the step value. 2467 Value *Step = CreateStepValue(ID.getStep()); 2468 if (VF.isZero() || VF.isScalar()) { 2469 Value *ScalarIV = CreateScalarIV(Step); 2470 CreateSplatIV(ScalarIV, Step); 2471 return; 2472 } 2473 2474 // Determine if we want a scalar version of the induction variable. This is 2475 // true if the induction variable itself is not widened, or if it has at 2476 // least one user in the loop that is not widened. 2477 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2478 if (!NeedsScalarIV) { 2479 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2480 State); 2481 return; 2482 } 2483 2484 // Try to create a new independent vector induction variable. If we can't 2485 // create the phi node, we will splat the scalar induction variable in each 2486 // loop iteration. 2487 if (!shouldScalarizeInstruction(EntryVal)) { 2488 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2489 State); 2490 Value *ScalarIV = CreateScalarIV(Step); 2491 // Create scalar steps that can be used by instructions we will later 2492 // scalarize. Note that the addition of the scalar steps will not increase 2493 // the number of instructions in the loop in the common case prior to 2494 // InstCombine. We will be trading one vector extract for each scalar step. 2495 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2496 return; 2497 } 2498 2499 // All IV users are scalar instructions, so only emit a scalar IV, not a 2500 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2501 // predicate used by the masked loads/stores. 2502 Value *ScalarIV = CreateScalarIV(Step); 2503 if (!Cost->isScalarEpilogueAllowed()) 2504 CreateSplatIV(ScalarIV, Step); 2505 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2506 } 2507 2508 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2509 Instruction::BinaryOps BinOp) { 2510 // Create and check the types. 2511 auto *ValVTy = cast<VectorType>(Val->getType()); 2512 ElementCount VLen = ValVTy->getElementCount(); 2513 2514 Type *STy = Val->getType()->getScalarType(); 2515 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2516 "Induction Step must be an integer or FP"); 2517 assert(Step->getType() == STy && "Step has wrong type"); 2518 2519 SmallVector<Constant *, 8> Indices; 2520 2521 // Create a vector of consecutive numbers from zero to VF. 2522 VectorType *InitVecValVTy = ValVTy; 2523 Type *InitVecValSTy = STy; 2524 if (STy->isFloatingPointTy()) { 2525 InitVecValSTy = 2526 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2527 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2528 } 2529 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2530 2531 // Add on StartIdx 2532 Value *StartIdxSplat = Builder.CreateVectorSplat( 2533 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2534 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2535 2536 if (STy->isIntegerTy()) { 2537 Step = Builder.CreateVectorSplat(VLen, Step); 2538 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2539 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2540 // which can be found from the original scalar operations. 
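// For example (illustrative, assuming i64 elements, StartIdx = 0, a fixed
// VF of 4 and a constant step of 2): InitVec is <i64 0, i64 1, i64 2, i64 3>,
// the multiply below constant-folds to <i64 0, i64 2, i64 4, i64 6>, and the
// add yields
//   %induction = add <4 x i64> %val, <i64 0, i64 2, i64 4, i64 6>
// where %val is the broadcast starting value passed in as Val.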
2541 Step = Builder.CreateMul(InitVec, Step); 2542 return Builder.CreateAdd(Val, Step, "induction"); 2543 } 2544 2545 // Floating point induction. 2546 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2547 "Binary Opcode should be specified for FP induction"); 2548 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2549 Step = Builder.CreateVectorSplat(VLen, Step); 2550 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2551 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2552 } 2553 2554 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2555 Instruction *EntryVal, 2556 const InductionDescriptor &ID, 2557 VPValue *Def, VPValue *CastDef, 2558 VPTransformState &State) { 2559 // We shouldn't have to build scalar steps if we aren't vectorizing. 2560 assert(VF.isVector() && "VF should be greater than one"); 2561 // Get the value type and ensure it and the step have the same integer type. 2562 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2563 assert(ScalarIVTy == Step->getType() && 2564 "Val and Step should have the same type"); 2565 2566 // We build scalar steps for both integer and floating-point induction 2567 // variables. Here, we determine the kind of arithmetic we will perform. 2568 Instruction::BinaryOps AddOp; 2569 Instruction::BinaryOps MulOp; 2570 if (ScalarIVTy->isIntegerTy()) { 2571 AddOp = Instruction::Add; 2572 MulOp = Instruction::Mul; 2573 } else { 2574 AddOp = ID.getInductionOpcode(); 2575 MulOp = Instruction::FMul; 2576 } 2577 2578 // Determine the number of scalars we need to generate for each unroll 2579 // iteration. If EntryVal is uniform, we only need to generate the first 2580 // lane. Otherwise, we generate all VF values. 2581 bool IsUniform = 2582 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2583 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2584 // Compute the scalar steps and save the results in State. 2585 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2586 ScalarIVTy->getScalarSizeInBits()); 2587 Type *VecIVTy = nullptr; 2588 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2589 if (!IsUniform && VF.isScalable()) { 2590 VecIVTy = VectorType::get(ScalarIVTy, VF); 2591 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2592 SplatStep = Builder.CreateVectorSplat(VF, Step); 2593 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2594 } 2595 2596 for (unsigned Part = 0; Part < UF; ++Part) { 2597 Value *StartIdx0 = 2598 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2599 2600 if (!IsUniform && VF.isScalable()) { 2601 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2602 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2603 if (ScalarIVTy->isFloatingPointTy()) 2604 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2605 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2606 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2607 State.set(Def, Add, Part); 2608 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2609 Part); 2610 // It's useful to record the lane values too for the known minimum number 2611 // of elements so we do those below. This improves the code quality when 2612 // trying to extract the first element, for example. 
2613 } 2614 2615 if (ScalarIVTy->isFloatingPointTy()) 2616 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2617 2618 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2619 Value *StartIdx = Builder.CreateBinOp( 2620 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2621 // The step returned by `createStepForVF` is a runtime-evaluated value 2622 // when VF is scalable. Otherwise, it should be folded into a Constant. 2623 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2624 "Expected StartIdx to be folded to a constant when VF is not " 2625 "scalable"); 2626 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2627 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2628 State.set(Def, Add, VPIteration(Part, Lane)); 2629 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2630 Part, Lane); 2631 } 2632 } 2633 } 2634 2635 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2636 const VPIteration &Instance, 2637 VPTransformState &State) { 2638 Value *ScalarInst = State.get(Def, Instance); 2639 Value *VectorValue = State.get(Def, Instance.Part); 2640 VectorValue = Builder.CreateInsertElement( 2641 VectorValue, ScalarInst, 2642 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2643 State.set(Def, VectorValue, Instance.Part); 2644 } 2645 2646 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2647 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2648 return Builder.CreateVectorReverse(Vec, "reverse"); 2649 } 2650 2651 // Return whether we allow using masked interleave-groups (for dealing with 2652 // strided loads/stores that reside in predicated blocks, or for dealing 2653 // with gaps). 2654 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2655 // If an override option has been passed in for interleaved accesses, use it. 2656 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2657 return EnableMaskedInterleavedMemAccesses; 2658 2659 return TTI.enableMaskedInterleavedAccessVectorization(); 2660 } 2661 2662 // Try to vectorize the interleave group that \p Instr belongs to. 2663 // 2664 // E.g. Translate following interleaved load group (factor = 3): 2665 // for (i = 0; i < N; i+=3) { 2666 // R = Pic[i]; // Member of index 0 2667 // G = Pic[i+1]; // Member of index 1 2668 // B = Pic[i+2]; // Member of index 2 2669 // ... // do something to R, G, B 2670 // } 2671 // To: 2672 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2673 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2674 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2675 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2676 // 2677 // Or translate following interleaved store group (factor = 3): 2678 // for (i = 0; i < N; i+=3) { 2679 // ... 
do something to R, G, B 2680 // Pic[i] = R; // Member of index 0 2681 // Pic[i+1] = G; // Member of index 1 2682 // Pic[i+2] = B; // Member of index 2 2683 // } 2684 // To: 2685 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2686 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2687 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2688 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2689 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2690 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2691 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2692 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2693 VPValue *BlockInMask) { 2694 Instruction *Instr = Group->getInsertPos(); 2695 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2696 2697 // Prepare for the vector type of the interleaved load/store. 2698 Type *ScalarTy = getMemInstValueType(Instr); 2699 unsigned InterleaveFactor = Group->getFactor(); 2700 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2701 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2702 2703 // Prepare for the new pointers. 2704 SmallVector<Value *, 2> AddrParts; 2705 unsigned Index = Group->getIndex(Instr); 2706 2707 // TODO: extend the masked interleaved-group support to reversed access. 2708 assert((!BlockInMask || !Group->isReverse()) && 2709 "Reversed masked interleave-group not supported."); 2710 2711 // If the group is reverse, adjust the index to refer to the last vector lane 2712 // instead of the first. We adjust the index from the first vector lane, 2713 // rather than directly getting the pointer for lane VF - 1, because the 2714 // pointer operand of the interleaved access is supposed to be uniform. For 2715 // uniform instructions, we're only required to generate a value for the 2716 // first vector lane in each unroll iteration. 2717 if (Group->isReverse()) 2718 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2719 2720 for (unsigned Part = 0; Part < UF; Part++) { 2721 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2722 setDebugLocFromInst(Builder, AddrPart); 2723 2724 // Notice current instruction could be any index. Need to adjust the address 2725 // to the member of index 0. 2726 // 2727 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2728 // b = A[i]; // Member of index 0 2729 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2730 // 2731 // E.g. A[i+1] = a; // Member of index 1 2732 // A[i] = b; // Member of index 0 2733 // A[i+2] = c; // Member of index 2 (Current instruction) 2734 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2735 2736 bool InBounds = false; 2737 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2738 InBounds = gep->isInBounds(); 2739 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2740 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2741 2742 // Cast to the vector pointer type. 
2743 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2744 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2745 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2746 } 2747 2748 setDebugLocFromInst(Builder, Instr); 2749 Value *PoisonVec = PoisonValue::get(VecTy); 2750 2751 Value *MaskForGaps = nullptr; 2752 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2753 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2754 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2755 } 2756 2757 // Vectorize the interleaved load group. 2758 if (isa<LoadInst>(Instr)) { 2759 // For each unroll part, create a wide load for the group. 2760 SmallVector<Value *, 2> NewLoads; 2761 for (unsigned Part = 0; Part < UF; Part++) { 2762 Instruction *NewLoad; 2763 if (BlockInMask || MaskForGaps) { 2764 assert(useMaskedInterleavedAccesses(*TTI) && 2765 "masked interleaved groups are not allowed."); 2766 Value *GroupMask = MaskForGaps; 2767 if (BlockInMask) { 2768 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2769 Value *ShuffledMask = Builder.CreateShuffleVector( 2770 BlockInMaskPart, 2771 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2772 "interleaved.mask"); 2773 GroupMask = MaskForGaps 2774 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2775 MaskForGaps) 2776 : ShuffledMask; 2777 } 2778 NewLoad = 2779 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2780 GroupMask, PoisonVec, "wide.masked.vec"); 2781 } 2782 else 2783 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2784 Group->getAlign(), "wide.vec"); 2785 Group->addMetadata(NewLoad); 2786 NewLoads.push_back(NewLoad); 2787 } 2788 2789 // For each member in the group, shuffle out the appropriate data from the 2790 // wide loads. 2791 unsigned J = 0; 2792 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2793 Instruction *Member = Group->getMember(I); 2794 2795 // Skip the gaps in the group. 2796 if (!Member) 2797 continue; 2798 2799 auto StrideMask = 2800 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2801 for (unsigned Part = 0; Part < UF; Part++) { 2802 Value *StridedVec = Builder.CreateShuffleVector( 2803 NewLoads[Part], StrideMask, "strided.vec"); 2804 2805 // If this member has different type, cast the result type. 2806 if (Member->getType() != ScalarTy) { 2807 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2808 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2809 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2810 } 2811 2812 if (Group->isReverse()) 2813 StridedVec = reverseVector(StridedVec); 2814 2815 State.set(VPDefs[J], StridedVec, Part); 2816 } 2817 ++J; 2818 } 2819 return; 2820 } 2821 2822 // The sub vector type for current instruction. 2823 auto *SubVT = VectorType::get(ScalarTy, VF); 2824 2825 // Vectorize the interleaved store group. 2826 for (unsigned Part = 0; Part < UF; Part++) { 2827 // Collect the stored vector from each member. 2828 SmallVector<Value *, 4> StoredVecs; 2829 for (unsigned i = 0; i < InterleaveFactor; i++) { 2830 // Interleaved store group doesn't allow a gap, so each index has a member 2831 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2832 2833 Value *StoredVec = State.get(StoredValues[i], Part); 2834 2835 if (Group->isReverse()) 2836 StoredVec = reverseVector(StoredVec); 2837 2838 // If this member has different type, cast it to a unified type. 
2839 2840 if (StoredVec->getType() != SubVT) 2841 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2842 2843 StoredVecs.push_back(StoredVec); 2844 } 2845 2846 // Concatenate all vectors into a wide vector. 2847 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2848 2849 // Interleave the elements in the wide vector. 2850 Value *IVec = Builder.CreateShuffleVector( 2851 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2852 "interleaved.vec"); 2853 2854 Instruction *NewStoreInstr; 2855 if (BlockInMask) { 2856 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2857 Value *ShuffledMask = Builder.CreateShuffleVector( 2858 BlockInMaskPart, 2859 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2860 "interleaved.mask"); 2861 NewStoreInstr = Builder.CreateMaskedStore( 2862 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2863 } 2864 else 2865 NewStoreInstr = 2866 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2867 2868 Group->addMetadata(NewStoreInstr); 2869 } 2870 } 2871 2872 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2873 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2874 VPValue *StoredValue, VPValue *BlockInMask) { 2875 // Attempt to issue a wide load. 2876 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2877 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2878 2879 assert((LI || SI) && "Invalid Load/Store instruction"); 2880 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2881 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2882 2883 LoopVectorizationCostModel::InstWidening Decision = 2884 Cost->getWideningDecision(Instr, VF); 2885 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2886 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2887 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2888 "CM decision is not to widen the memory instruction"); 2889 2890 Type *ScalarDataTy = getMemInstValueType(Instr); 2891 2892 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2893 const Align Alignment = getLoadStoreAlignment(Instr); 2894 2895 // Determine if the pointer operand of the access is either consecutive or 2896 // reverse consecutive. 2897 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2898 bool ConsecutiveStride = 2899 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2900 bool CreateGatherScatter = 2901 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2902 2903 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2904 // gather/scatter. Otherwise Decision should have been to Scalarize. 2905 assert((ConsecutiveStride || CreateGatherScatter) && 2906 "The instruction should be scalarized"); 2907 (void)ConsecutiveStride; 2908 2909 VectorParts BlockInMaskParts(UF); 2910 bool isMaskRequired = BlockInMask; 2911 if (isMaskRequired) 2912 for (unsigned Part = 0; Part < UF; ++Part) 2913 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2914 2915 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2916 // Calculate the pointer for the specific unroll-part. 2917 GetElementPtrInst *PartPtr = nullptr; 2918 2919 bool InBounds = false; 2920 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2921 InBounds = gep->isInBounds(); 2922 if (Reverse) { 2923 // If the address is consecutive but reversed, then the 2924 // wide store needs to start at the last vector element. 
2925 // RunTimeVF = VScale * VF.getKnownMinValue() 2926 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2927 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2928 // NumElt = -Part * RunTimeVF 2929 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2930 // LastLane = 1 - RunTimeVF 2931 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2932 PartPtr = 2933 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2934 PartPtr->setIsInBounds(InBounds); 2935 PartPtr = cast<GetElementPtrInst>( 2936 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2937 PartPtr->setIsInBounds(InBounds); 2938 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2939 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2940 } else { 2941 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2942 PartPtr = cast<GetElementPtrInst>( 2943 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2944 PartPtr->setIsInBounds(InBounds); 2945 } 2946 2947 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2948 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2949 }; 2950 2951 // Handle Stores: 2952 if (SI) { 2953 setDebugLocFromInst(Builder, SI); 2954 2955 for (unsigned Part = 0; Part < UF; ++Part) { 2956 Instruction *NewSI = nullptr; 2957 Value *StoredVal = State.get(StoredValue, Part); 2958 if (CreateGatherScatter) { 2959 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2960 Value *VectorGep = State.get(Addr, Part); 2961 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2962 MaskPart); 2963 } else { 2964 if (Reverse) { 2965 // If we store to reverse consecutive memory locations, then we need 2966 // to reverse the order of elements in the stored value. 2967 StoredVal = reverseVector(StoredVal); 2968 // We don't want to update the value in the map as it might be used in 2969 // another expression. So don't call resetVectorValue(StoredVal). 2970 } 2971 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2972 if (isMaskRequired) 2973 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2974 BlockInMaskParts[Part]); 2975 else 2976 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2977 } 2978 addMetadata(NewSI, SI); 2979 } 2980 return; 2981 } 2982 2983 // Handle loads. 2984 assert(LI && "Must have a load instruction"); 2985 setDebugLocFromInst(Builder, LI); 2986 for (unsigned Part = 0; Part < UF; ++Part) { 2987 Value *NewLI; 2988 if (CreateGatherScatter) { 2989 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2990 Value *VectorGep = State.get(Addr, Part); 2991 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2992 nullptr, "wide.masked.gather"); 2993 addMetadata(NewLI, LI); 2994 } else { 2995 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2996 if (isMaskRequired) 2997 NewLI = Builder.CreateMaskedLoad( 2998 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2999 "wide.masked.load"); 3000 else 3001 NewLI = 3002 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3003 3004 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
3005 addMetadata(NewLI, LI);
3006 if (Reverse)
3007 NewLI = reverseVector(NewLI);
3008 }
3009
3010 State.set(Def, NewLI, Part);
3011 }
3012 }
3013
3014 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
3015 VPUser &User,
3016 const VPIteration &Instance,
3017 bool IfPredicateInstr,
3018 VPTransformState &State) {
3019 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3020
3021 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3022 // the first lane and part.
3023 if (isa<NoAliasScopeDeclInst>(Instr))
3024 if (!Instance.isFirstIteration())
3025 return;
3026
3027 setDebugLocFromInst(Builder, Instr);
3028
3029 // Does this instruction return a value?
3030 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3031
3032 Instruction *Cloned = Instr->clone();
3033 if (!IsVoidRetTy)
3034 Cloned->setName(Instr->getName() + ".cloned");
3035
3036 State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3037 Builder.GetInsertPoint());
3038 // Replace the operands of the cloned instruction with their scalar
3039 // equivalents in the new loop.
3040 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3041 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3042 auto InputInstance = Instance;
3043 if (!Operand || !OrigLoop->contains(Operand) ||
3044 (Cost->isUniformAfterVectorization(Operand, State.VF)))
3045 InputInstance.Lane = VPLane::getFirstLane();
3046 auto *NewOp = State.get(User.getOperand(op), InputInstance);
3047 Cloned->setOperand(op, NewOp);
3048 }
3049 addNewMetadata(Cloned, Instr);
3050
3051 // Place the cloned scalar in the new loop.
3052 Builder.Insert(Cloned);
3053
3054 State.set(Def, Cloned, Instance);
3055
3056 // If we just cloned a new assumption, add it to the assumption cache.
3057 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3058 AC->registerAssumption(II);
3059
3060 // End if-block.
3061 if (IfPredicateInstr)
3062 PredicatedInstructions.push_back(Cloned);
3063 }
3064
3065 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3066 Value *End, Value *Step,
3067 Instruction *DL) {
3068 BasicBlock *Header = L->getHeader();
3069 BasicBlock *Latch = L->getLoopLatch();
3070 // As we're just creating this loop, it's possible no latch exists
3071 // yet. If so, use the header as this will be a single-block loop.
3072 if (!Latch)
3073 Latch = Header;
3074
3075 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3076 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3077 setDebugLocFromInst(Builder, OldInst);
3078 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3079
3080 Builder.SetInsertPoint(Latch->getTerminator());
3081 setDebugLocFromInst(Builder, OldInst);
3082
3083 // Create i+1 and fill the PHINode.
3084 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3085 Induction->addIncoming(Start, L->getLoopPreheader());
3086 Induction->addIncoming(Next, Latch);
3087 // Create the compare.
3088 Value *ICmp = Builder.CreateICmpEQ(Next, End);
3089 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3090
3091 // Now we have two terminators. Remove the old one from the block.
3092 Latch->getTerminator()->eraseFromParent(); 3093 3094 return Induction; 3095 } 3096 3097 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3098 if (TripCount) 3099 return TripCount; 3100 3101 assert(L && "Create Trip Count for null loop."); 3102 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3103 // Find the loop boundaries. 3104 ScalarEvolution *SE = PSE.getSE(); 3105 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3106 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3107 "Invalid loop count"); 3108 3109 Type *IdxTy = Legal->getWidestInductionType(); 3110 assert(IdxTy && "No type for induction"); 3111 3112 // The exit count might have the type of i64 while the phi is i32. This can 3113 // happen if we have an induction variable that is sign extended before the 3114 // compare. The only way that we get a backedge taken count is that the 3115 // induction variable was signed and as such will not overflow. In such a case 3116 // truncation is legal. 3117 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3118 IdxTy->getPrimitiveSizeInBits()) 3119 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3120 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3121 3122 // Get the total trip count from the count by adding 1. 3123 const SCEV *ExitCount = SE->getAddExpr( 3124 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3125 3126 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3127 3128 // Expand the trip count and place the new instructions in the preheader. 3129 // Notice that the pre-header does not change, only the loop body. 3130 SCEVExpander Exp(*SE, DL, "induction"); 3131 3132 // Count holds the overall loop count (N). 3133 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3134 L->getLoopPreheader()->getTerminator()); 3135 3136 if (TripCount->getType()->isPointerTy()) 3137 TripCount = 3138 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3139 L->getLoopPreheader()->getTerminator()); 3140 3141 return TripCount; 3142 } 3143 3144 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3145 if (VectorTripCount) 3146 return VectorTripCount; 3147 3148 Value *TC = getOrCreateTripCount(L); 3149 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3150 3151 Type *Ty = TC->getType(); 3152 // This is where we can make the step a runtime constant. 3153 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3154 3155 // If the tail is to be folded by masking, round the number of iterations N 3156 // up to a multiple of Step instead of rounding down. This is done by first 3157 // adding Step-1 and then rounding down. Note that it's ok if this addition 3158 // overflows: the vector induction variable will eventually wrap to zero given 3159 // that it starts at zero and its Step is a power of two; the loop will then 3160 // exit, with the last early-exit vector comparison also producing all-true. 3161 if (Cost->foldTailByMasking()) { 3162 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3163 "VF*UF must be a power of 2 when folding tail by masking"); 3164 assert(!VF.isScalable() && 3165 "Tail folding not yet supported for scalable vectors"); 3166 TC = Builder.CreateAdd( 3167 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3168 } 3169 3170 // Now we need to generate the expression for the part of the loop that the 3171 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3172 // iterations are not required for correctness, or N - Step, otherwise. Step 3173 // is equal to the vectorization factor (number of SIMD elements) times the 3174 // unroll factor (number of SIMD instructions). 3175 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3176 3177 // There are two cases where we need to ensure (at least) the last iteration 3178 // runs in the scalar remainder loop. Thus, if the step evenly divides 3179 // the trip count, we set the remainder to be equal to the step. If the step 3180 // does not evenly divide the trip count, no adjustment is necessary since 3181 // there will already be scalar iterations. Note that the minimum iterations 3182 // check ensures that N >= Step. The cases are: 3183 // 1) If there is a non-reversed interleaved group that may speculatively 3184 // access memory out-of-bounds. 3185 // 2) If any instruction may follow a conditionally taken exit. That is, if 3186 // the loop contains multiple exiting blocks, or a single exiting block 3187 // which is not the latch. 3188 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3189 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3190 R = Builder.CreateSelect(IsZero, Step, R); 3191 } 3192 3193 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3194 3195 return VectorTripCount; 3196 } 3197 3198 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3199 const DataLayout &DL) { 3200 // Verify that V is a vector type with same number of elements as DstVTy. 3201 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3202 unsigned VF = DstFVTy->getNumElements(); 3203 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3204 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3205 Type *SrcElemTy = SrcVecTy->getElementType(); 3206 Type *DstElemTy = DstFVTy->getElementType(); 3207 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3208 "Vector elements must have same size"); 3209 3210 // Do a direct cast if element types are castable. 3211 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3212 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3213 } 3214 // V cannot be directly casted to desired vector type. 3215 // May happen when V is a floating point vector but DstVTy is a vector of 3216 // pointers or vice-versa. Handle this using a two-step bitcast using an 3217 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3218 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3219 "Only one type should be a pointer type"); 3220 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3221 "Only one type should be a floating point type"); 3222 Type *IntTy = 3223 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3224 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3225 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3226 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3227 } 3228 3229 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3230 BasicBlock *Bypass) { 3231 Value *Count = getOrCreateTripCount(L); 3232 // Reuse existing vector loop preheader for TC checks. 3233 // Note that new preheader block is generated for vector loop. 
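// For example (a sketch, assuming VF = 4, UF = 2, no tail folding and no
// required scalar epilogue), the check emitted below is roughly:
//   %min.iters.check = icmp ult i64 %trip.count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph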
3234 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3235 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3236 3237 // Generate code to check if the loop's trip count is less than VF * UF, or 3238 // equal to it in case a scalar epilogue is required; this implies that the 3239 // vector trip count is zero. This check also covers the case where adding one 3240 // to the backedge-taken count overflowed leading to an incorrect trip count 3241 // of zero. In this case we will also jump to the scalar loop. 3242 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3243 : ICmpInst::ICMP_ULT; 3244 3245 // If tail is to be folded, vector loop takes care of all iterations. 3246 Value *CheckMinIters = Builder.getFalse(); 3247 if (!Cost->foldTailByMasking()) { 3248 Value *Step = 3249 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3250 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3251 } 3252 // Create new preheader for vector loop. 3253 LoopVectorPreHeader = 3254 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3255 "vector.ph"); 3256 3257 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3258 DT->getNode(Bypass)->getIDom()) && 3259 "TC check is expected to dominate Bypass"); 3260 3261 // Update dominator for Bypass & LoopExit. 3262 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3263 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3264 3265 ReplaceInstWithInst( 3266 TCCheckBlock->getTerminator(), 3267 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3268 LoopBypassBlocks.push_back(TCCheckBlock); 3269 } 3270 3271 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3272 3273 BasicBlock *const SCEVCheckBlock = 3274 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3275 if (!SCEVCheckBlock) 3276 return nullptr; 3277 3278 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3279 (OptForSizeBasedOnProfile && 3280 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3281 "Cannot SCEV check stride or overflow when optimizing for size"); 3282 3283 3284 // Update dominator only if this is first RT check. 3285 if (LoopBypassBlocks.empty()) { 3286 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3287 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3288 } 3289 3290 LoopBypassBlocks.push_back(SCEVCheckBlock); 3291 AddedSafetyChecks = true; 3292 return SCEVCheckBlock; 3293 } 3294 3295 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3296 BasicBlock *Bypass) { 3297 // VPlan-native path does not do any analysis for runtime checks currently. 3298 if (EnableVPlanNativePath) 3299 return nullptr; 3300 3301 BasicBlock *const MemCheckBlock = 3302 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3303 3304 // Check if we generated code that checks in runtime if arrays overlap. We put 3305 // the checks into a separate block to make the more common case of few 3306 // elements faster. 
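// (For reference, a sketch of one such overlap check, assuming two i32
// arrays %a and %b accessed with unit stride over %n iterations:
//   %a.end  = getelementptr i32, i32* %a, i64 %n
//   %b.end  = getelementptr i32, i32* %b, i64 %n
//   %bound0 = icmp ult i32* %a, %b.end
//   %bound1 = icmp ult i32* %b, %a.end
//   %found.conflict = and i1 %bound0, %bound1
// The vector loop is only entered when %found.conflict is false.)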
3307 if (!MemCheckBlock)
3308 return nullptr;
3309
3310 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3311 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3312 "Cannot emit memory checks when optimizing for size, unless forced "
3313 "to vectorize.");
3314 ORE->emit([&]() {
3315 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3316 L->getStartLoc(), L->getHeader())
3317 << "Code-size may be reduced by not forcing "
3318 "vectorization, or by source-code modifications "
3319 "eliminating the need for runtime checks "
3320 "(e.g., adding 'restrict').";
3321 });
3322 }
3323
3324 LoopBypassBlocks.push_back(MemCheckBlock);
3325
3326 AddedSafetyChecks = true;
3327
3328 // We currently don't use LoopVersioning for the actual loop cloning but we
3329 // still use it to add the noalias metadata.
3330 LVer = std::make_unique<LoopVersioning>(
3331 *Legal->getLAI(),
3332 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3333 DT, PSE.getSE());
3334 LVer->prepareNoAliasMetadata();
3335 return MemCheckBlock;
3336 }
3337
3338 Value *InnerLoopVectorizer::emitTransformedIndex(
3339 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3340 const InductionDescriptor &ID) const {
3341
3342 SCEVExpander Exp(*SE, DL, "induction");
3343 auto Step = ID.getStep();
3344 auto StartValue = ID.getStartValue();
3345 assert(Index->getType()->getScalarType() == Step->getType() &&
3346 "Index scalar type does not match StepValue type");
3347
3348 // Note: the IR at this point is broken. We cannot use SE to create any new
3349 // SCEV and then expand it, hoping that SCEV's simplification will give us
3350 // more optimal code. Unfortunately, attempting to do so on invalid IR may
3351 // lead to various SCEV crashes. So all we can do is use the builder and rely
3352 // on InstCombine for future simplifications. Here we handle some trivial
3353 // cases only.
3354 auto CreateAdd = [&B](Value *X, Value *Y) {
3355 assert(X->getType() == Y->getType() && "Types don't match!");
3356 if (auto *CX = dyn_cast<ConstantInt>(X))
3357 if (CX->isZero())
3358 return Y;
3359 if (auto *CY = dyn_cast<ConstantInt>(Y))
3360 if (CY->isZero())
3361 return X;
3362 return B.CreateAdd(X, Y);
3363 };
3364
3365 // We allow X to be a vector type, in which case Y will potentially be
3366 // splatted into a vector with the same element count.
3367 auto CreateMul = [&B](Value *X, Value *Y) {
3368 assert(X->getType()->getScalarType() == Y->getType() &&
3369 "Types don't match!");
3370 if (auto *CX = dyn_cast<ConstantInt>(X))
3371 if (CX->isOne())
3372 return Y;
3373 if (auto *CY = dyn_cast<ConstantInt>(Y))
3374 if (CY->isOne())
3375 return X;
3376 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3377 if (XVTy && !isa<VectorType>(Y->getType()))
3378 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3379 return B.CreateMul(X, Y);
3380 };
3381
3382 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3383 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3384 // the DomTree is not kept up-to-date for additional blocks generated in the
3385 // vector loop. By using the header as insertion point, we guarantee that the
3386 // expanded instructions dominate all their uses.
3387 auto GetInsertPoint = [this, &B]() { 3388 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3389 if (InsertBB != LoopVectorBody && 3390 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3391 return LoopVectorBody->getTerminator(); 3392 return &*B.GetInsertPoint(); 3393 }; 3394 3395 switch (ID.getKind()) { 3396 case InductionDescriptor::IK_IntInduction: { 3397 assert(!isa<VectorType>(Index->getType()) && 3398 "Vector indices not supported for integer inductions yet"); 3399 assert(Index->getType() == StartValue->getType() && 3400 "Index type does not match StartValue type"); 3401 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3402 return B.CreateSub(StartValue, Index); 3403 auto *Offset = CreateMul( 3404 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3405 return CreateAdd(StartValue, Offset); 3406 } 3407 case InductionDescriptor::IK_PtrInduction: { 3408 assert(isa<SCEVConstant>(Step) && 3409 "Expected constant step for pointer induction"); 3410 return B.CreateGEP( 3411 StartValue->getType()->getPointerElementType(), StartValue, 3412 CreateMul(Index, 3413 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3414 GetInsertPoint()))); 3415 } 3416 case InductionDescriptor::IK_FpInduction: { 3417 assert(!isa<VectorType>(Index->getType()) && 3418 "Vector indices not supported for FP inductions yet"); 3419 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3420 auto InductionBinOp = ID.getInductionBinOp(); 3421 assert(InductionBinOp && 3422 (InductionBinOp->getOpcode() == Instruction::FAdd || 3423 InductionBinOp->getOpcode() == Instruction::FSub) && 3424 "Original bin op should be defined for FP induction"); 3425 3426 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3427 Value *MulExp = B.CreateFMul(StepValue, Index); 3428 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3429 "induction"); 3430 } 3431 case InductionDescriptor::IK_NoInduction: 3432 return nullptr; 3433 } 3434 llvm_unreachable("invalid enum"); 3435 } 3436 3437 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3438 LoopScalarBody = OrigLoop->getHeader(); 3439 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3440 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3441 assert(LoopExitBlock && "Must have an exit block"); 3442 assert(LoopVectorPreHeader && "Invalid loop structure"); 3443 3444 LoopMiddleBlock = 3445 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3446 LI, nullptr, Twine(Prefix) + "middle.block"); 3447 LoopScalarPreHeader = 3448 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3449 nullptr, Twine(Prefix) + "scalar.ph"); 3450 3451 // Set up branch from middle block to the exit and scalar preheader blocks. 3452 // completeLoopSkeleton will update the condition to use an iteration check, 3453 // if required to decide whether to execute the remainder. 3454 BranchInst *BrInst = 3455 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3456 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3457 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3458 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3459 3460 // We intentionally don't let SplitBlock to update LoopInfo since 3461 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3462 // LoopVectorBody is explicitly added to the correct place few lines later. 
3463 LoopVectorBody = 3464 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3465 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3466 3467 // Update dominator for loop exit. 3468 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3469 3470 // Create and register the new vector loop. 3471 Loop *Lp = LI->AllocateLoop(); 3472 Loop *ParentLoop = OrigLoop->getParentLoop(); 3473 3474 // Insert the new loop into the loop nest and register the new basic blocks 3475 // before calling any utilities such as SCEV that require valid LoopInfo. 3476 if (ParentLoop) { 3477 ParentLoop->addChildLoop(Lp); 3478 } else { 3479 LI->addTopLevelLoop(Lp); 3480 } 3481 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3482 return Lp; 3483 } 3484 3485 void InnerLoopVectorizer::createInductionResumeValues( 3486 Loop *L, Value *VectorTripCount, 3487 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3488 assert(VectorTripCount && L && "Expected valid arguments"); 3489 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3490 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3491 "Inconsistent information about additional bypass."); 3492 // We are going to resume the execution of the scalar loop. 3493 // Go over all of the induction variables that we found and fix the 3494 // PHIs that are left in the scalar version of the loop. 3495 // The starting values of PHI nodes depend on the counter of the last 3496 // iteration in the vectorized loop. 3497 // If we come from a bypass edge then we need to start from the original 3498 // start value. 3499 for (auto &InductionEntry : Legal->getInductionVars()) { 3500 PHINode *OrigPhi = InductionEntry.first; 3501 InductionDescriptor II = InductionEntry.second; 3502 3503 // Create phi nodes to merge from the backedge-taken check block. 3504 PHINode *BCResumeVal = 3505 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3506 LoopScalarPreHeader->getTerminator()); 3507 // Copy original phi DL over to the new one. 3508 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3509 Value *&EndValue = IVEndValues[OrigPhi]; 3510 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3511 if (OrigPhi == OldInduction) { 3512 // We know what the end value is. 3513 EndValue = VectorTripCount; 3514 } else { 3515 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3516 3517 // Fast-math-flags propagate from the original induction instruction. 3518 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3519 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3520 3521 Type *StepType = II.getStep()->getType(); 3522 Instruction::CastOps CastOp = 3523 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3524 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3525 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3526 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3527 EndValue->setName("ind.end"); 3528 3529 // Compute the end value for the additional bypass (if applicable). 
3530 if (AdditionalBypass.first) { 3531 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3532 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3533 StepType, true); 3534 CRD = 3535 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3536 EndValueFromAdditionalBypass = 3537 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3538 EndValueFromAdditionalBypass->setName("ind.end"); 3539 } 3540 } 3541 // The new PHI merges the original incoming value, in case of a bypass, 3542 // or the value at the end of the vectorized loop. 3543 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3544 3545 // Fix the scalar body counter (PHI node). 3546 // The old induction's phi node in the scalar body needs the truncated 3547 // value. 3548 for (BasicBlock *BB : LoopBypassBlocks) 3549 BCResumeVal->addIncoming(II.getStartValue(), BB); 3550 3551 if (AdditionalBypass.first) 3552 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3553 EndValueFromAdditionalBypass); 3554 3555 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3556 } 3557 } 3558 3559 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3560 MDNode *OrigLoopID) { 3561 assert(L && "Expected valid loop."); 3562 3563 // The trip counts should be cached by now. 3564 Value *Count = getOrCreateTripCount(L); 3565 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3566 3567 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3568 3569 // Add a check in the middle block to see if we have completed 3570 // all of the iterations in the first vector loop. 3571 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3572 // If tail is to be folded, we know we don't need to run the remainder. 3573 if (!Cost->foldTailByMasking()) { 3574 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3575 Count, VectorTripCount, "cmp.n", 3576 LoopMiddleBlock->getTerminator()); 3577 3578 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3579 // of the corresponding compare because they may have ended up with 3580 // different line numbers and we want to avoid awkward line stepping while 3581 // debugging. Eg. if the compare has got a line number inside the loop. 3582 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3583 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3584 } 3585 3586 // Get ready to start creating new instructions into the vectorized body. 3587 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3588 "Inconsistent vector loop preheader"); 3589 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3590 3591 Optional<MDNode *> VectorizedLoopID = 3592 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3593 LLVMLoopVectorizeFollowupVectorized}); 3594 if (VectorizedLoopID.hasValue()) { 3595 L->setLoopID(VectorizedLoopID.getValue()); 3596 3597 // Do not setAlreadyVectorized if loop attributes have been defined 3598 // explicitly. 3599 return LoopVectorPreHeader; 3600 } 3601 3602 // Keep all loop hints from the original loop on the vector loop (we'll 3603 // replace the vectorizer-specific hints below). 
3604 if (MDNode *LID = OrigLoop->getLoopID()) 3605 L->setLoopID(LID); 3606 3607 LoopVectorizeHints Hints(L, true, *ORE); 3608 Hints.setAlreadyVectorized(); 3609 3610 #ifdef EXPENSIVE_CHECKS 3611 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3612 LI->verify(*DT); 3613 #endif 3614 3615 return LoopVectorPreHeader; 3616 } 3617 3618 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3619 /* 3620 In this function we generate a new loop. The new loop will contain 3621 the vectorized instructions while the old loop will continue to run the 3622 scalar remainder. 3623 3624 [ ] <-- loop iteration number check. 3625 / | 3626 / v 3627 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3628 | / | 3629 | / v 3630 || [ ] <-- vector pre header. 3631 |/ | 3632 | v 3633 | [ ] \ 3634 | [ ]_| <-- vector loop. 3635 | | 3636 | v 3637 | -[ ] <--- middle-block. 3638 | / | 3639 | / v 3640 -|- >[ ] <--- new preheader. 3641 | | 3642 | v 3643 | [ ] \ 3644 | [ ]_| <-- old scalar loop to handle remainder. 3645 \ | 3646 \ v 3647 >[ ] <-- exit block. 3648 ... 3649 */ 3650 3651 // Get the metadata of the original loop before it gets modified. 3652 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3653 3654 // Workaround! Compute the trip count of the original loop and cache it 3655 // before we start modifying the CFG. This code has a systemic problem 3656 // wherein it tries to run analysis over partially constructed IR; this is 3657 // wrong, and not simply for SCEV. The trip count of the original loop 3658 // simply happens to be prone to hitting this in practice. In theory, we 3659 // can hit the same issue for any SCEV, or ValueTracking query done during 3660 // mutation. See PR49900. 3661 getOrCreateTripCount(OrigLoop); 3662 3663 // Create an empty vector loop, and prepare basic blocks for the runtime 3664 // checks. 3665 Loop *Lp = createVectorLoopSkeleton(""); 3666 3667 // Now, compare the new count to zero. If it is zero skip the vector loop and 3668 // jump to the scalar loop. This check also covers the case where the 3669 // backedge-taken count is uint##_max: adding one to it will overflow leading 3670 // to an incorrect trip count of zero. In this (rare) case we will also jump 3671 // to the scalar loop. 3672 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3673 3674 // Generate the code to check any assumptions that we've made for SCEV 3675 // expressions. 3676 emitSCEVChecks(Lp, LoopScalarPreHeader); 3677 3678 // Generate the code that checks in runtime if arrays overlap. We put the 3679 // checks into a separate block to make the more common case of few elements 3680 // faster. 3681 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3682 3683 // Some loops have a single integer induction variable, while other loops 3684 // don't. One example is c++ iterators that often have multiple pointer 3685 // induction variables. In the code below we also support a case where we 3686 // don't have a single induction variable. 3687 // 3688 // We try to obtain an induction variable from the original loop as hard 3689 // as possible. However if we don't find one that: 3690 // - is an integer 3691 // - counts from zero, stepping by one 3692 // - is the size of the widest induction variable type 3693 // then we create a new one. 
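// For example (a sketch): in
//   for (int64_t i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
// the counter 'i' satisfies all of the above and is used as the primary
// induction, whereas a loop that only advances pointer inductions (e.g. C++
// iterators) has no such candidate and a fresh counter starting at zero is
// created instead.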
3694 OldInduction = Legal->getPrimaryInduction(); 3695 Type *IdxTy = Legal->getWidestInductionType(); 3696 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3697 // The loop step is equal to the vectorization factor (num of SIMD elements) 3698 // times the unroll factor (num of SIMD instructions). 3699 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3700 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3701 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3702 Induction = 3703 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3704 getDebugLocFromInstOrOperands(OldInduction)); 3705 3706 // Emit phis for the new starting index of the scalar loop. 3707 createInductionResumeValues(Lp, CountRoundDown); 3708 3709 return completeLoopSkeleton(Lp, OrigLoopID); 3710 } 3711 3712 // Fix up external users of the induction variable. At this point, we are 3713 // in LCSSA form, with all external PHIs that use the IV having one input value, 3714 // coming from the remainder loop. We need those PHIs to also have a correct 3715 // value for the IV when arriving directly from the middle block. 3716 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3717 const InductionDescriptor &II, 3718 Value *CountRoundDown, Value *EndValue, 3719 BasicBlock *MiddleBlock) { 3720 // There are two kinds of external IV usages - those that use the value 3721 // computed in the last iteration (the PHI) and those that use the penultimate 3722 // value (the value that feeds into the phi from the loop latch). 3723 // We allow both, but they, obviously, have different values. 3724 3725 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3726 3727 DenseMap<Value *, Value *> MissingVals; 3728 3729 // An external user of the last iteration's value should see the value that 3730 // the remainder loop uses to initialize its own IV. 3731 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3732 for (User *U : PostInc->users()) { 3733 Instruction *UI = cast<Instruction>(U); 3734 if (!OrigLoop->contains(UI)) { 3735 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3736 MissingVals[UI] = EndValue; 3737 } 3738 } 3739 3740 // An external user of the penultimate value need to see EndValue - Step. 3741 // The simplest way to get this is to recompute it from the constituent SCEVs, 3742 // that is Start + (Step * (CRD - 1)). 3743 for (User *U : OrigPhi->users()) { 3744 auto *UI = cast<Instruction>(U); 3745 if (!OrigLoop->contains(UI)) { 3746 const DataLayout &DL = 3747 OrigLoop->getHeader()->getModule()->getDataLayout(); 3748 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3749 3750 IRBuilder<> B(MiddleBlock->getTerminator()); 3751 3752 // Fast-math-flags propagate from the original induction instruction. 3753 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3754 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3755 3756 Value *CountMinusOne = B.CreateSub( 3757 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3758 Value *CMO = 3759 !II.getStep()->getType()->isIntegerTy() 3760 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3761 II.getStep()->getType()) 3762 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3763 CMO->setName("cast.cmo"); 3764 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3765 Escape->setName("ind.escape"); 3766 MissingVals[UI] = Escape; 3767 } 3768 } 3769 3770 for (auto &I : MissingVals) { 3771 PHINode *PHI = cast<PHINode>(I.first); 3772 // One corner case we have to handle is two IVs "chasing" each-other, 3773 // that is %IV2 = phi [...], [ %IV1, %latch ] 3774 // In this case, if IV1 has an external use, we need to avoid adding both 3775 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3776 // don't already have an incoming value for the middle block. 3777 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3778 PHI->addIncoming(I.second, MiddleBlock); 3779 } 3780 } 3781 3782 namespace { 3783 3784 struct CSEDenseMapInfo { 3785 static bool canHandle(const Instruction *I) { 3786 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3787 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3788 } 3789 3790 static inline Instruction *getEmptyKey() { 3791 return DenseMapInfo<Instruction *>::getEmptyKey(); 3792 } 3793 3794 static inline Instruction *getTombstoneKey() { 3795 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3796 } 3797 3798 static unsigned getHashValue(const Instruction *I) { 3799 assert(canHandle(I) && "Unknown instruction!"); 3800 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3801 I->value_op_end())); 3802 } 3803 3804 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3805 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3806 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3807 return LHS == RHS; 3808 return LHS->isIdenticalTo(RHS); 3809 } 3810 }; 3811 3812 } // end anonymous namespace 3813 3814 ///Perform cse of induction variable instructions. 3815 static void cse(BasicBlock *BB) { 3816 // Perform simple cse. 3817 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3818 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3819 Instruction *In = &*I++; 3820 3821 if (!CSEDenseMapInfo::canHandle(In)) 3822 continue; 3823 3824 // Check if we can replace this instruction with any of the 3825 // visited instructions. 3826 if (Instruction *V = CSEMap.lookup(In)) { 3827 In->replaceAllUsesWith(V); 3828 In->eraseFromParent(); 3829 continue; 3830 } 3831 3832 CSEMap[In] = In; 3833 } 3834 } 3835 3836 InstructionCost 3837 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3838 bool &NeedToScalarize) const { 3839 Function *F = CI->getCalledFunction(); 3840 Type *ScalarRetTy = CI->getType(); 3841 SmallVector<Type *, 4> Tys, ScalarTys; 3842 for (auto &ArgOp : CI->arg_operands()) 3843 ScalarTys.push_back(ArgOp->getType()); 3844 3845 // Estimate cost of scalarized vector call. The source operands are assumed 3846 // to be vectors, so we need to extract individual elements from there, 3847 // execute VF scalar calls, and then gather the result into the vector return 3848 // value. 3849 InstructionCost ScalarCallCost = 3850 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3851 if (VF.isScalar()) 3852 return ScalarCallCost; 3853 3854 // Compute corresponding vector type for return value and arguments. 
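// E.g. (a sketch): for a call to 'float @foo(float, i32)' and VF = 4 this
// yields a return type of <4 x float> and argument types <4 x float> and
// <4 x i32>.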
3855 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3856 for (Type *ScalarTy : ScalarTys) 3857 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3858 3859 // Compute costs of unpacking argument values for the scalar calls and 3860 // packing the return values to a vector. 3861 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3862 3863 InstructionCost Cost = 3864 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3865 3866 // If we can't emit a vector call for this function, then the currently found 3867 // cost is the cost we need to return. 3868 NeedToScalarize = true; 3869 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3870 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3871 3872 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3873 return Cost; 3874 3875 // If the corresponding vector cost is cheaper, return its cost. 3876 InstructionCost VectorCallCost = 3877 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3878 if (VectorCallCost < Cost) { 3879 NeedToScalarize = false; 3880 Cost = VectorCallCost; 3881 } 3882 return Cost; 3883 } 3884 3885 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3886 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3887 return Elt; 3888 return VectorType::get(Elt, VF); 3889 } 3890 3891 InstructionCost 3892 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3893 ElementCount VF) const { 3894 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3895 assert(ID && "Expected intrinsic call!"); 3896 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3897 FastMathFlags FMF; 3898 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3899 FMF = FPMO->getFastMathFlags(); 3900 3901 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3902 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3903 SmallVector<Type *> ParamTys; 3904 std::transform(FTy->param_begin(), FTy->param_end(), 3905 std::back_inserter(ParamTys), 3906 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3907 3908 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3909 dyn_cast<IntrinsicInst>(CI)); 3910 return TTI.getIntrinsicInstrCost(CostAttrs, 3911 TargetTransformInfo::TCK_RecipThroughput); 3912 } 3913 3914 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3915 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3916 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3917 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3918 } 3919 3920 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3921 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3922 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3923 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3924 } 3925 3926 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3927 // For every instruction `I` in MinBWs, truncate the operands, create a 3928 // truncated version of `I` and reextend its result. InstCombine runs 3929 // later and will remove any ext/trunc pairs. 3930 SmallPtrSet<Value *, 4> Erased; 3931 for (const auto &KV : Cost->getMinimalBitwidths()) { 3932 // If the value wasn't vectorized, we must maintain the original scalar 3933 // type. The absence of the value from State indicates that it 3934 // wasn't vectorized. 
3935 VPValue *Def = State.Plan->getVPValue(KV.first); 3936 if (!State.hasAnyVectorValue(Def)) 3937 continue; 3938 for (unsigned Part = 0; Part < UF; ++Part) { 3939 Value *I = State.get(Def, Part); 3940 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3941 continue; 3942 Type *OriginalTy = I->getType(); 3943 Type *ScalarTruncatedTy = 3944 IntegerType::get(OriginalTy->getContext(), KV.second); 3945 auto *TruncatedTy = FixedVectorType::get( 3946 ScalarTruncatedTy, 3947 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3948 if (TruncatedTy == OriginalTy) 3949 continue; 3950 3951 IRBuilder<> B(cast<Instruction>(I)); 3952 auto ShrinkOperand = [&](Value *V) -> Value * { 3953 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3954 if (ZI->getSrcTy() == TruncatedTy) 3955 return ZI->getOperand(0); 3956 return B.CreateZExtOrTrunc(V, TruncatedTy); 3957 }; 3958 3959 // The actual instruction modification depends on the instruction type, 3960 // unfortunately. 3961 Value *NewI = nullptr; 3962 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3963 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3964 ShrinkOperand(BO->getOperand(1))); 3965 3966 // Any wrapping introduced by shrinking this operation shouldn't be 3967 // considered undefined behavior. So, we can't unconditionally copy 3968 // arithmetic wrapping flags to NewI. 3969 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3970 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3971 NewI = 3972 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3973 ShrinkOperand(CI->getOperand(1))); 3974 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3975 NewI = B.CreateSelect(SI->getCondition(), 3976 ShrinkOperand(SI->getTrueValue()), 3977 ShrinkOperand(SI->getFalseValue())); 3978 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3979 switch (CI->getOpcode()) { 3980 default: 3981 llvm_unreachable("Unhandled cast!"); 3982 case Instruction::Trunc: 3983 NewI = ShrinkOperand(CI->getOperand(0)); 3984 break; 3985 case Instruction::SExt: 3986 NewI = B.CreateSExtOrTrunc( 3987 CI->getOperand(0), 3988 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3989 break; 3990 case Instruction::ZExt: 3991 NewI = B.CreateZExtOrTrunc( 3992 CI->getOperand(0), 3993 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3994 break; 3995 } 3996 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3997 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3998 ->getNumElements(); 3999 auto *O0 = B.CreateZExtOrTrunc( 4000 SI->getOperand(0), 4001 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 4002 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 4003 ->getNumElements(); 4004 auto *O1 = B.CreateZExtOrTrunc( 4005 SI->getOperand(1), 4006 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 4007 4008 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4009 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4010 // Don't do anything with the operands, just extend the result. 
4011 continue; 4012 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4013 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 4014 ->getNumElements(); 4015 auto *O0 = B.CreateZExtOrTrunc( 4016 IE->getOperand(0), 4017 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4018 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4019 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4020 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4021 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 4022 ->getNumElements(); 4023 auto *O0 = B.CreateZExtOrTrunc( 4024 EE->getOperand(0), 4025 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4026 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4027 } else { 4028 // If we don't know what to do, be conservative and don't do anything. 4029 continue; 4030 } 4031 4032 // Lastly, extend the result. 4033 NewI->takeName(cast<Instruction>(I)); 4034 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4035 I->replaceAllUsesWith(Res); 4036 cast<Instruction>(I)->eraseFromParent(); 4037 Erased.insert(I); 4038 State.reset(Def, Res, Part); 4039 } 4040 } 4041 4042 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4043 for (const auto &KV : Cost->getMinimalBitwidths()) { 4044 // If the value wasn't vectorized, we must maintain the original scalar 4045 // type. The absence of the value from State indicates that it 4046 // wasn't vectorized. 4047 VPValue *Def = State.Plan->getVPValue(KV.first); 4048 if (!State.hasAnyVectorValue(Def)) 4049 continue; 4050 for (unsigned Part = 0; Part < UF; ++Part) { 4051 Value *I = State.get(Def, Part); 4052 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4053 if (Inst && Inst->use_empty()) { 4054 Value *NewI = Inst->getOperand(0); 4055 Inst->eraseFromParent(); 4056 State.reset(Def, NewI, Part); 4057 } 4058 } 4059 } 4060 } 4061 4062 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4063 // Insert truncates and extends for any truncated instructions as hints to 4064 // InstCombine. 4065 if (VF.isVector()) 4066 truncateToMinimalBitwidths(State); 4067 4068 // Fix widened non-induction PHIs by setting up the PHI operands. 4069 if (OrigPHIsToFix.size()) { 4070 assert(EnableVPlanNativePath && 4071 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4072 fixNonInductionPHIs(State); 4073 } 4074 4075 // At this point every instruction in the original loop is widened to a 4076 // vector form. Now we need to fix the recurrences in the loop. These PHI 4077 // nodes are currently empty because we did not want to introduce cycles. 4078 // This is the second stage of vectorizing recurrences. 4079 fixCrossIterationPHIs(State); 4080 4081 // Forget the original basic block. 4082 PSE.getSE()->forgetLoop(OrigLoop); 4083 4084 // Fix-up external users of the induction variables. 4085 for (auto &Entry : Legal->getInductionVars()) 4086 fixupIVUsers(Entry.first, Entry.second, 4087 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4088 IVEndValues[Entry.first], LoopMiddleBlock); 4089 4090 fixLCSSAPHIs(State); 4091 for (Instruction *PI : PredicatedInstructions) 4092 sinkScalarOperands(&*PI); 4093 4094 // Remove redundant induction instructions. 4095 cse(LoopVectorBody); 4096 4097 // Set/update profile weights for the vector and remainder loops as original 4098 // loop iterations are now distributed among them. Note that original loop 4099 // represented by LoopScalarBody becomes remainder loop after vectorization. 
4100 // 4101 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 4102 // end up with a slightly inaccurate result, but that should be OK since 4103 // the profile is not inherently precise anyway. Note also that a possible bypass of 4104 // the vector code caused by legality checks is ignored, assigning all the weight 4105 // to the vector loop, optimistically. 4106 // 4107 // For scalable vectorization we can't know at compile time how many iterations 4108 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4109 // vscale of '1'. 4110 setProfileInfoAfterUnrolling( 4111 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4112 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4113 } 4114 4115 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4116 // In order to support recurrences we need to be able to vectorize Phi nodes. 4117 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4118 // stage #2: We now need to fix the recurrences by adding incoming edges to 4119 // the currently empty PHI nodes. At this point every instruction in the 4120 // original loop is widened to a vector form so we can use them to construct 4121 // the incoming edges. 4122 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4123 for (VPRecipeBase &R : Header->phis()) { 4124 auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R); 4125 if (!PhiR) 4126 continue; 4127 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4128 if (PhiR->getRecurrenceDescriptor()) { 4129 fixReduction(PhiR, State); 4130 } else if (Legal->isFirstOrderRecurrence(OrigPhi)) 4131 fixFirstOrderRecurrence(OrigPhi, State); 4132 } 4133 } 4134 4135 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4136 VPTransformState &State) { 4137 // This is the second phase of vectorizing first-order recurrences. An 4138 // overview of the transformation is described below. Suppose we have the 4139 // following loop. 4140 // 4141 // for (int i = 0; i < n; ++i) 4142 // b[i] = a[i] - a[i - 1]; 4143 // 4144 // There is a first-order recurrence on "a". For this loop, the shorthand 4145 // scalar IR looks like: 4146 // 4147 // scalar.ph: 4148 // s_init = a[-1] 4149 // br scalar.body 4150 // 4151 // scalar.body: 4152 // i = phi [0, scalar.ph], [i+1, scalar.body] 4153 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4154 // s2 = a[i] 4155 // b[i] = s2 - s1 4156 // br cond, scalar.body, ... 4157 // 4158 // In this example, s1 is a recurrence because its value depends on the 4159 // previous iteration. In the first phase of vectorization, we created a 4160 // temporary value for s1. We now complete the vectorization and produce the 4161 // shorthand vector IR shown below (for VF = 4, UF = 1). 4162 // 4163 // vector.ph: 4164 // v_init = vector(..., ..., ..., a[-1]) 4165 // br vector.body 4166 // 4167 // vector.body 4168 // i = phi [0, vector.ph], [i+4, vector.body] 4169 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4170 // v2 = a[i, i+1, i+2, i+3]; 4171 // v3 = vector(v1(3), v2(0, 1, 2)) 4172 // b[i, i+1, i+2, i+3] = v2 - v3 4173 // br cond, vector.body, middle.block 4174 // 4175 // middle.block: 4176 // x = v2(3) 4177 // br scalar.ph 4178 // 4179 // scalar.ph: 4180 // s_init = phi [x, middle.block], [a[-1], otherwise] 4181 // br scalar.body 4182 // 4183 // After the vector loop completes execution, we extract the next value of 4184 // the recurrence (x) to use as the initial value in the scalar loop.
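// Editorial sketch, not from the original source: the names v2.0, v3.0, etc.
// are shorthand only. Extending the example above to UF = 2 (still VF = 4),
// the vector body conceptually contains two spliced vectors, with the second
// part reusing the first part of the widened previous value:
//
//   v3.0 = splice(v1,   v2.0, -1)   ; <v1[3],   v2.0[0..2]>
//   v3.1 = splice(v2.0, v2.1, -1)   ; <v2.0[3], v2.1[0..2]>
//
// and the recurrence phi's backedge value becomes v2.1, the last part. This
// mirrors the per-part loop below, where 'Incoming' is updated to the
// previous part after each iteration.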
4185 4186 // Get the original loop preheader and single loop latch. 4187 auto *Preheader = OrigLoop->getLoopPreheader(); 4188 auto *Latch = OrigLoop->getLoopLatch(); 4189 4190 // Get the initial and previous values of the scalar recurrence. 4191 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4192 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4193 4194 auto *IdxTy = Builder.getInt32Ty(); 4195 auto *One = ConstantInt::get(IdxTy, 1); 4196 4197 // Create a vector from the initial value. 4198 auto *VectorInit = ScalarInit; 4199 if (VF.isVector()) { 4200 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4201 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4202 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4203 VectorInit = Builder.CreateInsertElement( 4204 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), 4205 VectorInit, LastIdx, "vector.recur.init"); 4206 } 4207 4208 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4209 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4210 // We constructed a temporary phi node in the first phase of vectorization. 4211 // This phi node will eventually be deleted. 4212 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4213 4214 // Create a phi node for the new recurrence. The current value will either be 4215 // the initial value inserted into a vector or loop-varying vector value. 4216 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4217 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4218 4219 // Get the vectorized previous value of the last part UF - 1. It appears last 4220 // among all unrolled iterations, due to the order of their construction. 4221 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4222 4223 // Find and set the insertion point after the previous value if it is an 4224 // instruction. 4225 BasicBlock::iterator InsertPt; 4226 // Note that the previous value may have been constant-folded so it is not 4227 // guaranteed to be an instruction in the vector loop. 4228 // FIXME: Loop invariant values do not form recurrences. We should deal with 4229 // them earlier. 4230 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4231 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4232 else { 4233 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4234 if (isa<PHINode>(PreviousLastPart)) 4235 // If the previous value is a phi node, we should insert after all the phi 4236 // nodes in the block containing the PHI to avoid breaking basic block 4237 // verification. Note that the basic block may be different to 4238 // LoopVectorBody, in case we predicate the loop. 4239 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4240 else 4241 InsertPt = ++PreviousInst->getIterator(); 4242 } 4243 Builder.SetInsertPoint(&*InsertPt); 4244 4245 // The vector from which to take the initial value for the current iteration 4246 // (actual or unrolled). Initially, this is the vector phi node. 4247 Value *Incoming = VecPhi; 4248 4249 // Shuffle the current and previous vector and update the vector parts. 4250 for (unsigned Part = 0; Part < UF; ++Part) { 4251 Value *PreviousPart = State.get(PreviousDef, Part); 4252 Value *PhiPart = State.get(PhiDef, Part); 4253 auto *Shuffle = VF.isVector() 4254 ? 
Builder.CreateVectorSplice(Incoming, PreviousPart, -1) 4255 : Incoming; 4256 PhiPart->replaceAllUsesWith(Shuffle); 4257 cast<Instruction>(PhiPart)->eraseFromParent(); 4258 State.reset(PhiDef, Shuffle, Part); 4259 Incoming = PreviousPart; 4260 } 4261 4262 // Fix the latch value of the new recurrence in the vector loop. 4263 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4264 4265 // Extract the last vector element in the middle block. This will be the 4266 // initial value for the recurrence when jumping to the scalar loop. 4267 auto *ExtractForScalar = Incoming; 4268 if (VF.isVector()) { 4269 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4270 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4271 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4272 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4273 "vector.recur.extract"); 4274 } 4275 // Extract the second last element in the middle block if the 4276 // Phi is used outside the loop. We need to extract the phi itself 4277 // and not the last element (the phi update in the current iteration). This 4278 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4279 // when the scalar loop is not run at all. 4280 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4281 if (VF.isVector()) { 4282 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4283 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4284 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4285 Incoming, Idx, "vector.recur.extract.for.phi"); 4286 } else if (UF > 1) 4287 // When loop is unrolled without vectorizing, initialize 4288 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4289 // of `Incoming`. This is analogous to the vectorized case above: extracting 4290 // the second last element when VF > 1. 4291 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4292 4293 // Fix the initial value of the original recurrence in the scalar loop. 4294 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4295 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4296 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4297 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4298 Start->addIncoming(Incoming, BB); 4299 } 4300 4301 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4302 Phi->setName("scalar.recur"); 4303 4304 // Finally, fix users of the recurrence outside the loop. The users will need 4305 // either the last value of the scalar recurrence or the last value of the 4306 // vector recurrence we extracted in the middle block. Since the loop is in 4307 // LCSSA form, we just need to find all the phi nodes for the original scalar 4308 // recurrence in the exit block, and then add an edge for the middle block. 4309 // Note that LCSSA does not imply single entry when the original scalar loop 4310 // had multiple exiting edges (as we always run the last iteration in the 4311 // scalar epilogue); in that case, the exiting path through middle will be 4312 // dynamically dead and the value picked for the phi doesn't matter. 
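// Editorial illustration only (names are shorthand, not actual IR): after
// this fix-up, an exit-block phi for the recurrence looks roughly like
//
//   exit.block:
//     s1.lcssa = phi [ s1, scalar.body ],
//                    [ vector.recur.extract.for.phi, middle.block ]
//
// i.e. the middle block contributes the second-to-last element extracted
// above, which is the value of the phi itself on the final vector iteration.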
4313 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4314 if (any_of(LCSSAPhi.incoming_values(), 4315 [Phi](Value *V) { return V == Phi; })) 4316 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4317 } 4318 4319 static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4320 return EnableStrictReductions && RdxDesc.isOrdered(); 4321 } 4322 4323 void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR, 4324 VPTransformState &State) { 4325 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4326 // Get its reduction variable descriptor. 4327 assert(Legal->isReductionVariable(OrigPhi) && 4328 "Unable to find the reduction variable"); 4329 RecurrenceDescriptor RdxDesc = *PhiR->getRecurrenceDescriptor(); 4330 4331 RecurKind RK = RdxDesc.getRecurrenceKind(); 4332 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4333 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4334 setDebugLocFromInst(Builder, ReductionStartValue); 4335 bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi); 4336 4337 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); 4338 // This is the vector-clone of the value that leaves the loop. 4339 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4340 4341 // Wrap flags are in general invalid after vectorization, clear them. 4342 clearReductionWrapFlags(RdxDesc, State); 4343 4344 // Fix the vector-loop phi. 4345 4346 // Reductions do not have to start at zero. They can start with 4347 // any loop invariant values. 4348 BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4349 4350 bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi && 4351 useOrderedReductions(RdxDesc); 4352 4353 for (unsigned Part = 0; Part < UF; ++Part) { 4354 if (IsOrdered && Part > 0) 4355 break; 4356 Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part); 4357 Value *Val = State.get(PhiR->getBackedgeValue(), Part); 4358 if (IsOrdered) 4359 Val = State.get(PhiR->getBackedgeValue(), UF - 1); 4360 4361 cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch); 4362 } 4363 4364 // Before each round, move the insertion point right between 4365 // the PHIs and the values we are going to write. 4366 // This allows us to write both PHINodes and the extractelement 4367 // instructions. 4368 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4369 4370 setDebugLocFromInst(Builder, LoopExitInst); 4371 4372 Type *PhiTy = OrigPhi->getType(); 4373 // If tail is folded by masking, the vector value to leave the loop should be 4374 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4375 // instead of the former. For an inloop reduction the reduction will already 4376 // be predicated, and does not need to be handled here.
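// Editorial sketch only (shorthand names, not actual IR): with tail folding
// the vector body is expected to already contain a pattern like
//
//   %rdx.next = add <VF x i32> %rdx.phi, %widened.val
//   %rdx.sel  = select <VF x i1> %mask, <VF x i32> %rdx.next,
//                                       <VF x i32> %rdx.phi
//
// and the code below redirects the value leaving the loop from %rdx.next to
// %rdx.sel, so lanes masked off in the final iteration keep the phi's value.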
4377 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4378 for (unsigned Part = 0; Part < UF; ++Part) { 4379 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4380 Value *Sel = nullptr; 4381 for (User *U : VecLoopExitInst->users()) { 4382 if (isa<SelectInst>(U)) { 4383 assert(!Sel && "Reduction exit feeding two selects"); 4384 Sel = U; 4385 } else 4386 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4387 } 4388 assert(Sel && "Reduction exit feeds no select"); 4389 State.reset(LoopExitInstDef, Sel, Part); 4390 4391 // If the target can create a predicated operator for the reduction at no 4392 // extra cost in the loop (for example a predicated vadd), it can be 4393 // cheaper for the select to remain in the loop than be sunk out of it, 4394 // and so use the select value for the phi instead of the old 4395 // LoopExitValue. 4396 if (PreferPredicatedReductionSelect || 4397 TTI->preferPredicatedReductionSelect( 4398 RdxDesc.getOpcode(), PhiTy, 4399 TargetTransformInfo::ReductionFlags())) { 4400 auto *VecRdxPhi = 4401 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4402 VecRdxPhi->setIncomingValueForBlock( 4403 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4404 } 4405 } 4406 } 4407 4408 // If the vector reduction can be performed in a smaller type, we truncate 4409 // then extend the loop exit value to enable InstCombine to evaluate the 4410 // entire expression in the smaller type. 4411 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4412 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4413 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4414 Builder.SetInsertPoint( 4415 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4416 VectorParts RdxParts(UF); 4417 for (unsigned Part = 0; Part < UF; ++Part) { 4418 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4419 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4420 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4421 : Builder.CreateZExt(Trunc, VecTy); 4422 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4423 UI != RdxParts[Part]->user_end();) 4424 if (*UI != Trunc) { 4425 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4426 RdxParts[Part] = Extnd; 4427 } else { 4428 ++UI; 4429 } 4430 } 4431 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4432 for (unsigned Part = 0; Part < UF; ++Part) { 4433 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4434 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4435 } 4436 } 4437 4438 // Reduce all of the unrolled parts into a single vector. 4439 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4440 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4441 4442 // The middle block terminator has already been assigned a DebugLoc here (the 4443 // OrigLoop's single latch terminator). We want the whole middle block to 4444 // appear to execute on this line because: (a) it is all compiler generated, 4445 // (b) these instructions are always executed after evaluating the latch 4446 // conditional branch, and (c) other passes may add new predecessors which 4447 // terminate on this line. This is the easiest way to ensure we don't 4448 // accidentally cause an extra step back into the loop while debugging. 
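// Editorial sketch only (shorthand): with UF = 3 and an integer add
// reduction, the part-combining loop below roughly leaves the middle block
// with
//
//   %bin.rdx  = add %rdx.part1, %rdx.part0
//   %bin.rdx2 = add %rdx.part2, %bin.rdx
//
// before a single horizontal reduction of the final vector is created
// further down (for reductions that are not performed in-loop).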
4449 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4450 if (IsOrdered) 4451 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4452 else { 4453 // Floating-point operations should have some FMF to enable the reduction. 4454 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4455 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4456 for (unsigned Part = 1; Part < UF; ++Part) { 4457 Value *RdxPart = State.get(LoopExitInstDef, Part); 4458 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4459 ReducedPartRdx = Builder.CreateBinOp( 4460 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4461 } else { 4462 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4463 } 4464 } 4465 } 4466 4467 // Create the reduction after the loop. Note that inloop reductions create the 4468 // target reduction in the loop using a Reduction recipe. 4469 if (VF.isVector() && !IsInLoopReductionPhi) { 4470 ReducedPartRdx = 4471 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4472 // If the reduction can be performed in a smaller type, we need to extend 4473 // the reduction to the wider type before we branch to the original loop. 4474 if (PhiTy != RdxDesc.getRecurrenceType()) 4475 ReducedPartRdx = RdxDesc.isSigned() 4476 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4477 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4478 } 4479 4480 // Create a phi node that merges control-flow from the backedge-taken check 4481 // block and the middle block. 4482 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4483 LoopScalarPreHeader->getTerminator()); 4484 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4485 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4486 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4487 4488 // Now, we need to fix the users of the reduction variable 4489 // inside and outside of the scalar remainder loop. 4490 4491 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4492 // in the exit blocks. See comment on analogous loop in 4493 // fixFirstOrderRecurrence for a more complete explanation of the logic. 4494 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4495 if (any_of(LCSSAPhi.incoming_values(), 4496 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4497 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4498 4499 // Fix the scalar loop reduction variable with the incoming reduction sum 4500 // from the vector body and from the backedge value. 4501 int IncomingEdgeBlockIdx = 4502 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4503 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4504 // Pick the other block. 4505 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 4506 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4507 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4508 } 4509 4510 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4511 VPTransformState &State) { 4512 RecurKind RK = RdxDesc.getRecurrenceKind(); 4513 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4514 return; 4515 4516 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4517 assert(LoopExitInstr && "null loop exit instruction"); 4518 SmallVector<Instruction *, 8> Worklist; 4519 SmallPtrSet<Instruction *, 8> Visited; 4520 Worklist.push_back(LoopExitInstr); 4521 Visited.insert(LoopExitInstr); 4522 4523 while (!Worklist.empty()) { 4524 Instruction *Cur = Worklist.pop_back_val(); 4525 if (isa<OverflowingBinaryOperator>(Cur)) 4526 for (unsigned Part = 0; Part < UF; ++Part) { 4527 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4528 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4529 } 4530 4531 for (User *U : Cur->users()) { 4532 Instruction *UI = cast<Instruction>(U); 4533 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4534 Visited.insert(UI).second) 4535 Worklist.push_back(UI); 4536 } 4537 } 4538 } 4539 4540 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4541 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4542 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4543 // Some phis were already hand updated by the reduction and recurrence 4544 // code above, leave them alone. 4545 continue; 4546 4547 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4548 // Non-instruction incoming values will have only one value. 4549 4550 VPLane Lane = VPLane::getFirstLane(); 4551 if (isa<Instruction>(IncomingValue) && 4552 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4553 VF)) 4554 Lane = VPLane::getLastLaneForVF(VF); 4555 4556 // Can be a loop invariant incoming value or the last scalar value to be 4557 // extracted from the vectorized loop. 4558 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4559 Value *lastIncomingValue = 4560 OrigLoop->isLoopInvariant(IncomingValue) 4561 ? IncomingValue 4562 : State.get(State.Plan->getVPValue(IncomingValue), 4563 VPIteration(UF - 1, Lane)); 4564 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4565 } 4566 } 4567 4568 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4569 // The basic block and loop containing the predicated instruction. 4570 auto *PredBB = PredInst->getParent(); 4571 auto *VectorLoop = LI->getLoopFor(PredBB); 4572 4573 // Initialize a worklist with the operands of the predicated instruction. 4574 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4575 4576 // Holds instructions that we need to analyze again. An instruction may be 4577 // reanalyzed if we don't yet know if we can sink it or not. 4578 SmallVector<Instruction *, 8> InstsToReanalyze; 4579 4580 // Returns true if a given use occurs in the predicated block. Phi nodes use 4581 // their operands in their corresponding predecessor blocks. 4582 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4583 auto *I = cast<Instruction>(U.getUser()); 4584 BasicBlock *BB = I->getParent(); 4585 if (auto *Phi = dyn_cast<PHINode>(I)) 4586 BB = Phi->getIncomingBlock( 4587 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4588 return BB == PredBB; 4589 }; 4590 4591 // Iteratively sink the scalarized operands of the predicated instruction 4592 // into the block we created for it. 
When an instruction is sunk, its 4593 // operands are then added to the worklist. The algorithm ends once a full pass 4594 // through the worklist fails to sink a single instruction. 4595 bool Changed; 4596 do { 4597 // Add the instructions that need to be reanalyzed to the worklist, and 4598 // reset the changed indicator. 4599 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4600 InstsToReanalyze.clear(); 4601 Changed = false; 4602 4603 while (!Worklist.empty()) { 4604 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4605 4606 // We can't sink an instruction if it is a phi node, is not in the loop, 4607 // or may have side effects. 4608 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4609 I->mayHaveSideEffects()) 4610 continue; 4611 4612 // If the instruction is already in PredBB, check if we can sink its 4613 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4614 // sinking the scalar instruction I, hence it appears in PredBB; but it 4615 // may have failed to sink I's operands (recursively), which we try 4616 // (again) here. 4617 if (I->getParent() == PredBB) { 4618 Worklist.insert(I->op_begin(), I->op_end()); 4619 continue; 4620 } 4621 4622 // It's legal to sink the instruction if all its uses occur in the 4623 // predicated block. Otherwise, there's nothing to do yet, and we may 4624 // need to reanalyze the instruction. 4625 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4626 InstsToReanalyze.push_back(I); 4627 continue; 4628 } 4629 4630 // Move the instruction to the beginning of the predicated block, and add 4631 // its operands to the worklist. 4632 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4633 Worklist.insert(I->op_begin(), I->op_end()); 4634 4635 // The sinking may have enabled other instructions to be sunk, so we will 4636 // need to iterate. 4637 Changed = true; 4638 } 4639 } while (Changed); 4640 } 4641 4642 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4643 for (PHINode *OrigPhi : OrigPHIsToFix) { 4644 VPWidenPHIRecipe *VPPhi = 4645 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4646 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4647 // Make sure the builder has a valid insert point. 4648 Builder.SetInsertPoint(NewPhi); 4649 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4650 VPValue *Inc = VPPhi->getIncomingValue(i); 4651 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4652 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4653 } 4654 } 4655 } 4656 4657 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4658 VPUser &Operands, unsigned UF, 4659 ElementCount VF, bool IsPtrLoopInvariant, 4660 SmallBitVector &IsIndexLoopInvariant, 4661 VPTransformState &State) { 4662 // Construct a vector GEP by widening the operands of the scalar GEP as 4663 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4664 // results in a vector of pointers when at least one operand of the GEP 4665 // is vector-typed. Thus, to keep the representation compact, we only use 4666 // vector-typed operands for loop-varying values. 4667 4668 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4669 // If we are vectorizing, but the GEP has only loop-invariant operands, 4670 // the GEP we build (by only using vector-typed operands for 4671 // loop-varying values) would be a scalar pointer.
Thus, to ensure we 4672 // produce a vector of pointers, we need to either arbitrarily pick an 4673 // operand to broadcast, or broadcast a clone of the original GEP. 4674 // Here, we broadcast a clone of the original. 4675 // 4676 // TODO: If at some point we decide to scalarize instructions having 4677 // loop-invariant operands, this special case will no longer be 4678 // required. We would add the scalarization decision to 4679 // collectLoopScalars() and teach getVectorValue() to broadcast 4680 // the lane-zero scalar value. 4681 auto *Clone = Builder.Insert(GEP->clone()); 4682 for (unsigned Part = 0; Part < UF; ++Part) { 4683 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4684 State.set(VPDef, EntryPart, Part); 4685 addMetadata(EntryPart, GEP); 4686 } 4687 } else { 4688 // If the GEP has at least one loop-varying operand, we are sure to 4689 // produce a vector of pointers. But if we are only unrolling, we want 4690 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4691 // produce with the code below will be scalar (if VF == 1) or vector 4692 // (otherwise). Note that for the unroll-only case, we still maintain 4693 // values in the vector mapping with initVector, as we do for other 4694 // instructions. 4695 for (unsigned Part = 0; Part < UF; ++Part) { 4696 // The pointer operand of the new GEP. If it's loop-invariant, we 4697 // won't broadcast it. 4698 auto *Ptr = IsPtrLoopInvariant 4699 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4700 : State.get(Operands.getOperand(0), Part); 4701 4702 // Collect all the indices for the new GEP. If any index is 4703 // loop-invariant, we won't broadcast it. 4704 SmallVector<Value *, 4> Indices; 4705 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4706 VPValue *Operand = Operands.getOperand(I); 4707 if (IsIndexLoopInvariant[I - 1]) 4708 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4709 else 4710 Indices.push_back(State.get(Operand, Part)); 4711 } 4712 4713 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4714 // but it should be a vector, otherwise. 4715 auto *NewGEP = 4716 GEP->isInBounds() 4717 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4718 Indices) 4719 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4720 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4721 "NewGEP is not a pointer vector"); 4722 State.set(VPDef, NewGEP, Part); 4723 addMetadata(NewGEP, GEP); 4724 } 4725 } 4726 } 4727 4728 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4729 RecurrenceDescriptor *RdxDesc, 4730 VPWidenPHIRecipe *PhiR, 4731 VPTransformState &State) { 4732 PHINode *P = cast<PHINode>(PN); 4733 if (EnableVPlanNativePath) { 4734 // Currently we enter here in the VPlan-native path for non-induction 4735 // PHIs where all control flow is uniform. We simply widen these PHIs. 4736 // Create a vector phi with no operands - the vector phi operands will be 4737 // set at the end of vector code generation. 4738 Type *VecTy = (State.VF.isScalar()) 4739 ? PN->getType() 4740 : VectorType::get(PN->getType(), State.VF); 4741 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4742 State.set(PhiR, VecPhi, 0); 4743 OrigPHIsToFix.push_back(P); 4744 4745 return; 4746 } 4747 4748 assert(PN->getParent() == OrigLoop->getHeader() && 4749 "Non-header phis should have been handled elsewhere"); 4750 4751 VPValue *StartVPV = PhiR->getStartValue(); 4752 Value *StartV = StartVPV ? 
StartVPV->getLiveInIRValue() : nullptr; 4753 // In order to support recurrences we need to be able to vectorize Phi nodes. 4754 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4755 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4756 // this value when we vectorize all of the instructions that use the PHI. 4757 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { 4758 Value *Iden = nullptr; 4759 bool ScalarPHI = 4760 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4761 Type *VecTy = 4762 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); 4763 4764 if (RdxDesc) { 4765 assert(Legal->isReductionVariable(P) && StartV && 4766 "RdxDesc should only be set for reduction variables; in that case " 4767 "a StartV is also required"); 4768 RecurKind RK = RdxDesc->getRecurrenceKind(); 4769 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 4770 // MinMax reductions have the start value as their identity. 4771 if (ScalarPHI) { 4772 Iden = StartV; 4773 } else { 4774 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4775 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4776 StartV = Iden = 4777 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); 4778 } 4779 } else { 4780 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4781 RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags()); 4782 Iden = IdenC; 4783 4784 if (!ScalarPHI) { 4785 Iden = ConstantVector::getSplat(State.VF, IdenC); 4786 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4787 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4788 Constant *Zero = Builder.getInt32(0); 4789 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4790 } 4791 } 4792 } 4793 4794 bool IsOrdered = State.VF.isVector() && 4795 Cost->isInLoopReduction(cast<PHINode>(PN)) && 4796 useOrderedReductions(*RdxDesc); 4797 4798 for (unsigned Part = 0; Part < State.UF; ++Part) { 4799 // This is phase one of vectorizing PHIs. 4800 if (Part > 0 && IsOrdered) 4801 return; 4802 Value *EntryPart = PHINode::Create( 4803 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4804 State.set(PhiR, EntryPart, Part); 4805 if (StartV) { 4806 // Make sure to add the reduction start value only to the 4807 // first unroll part. 4808 Value *StartVal = (Part == 0) ? StartV : Iden; 4809 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4810 } 4811 } 4812 return; 4813 } 4814 4815 assert(!Legal->isReductionVariable(P) && 4816 "reductions should be handled above"); 4817 4818 setDebugLocFromInst(Builder, P); 4819 4820 // This PHINode must be an induction variable. 4821 // Make sure that we know about it. 4822 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4823 4824 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4825 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4826 4827 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4828 // which can be found from the original scalar operations. 4829 switch (II.getKind()) { 4830 case InductionDescriptor::IK_NoInduction: 4831 llvm_unreachable("Unknown induction"); 4832 case InductionDescriptor::IK_IntInduction: 4833 case InductionDescriptor::IK_FpInduction: 4834 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4835 case InductionDescriptor::IK_PtrInduction: { 4836 // Handle the pointer induction variable case.
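// Editorial note on the code that follows: two strategies are implemented
// below. If the pointer IV is scalar after vectorization, a scalar GEP is
// emitted per unrolled part (and per lane, unless the IV is uniform) via
// emitTransformedIndex. Otherwise a pointer phi is created and advanced by
// VF * UF * Step per vector iteration, with per-part vector GEPs computing
// the individual lane addresses.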
4837 assert(P->getType()->isPointerTy() && "Unexpected type."); 4838 4839 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4840 // This is the normalized GEP that starts counting at zero. 4841 Value *PtrInd = 4842 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4843 // Determine the number of scalars we need to generate for each unroll 4844 // iteration. If the instruction is uniform, we only need to generate the 4845 // first lane. Otherwise, we generate all VF values. 4846 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4847 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4848 4849 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4850 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4851 if (NeedsVectorIndex) { 4852 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4853 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4854 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4855 } 4856 4857 for (unsigned Part = 0; Part < UF; ++Part) { 4858 Value *PartStart = createStepForVF( 4859 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4860 4861 if (NeedsVectorIndex) { 4862 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4863 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4864 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4865 Value *SclrGep = 4866 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4867 SclrGep->setName("next.gep"); 4868 State.set(PhiR, SclrGep, Part); 4869 // We've cached the whole vector, which means we can support the 4870 // extraction of any lane. 4871 continue; 4872 } 4873 4874 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4875 Value *Idx = Builder.CreateAdd( 4876 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4877 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4878 Value *SclrGep = 4879 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4880 SclrGep->setName("next.gep"); 4881 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4882 } 4883 } 4884 return; 4885 } 4886 assert(isa<SCEVConstant>(II.getStep()) && 4887 "Induction step not a SCEV constant!"); 4888 Type *PhiType = II.getStep()->getType(); 4889 4890 // Build a pointer phi 4891 Value *ScalarStartValue = II.getStartValue(); 4892 Type *ScStValueType = ScalarStartValue->getType(); 4893 PHINode *NewPointerPhi = 4894 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4895 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4896 4897 // A pointer induction, performed by using a gep 4898 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4899 Instruction *InductionLoc = LoopLatch->getTerminator(); 4900 const SCEV *ScalarStep = II.getStep(); 4901 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4902 Value *ScalarStepValue = 4903 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4904 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4905 Value *NumUnrolledElems = 4906 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4907 Value *InductionGEP = GetElementPtrInst::Create( 4908 ScStValueType->getPointerElementType(), NewPointerPhi, 4909 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4910 InductionLoc); 4911 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4912 4913 // Create UF many actual address geps that use the pointer 4914 // phi as base and a vectorized version of the step value 4915 // (<step*0, ..., step*N>) as offset. 
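// Editorial sketch only (element-sized offsets, shorthand): with VF = 4,
// UF = 2 and step S, the two parts address
//
//   part 0: pointer.phi + <0*S, 1*S, 2*S, 3*S>
//   part 1: pointer.phi + <4*S, 5*S, 6*S, 7*S>
//
// where the lane indices are a step vector plus a splat of RuntimeVF * Part,
// all scaled by S, matching the loop below.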
4916 for (unsigned Part = 0; Part < State.UF; ++Part) { 4917 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4918 Value *StartOffsetScalar = 4919 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4920 Value *StartOffset = 4921 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4922 // Create a vector of consecutive numbers from zero to VF. 4923 StartOffset = 4924 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4925 4926 Value *GEP = Builder.CreateGEP( 4927 ScStValueType->getPointerElementType(), NewPointerPhi, 4928 Builder.CreateMul( 4929 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4930 "vector.gep")); 4931 State.set(PhiR, GEP, Part); 4932 } 4933 } 4934 } 4935 } 4936 4937 /// A helper function for checking whether an integer division-related 4938 /// instruction may divide by zero (in which case it must be predicated if 4939 /// executed conditionally in the scalar code). 4940 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4941 /// Non-zero divisors that are non compile-time constants will not be 4942 /// converted into multiplication, so we will still end up scalarizing 4943 /// the division, but can do so w/o predication. 4944 static bool mayDivideByZero(Instruction &I) { 4945 assert((I.getOpcode() == Instruction::UDiv || 4946 I.getOpcode() == Instruction::SDiv || 4947 I.getOpcode() == Instruction::URem || 4948 I.getOpcode() == Instruction::SRem) && 4949 "Unexpected instruction"); 4950 Value *Divisor = I.getOperand(1); 4951 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4952 return !CInt || CInt->isZero(); 4953 } 4954 4955 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4956 VPUser &User, 4957 VPTransformState &State) { 4958 switch (I.getOpcode()) { 4959 case Instruction::Call: 4960 case Instruction::Br: 4961 case Instruction::PHI: 4962 case Instruction::GetElementPtr: 4963 case Instruction::Select: 4964 llvm_unreachable("This instruction is handled by a different recipe."); 4965 case Instruction::UDiv: 4966 case Instruction::SDiv: 4967 case Instruction::SRem: 4968 case Instruction::URem: 4969 case Instruction::Add: 4970 case Instruction::FAdd: 4971 case Instruction::Sub: 4972 case Instruction::FSub: 4973 case Instruction::FNeg: 4974 case Instruction::Mul: 4975 case Instruction::FMul: 4976 case Instruction::FDiv: 4977 case Instruction::FRem: 4978 case Instruction::Shl: 4979 case Instruction::LShr: 4980 case Instruction::AShr: 4981 case Instruction::And: 4982 case Instruction::Or: 4983 case Instruction::Xor: { 4984 // Just widen unops and binops. 4985 setDebugLocFromInst(Builder, &I); 4986 4987 for (unsigned Part = 0; Part < UF; ++Part) { 4988 SmallVector<Value *, 2> Ops; 4989 for (VPValue *VPOp : User.operands()) 4990 Ops.push_back(State.get(VPOp, Part)); 4991 4992 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4993 4994 if (auto *VecOp = dyn_cast<Instruction>(V)) 4995 VecOp->copyIRFlags(&I); 4996 4997 // Use this vector value for all users of the original instruction. 4998 State.set(Def, V, Part); 4999 addMetadata(V, &I); 5000 } 5001 5002 break; 5003 } 5004 case Instruction::ICmp: 5005 case Instruction::FCmp: { 5006 // Widen compares. Generate vector compares. 
5007 bool FCmp = (I.getOpcode() == Instruction::FCmp); 5008 auto *Cmp = cast<CmpInst>(&I); 5009 setDebugLocFromInst(Builder, Cmp); 5010 for (unsigned Part = 0; Part < UF; ++Part) { 5011 Value *A = State.get(User.getOperand(0), Part); 5012 Value *B = State.get(User.getOperand(1), Part); 5013 Value *C = nullptr; 5014 if (FCmp) { 5015 // Propagate fast math flags. 5016 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 5017 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 5018 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 5019 } else { 5020 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 5021 } 5022 State.set(Def, C, Part); 5023 addMetadata(C, &I); 5024 } 5025 5026 break; 5027 } 5028 5029 case Instruction::ZExt: 5030 case Instruction::SExt: 5031 case Instruction::FPToUI: 5032 case Instruction::FPToSI: 5033 case Instruction::FPExt: 5034 case Instruction::PtrToInt: 5035 case Instruction::IntToPtr: 5036 case Instruction::SIToFP: 5037 case Instruction::UIToFP: 5038 case Instruction::Trunc: 5039 case Instruction::FPTrunc: 5040 case Instruction::BitCast: { 5041 auto *CI = cast<CastInst>(&I); 5042 setDebugLocFromInst(Builder, CI); 5043 5044 /// Vectorize casts. 5045 Type *DestTy = 5046 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 5047 5048 for (unsigned Part = 0; Part < UF; ++Part) { 5049 Value *A = State.get(User.getOperand(0), Part); 5050 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 5051 State.set(Def, Cast, Part); 5052 addMetadata(Cast, &I); 5053 } 5054 break; 5055 } 5056 default: 5057 // This instruction is not vectorized by simple widening. 5058 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 5059 llvm_unreachable("Unhandled instruction!"); 5060 } // end of switch. 5061 } 5062 5063 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 5064 VPUser &ArgOperands, 5065 VPTransformState &State) { 5066 assert(!isa<DbgInfoIntrinsic>(I) && 5067 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 5068 setDebugLocFromInst(Builder, &I); 5069 5070 Module *M = I.getParent()->getParent()->getParent(); 5071 auto *CI = cast<CallInst>(&I); 5072 5073 SmallVector<Type *, 4> Tys; 5074 for (Value *ArgOperand : CI->arg_operands()) 5075 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 5076 5077 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 5078 5079 // The flag shows whether we use Intrinsic or a usual Call for vectorized 5080 // version of the instruction. 5081 // Is it beneficial to perform intrinsic call compared to lib call? 5082 bool NeedToScalarize = false; 5083 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 5084 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 5085 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 5086 assert((UseVectorIntrinsic || !NeedToScalarize) && 5087 "Instruction should be scalarized elsewhere."); 5088 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 5089 "Either the intrinsic cost or vector call cost must be valid"); 5090 5091 for (unsigned Part = 0; Part < UF; ++Part) { 5092 SmallVector<Value *, 4> Args; 5093 for (auto &I : enumerate(ArgOperands.operands())) { 5094 // Some intrinsics have a scalar argument - don't replace it with a 5095 // vector. 
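// (Editorial example: the integer exponent operand of llvm.powi is such a
// scalar operand; below, these operands are taken from lane 0 rather than
// widened.)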
5096 Value *Arg; 5097 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5098 Arg = State.get(I.value(), Part); 5099 else 5100 Arg = State.get(I.value(), VPIteration(0, 0)); 5101 Args.push_back(Arg); 5102 } 5103 5104 Function *VectorF; 5105 if (UseVectorIntrinsic) { 5106 // Use vector version of the intrinsic. 5107 Type *TysForDecl[] = {CI->getType()}; 5108 if (VF.isVector()) 5109 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5110 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5111 assert(VectorF && "Can't retrieve vector intrinsic."); 5112 } else { 5113 // Use vector version of the function call. 5114 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5115 #ifndef NDEBUG 5116 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5117 "Can't create vector function."); 5118 #endif 5119 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5120 } 5121 SmallVector<OperandBundleDef, 1> OpBundles; 5122 CI->getOperandBundlesAsDefs(OpBundles); 5123 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5124 5125 if (isa<FPMathOperator>(V)) 5126 V->copyFastMathFlags(CI); 5127 5128 State.set(Def, V, Part); 5129 addMetadata(V, &I); 5130 } 5131 } 5132 5133 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5134 VPUser &Operands, 5135 bool InvariantCond, 5136 VPTransformState &State) { 5137 setDebugLocFromInst(Builder, &I); 5138 5139 // The condition can be loop invariant but still defined inside the 5140 // loop. This means that we can't just use the original 'cond' value. 5141 // We have to take the 'vectorized' value and pick the first lane. 5142 // Instcombine will make this a no-op. 5143 auto *InvarCond = InvariantCond 5144 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5145 : nullptr; 5146 5147 for (unsigned Part = 0; Part < UF; ++Part) { 5148 Value *Cond = 5149 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5150 Value *Op0 = State.get(Operands.getOperand(1), Part); 5151 Value *Op1 = State.get(Operands.getOperand(2), Part); 5152 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5153 State.set(VPDef, Sel, Part); 5154 addMetadata(Sel, &I); 5155 } 5156 } 5157 5158 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5159 // We should not collect Scalars more than once per VF. Right now, this 5160 // function is called from collectUniformsAndScalars(), which already does 5161 // this check. Collecting Scalars for VF=1 does not make any sense. 5162 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5163 "This function should not be visited twice for the same VF"); 5164 5165 SmallSetVector<Instruction *, 8> Worklist; 5166 5167 // These sets are used to seed the analysis with pointers used by memory 5168 // accesses that will remain scalar. 5169 SmallSetVector<Instruction *, 8> ScalarPtrs; 5170 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5171 auto *Latch = TheLoop->getLoopLatch(); 5172 5173 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5174 // The pointer operands of loads and stores will be scalar as long as the 5175 // memory access is not a gather or scatter operation. The value operand of a 5176 // store will remain scalar if the store is scalarized. 
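// (Editorial example: in a loop like  for (i) A[i] = B[i];  the GEPs feeding
// the consecutive load and store keep scalar pointer operands, whereas a
// gathered access such as A[C[i]] needs a vector of pointers, so its address
// computation is not a scalar use.)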
5177 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5178 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5179 assert(WideningDecision != CM_Unknown && 5180 "Widening decision should be ready at this moment"); 5181 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5182 if (Ptr == Store->getValueOperand()) 5183 return WideningDecision == CM_Scalarize; 5184 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5185 "Ptr is neither a value nor a pointer operand"); 5186 return WideningDecision != CM_GatherScatter; 5187 }; 5188 5189 // A helper that returns true if the given value is a bitcast or 5190 // getelementptr instruction contained in the loop. 5191 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5192 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5193 isa<GetElementPtrInst>(V)) && 5194 !TheLoop->isLoopInvariant(V); 5195 }; 5196 5197 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5198 if (!isa<PHINode>(Ptr) || 5199 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5200 return false; 5201 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5202 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5203 return false; 5204 return isScalarUse(MemAccess, Ptr); 5205 }; 5206 5207 // A helper that evaluates a memory access's use of a pointer. If the 5208 // pointer is actually the pointer induction of the loop, it is 5209 // inserted into the Worklist. If the use will be a scalar use, and the 5210 // pointer is only used by memory accesses, we place the pointer in 5211 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5212 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5213 if (isScalarPtrInduction(MemAccess, Ptr)) { 5214 Worklist.insert(cast<Instruction>(Ptr)); 5215 Instruction *Update = cast<Instruction>( 5216 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5217 Worklist.insert(Update); 5218 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5219 << "\n"); 5220 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5221 << "\n"); 5222 return; 5223 } 5224 // We only care about bitcast and getelementptr instructions contained in 5225 // the loop. 5226 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5227 return; 5228 5229 // If the pointer has already been identified as scalar (e.g., if it was 5230 // also identified as uniform), there's nothing to do. 5231 auto *I = cast<Instruction>(Ptr); 5232 if (Worklist.count(I)) 5233 return; 5234 5235 // If the use of the pointer will be a scalar use, and all users of the 5236 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5237 // place the pointer in PossibleNonScalarPtrs. 5238 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5239 return isa<LoadInst>(U) || isa<StoreInst>(U); 5240 })) 5241 ScalarPtrs.insert(I); 5242 else 5243 PossibleNonScalarPtrs.insert(I); 5244 }; 5245 5246 // We seed the scalars analysis with two classes of instructions: (1) 5247 // instructions marked uniform-after-vectorization and (2) bitcast, 5248 // getelementptr and (pointer) phi instructions used by memory accesses 5249 // requiring a scalar use. 5250 // 5251 // (1) Add to the worklist all instructions that have been identified as 5252 // uniform-after-vectorization.
5253 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5254 5255 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5256 // memory accesses requiring a scalar use. The pointer operands of loads and 5257 // stores will be scalar as long as the memory accesses is not a gather or 5258 // scatter operation. The value operand of a store will remain scalar if the 5259 // store is scalarized. 5260 for (auto *BB : TheLoop->blocks()) 5261 for (auto &I : *BB) { 5262 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5263 evaluatePtrUse(Load, Load->getPointerOperand()); 5264 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5265 evaluatePtrUse(Store, Store->getPointerOperand()); 5266 evaluatePtrUse(Store, Store->getValueOperand()); 5267 } 5268 } 5269 for (auto *I : ScalarPtrs) 5270 if (!PossibleNonScalarPtrs.count(I)) { 5271 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5272 Worklist.insert(I); 5273 } 5274 5275 // Insert the forced scalars. 5276 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5277 // induction variable when the PHI user is scalarized. 5278 auto ForcedScalar = ForcedScalars.find(VF); 5279 if (ForcedScalar != ForcedScalars.end()) 5280 for (auto *I : ForcedScalar->second) 5281 Worklist.insert(I); 5282 5283 // Expand the worklist by looking through any bitcasts and getelementptr 5284 // instructions we've already identified as scalar. This is similar to the 5285 // expansion step in collectLoopUniforms(); however, here we're only 5286 // expanding to include additional bitcasts and getelementptr instructions. 5287 unsigned Idx = 0; 5288 while (Idx != Worklist.size()) { 5289 Instruction *Dst = Worklist[Idx++]; 5290 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5291 continue; 5292 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5293 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5294 auto *J = cast<Instruction>(U); 5295 return !TheLoop->contains(J) || Worklist.count(J) || 5296 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5297 isScalarUse(J, Src)); 5298 })) { 5299 Worklist.insert(Src); 5300 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5301 } 5302 } 5303 5304 // An induction variable will remain scalar if all users of the induction 5305 // variable and induction variable update remain scalar. 5306 for (auto &Induction : Legal->getInductionVars()) { 5307 auto *Ind = Induction.first; 5308 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5309 5310 // If tail-folding is applied, the primary induction variable will be used 5311 // to feed a vector compare. 5312 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5313 continue; 5314 5315 // Determine if all users of the induction variable are scalar after 5316 // vectorization. 5317 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5318 auto *I = cast<Instruction>(U); 5319 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5320 }); 5321 if (!ScalarInd) 5322 continue; 5323 5324 // Determine if all users of the induction variable update instruction are 5325 // scalar after vectorization. 5326 auto ScalarIndUpdate = 5327 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5328 auto *I = cast<Instruction>(U); 5329 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5330 }); 5331 if (!ScalarIndUpdate) 5332 continue; 5333 5334 // The induction variable and its update instruction will remain scalar. 
5335 Worklist.insert(Ind); 5336 Worklist.insert(IndUpdate); 5337 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5338 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5339 << "\n"); 5340 } 5341 5342 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5343 } 5344 5345 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5346 if (!blockNeedsPredication(I->getParent())) 5347 return false; 5348 switch(I->getOpcode()) { 5349 default: 5350 break; 5351 case Instruction::Load: 5352 case Instruction::Store: { 5353 if (!Legal->isMaskRequired(I)) 5354 return false; 5355 auto *Ptr = getLoadStorePointerOperand(I); 5356 auto *Ty = getMemInstValueType(I); 5357 const Align Alignment = getLoadStoreAlignment(I); 5358 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5359 isLegalMaskedGather(Ty, Alignment)) 5360 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5361 isLegalMaskedScatter(Ty, Alignment)); 5362 } 5363 case Instruction::UDiv: 5364 case Instruction::SDiv: 5365 case Instruction::SRem: 5366 case Instruction::URem: 5367 return mayDivideByZero(*I); 5368 } 5369 return false; 5370 } 5371 5372 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5373 Instruction *I, ElementCount VF) { 5374 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5375 assert(getWideningDecision(I, VF) == CM_Unknown && 5376 "Decision should not be set yet."); 5377 auto *Group = getInterleavedAccessGroup(I); 5378 assert(Group && "Must have a group."); 5379 5380 // If the instruction's allocated size doesn't equal its type size, it 5381 // requires padding and will be scalarized. 5382 auto &DL = I->getModule()->getDataLayout(); 5383 auto *ScalarTy = getMemInstValueType(I); 5384 if (hasIrregularType(ScalarTy, DL)) 5385 return false; 5386 5387 // Check if masking is required. 5388 // A Group may need masking for one of two reasons: it resides in a block that 5389 // needs predication, or it was decided to use masking to deal with gaps. 5390 bool PredicatedAccessRequiresMasking = 5391 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5392 bool AccessWithGapsRequiresMasking = 5393 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5394 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5395 return true; 5396 5397 // If masked interleaving is required, we expect that the user/target had 5398 // enabled it, because otherwise it either wouldn't have been created or 5399 // it should have been invalidated by the CostModel. 5400 assert(useMaskedInterleavedAccesses(TTI) && 5401 "Masked interleave-groups for predicated accesses are not enabled."); 5402 5403 auto *Ty = getMemInstValueType(I); 5404 const Align Alignment = getLoadStoreAlignment(I); 5405 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5406 : TTI.isLegalMaskedStore(Ty, Alignment); 5407 } 5408 5409 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5410 Instruction *I, ElementCount VF) { 5411 // Get and ensure we have a valid memory instruction. 5412 LoadInst *LI = dyn_cast<LoadInst>(I); 5413 StoreInst *SI = dyn_cast<StoreInst>(I); 5414 assert((LI || SI) && "Invalid memory instruction"); 5415 5416 auto *Ptr = getLoadStorePointerOperand(I); 5417 5418 // First of all, in order to be widened, the pointer should be consecutive.
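// (Editorial example: A[i] with a unit-stride induction is consecutive and
// can be widened into a single vector load/store, while A[2*i] or A[B[i]] is
// not and is handled by gather/scatter, interleaving, or scalarization
// instead.)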
5419 if (!Legal->isConsecutivePtr(Ptr)) 5420 return false; 5421 5422 // If the instruction is a store located in a predicated block, it will be 5423 // scalarized. 5424 if (isScalarWithPredication(I)) 5425 return false; 5426 5427 // If the instruction's allocated size doesn't equal it's type size, it 5428 // requires padding and will be scalarized. 5429 auto &DL = I->getModule()->getDataLayout(); 5430 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5431 if (hasIrregularType(ScalarTy, DL)) 5432 return false; 5433 5434 return true; 5435 } 5436 5437 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5438 // We should not collect Uniforms more than once per VF. Right now, 5439 // this function is called from collectUniformsAndScalars(), which 5440 // already does this check. Collecting Uniforms for VF=1 does not make any 5441 // sense. 5442 5443 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5444 "This function should not be visited twice for the same VF"); 5445 5446 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5447 // not analyze again. Uniforms.count(VF) will return 1. 5448 Uniforms[VF].clear(); 5449 5450 // We now know that the loop is vectorizable! 5451 // Collect instructions inside the loop that will remain uniform after 5452 // vectorization. 5453 5454 // Global values, params and instructions outside of current loop are out of 5455 // scope. 5456 auto isOutOfScope = [&](Value *V) -> bool { 5457 Instruction *I = dyn_cast<Instruction>(V); 5458 return (!I || !TheLoop->contains(I)); 5459 }; 5460 5461 SetVector<Instruction *> Worklist; 5462 BasicBlock *Latch = TheLoop->getLoopLatch(); 5463 5464 // Instructions that are scalar with predication must not be considered 5465 // uniform after vectorization, because that would create an erroneous 5466 // replicating region where only a single instance out of VF should be formed. 5467 // TODO: optimize such seldom cases if found important, see PR40816. 5468 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5469 if (isOutOfScope(I)) { 5470 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5471 << *I << "\n"); 5472 return; 5473 } 5474 if (isScalarWithPredication(I)) { 5475 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5476 << *I << "\n"); 5477 return; 5478 } 5479 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5480 Worklist.insert(I); 5481 }; 5482 5483 // Start with the conditional branch. If the branch condition is an 5484 // instruction contained in the loop that is only used by the branch, it is 5485 // uniform. 5486 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5487 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5488 addToWorklistIfAllowed(Cmp); 5489 5490 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5491 InstWidening WideningDecision = getWideningDecision(I, VF); 5492 assert(WideningDecision != CM_Unknown && 5493 "Widening decision should be ready at this moment"); 5494 5495 // A uniform memory op is itself uniform. We exclude uniform stores 5496 // here as they demand the last lane, not the first one. 
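// (Example for the above: a load from a loop-invariant address needs only the
// value of lane 0 and can be treated as uniform, whereas a store to that same
// address must keep the value of the last unrolled lane, so stores are not
// added here.)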
5497 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5498 assert(WideningDecision == CM_Scalarize); 5499 return true; 5500 } 5501 5502 return (WideningDecision == CM_Widen || 5503 WideningDecision == CM_Widen_Reverse || 5504 WideningDecision == CM_Interleave); 5505 }; 5506 5507 5508 // Returns true if Ptr is the pointer operand of a memory access instruction 5509 // I, and I is known to not require scalarization. 5510 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5511 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5512 }; 5513 5514 // Holds a list of values which are known to have at least one uniform use. 5515 // Note that there may be other uses which aren't uniform. A "uniform use" 5516 // here is something which only demands lane 0 of the unrolled iterations; 5517 // it does not imply that all lanes produce the same value (e.g. this is not 5518 // the usual meaning of uniform) 5519 SetVector<Value *> HasUniformUse; 5520 5521 // Scan the loop for instructions which are either a) known to have only 5522 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5523 for (auto *BB : TheLoop->blocks()) 5524 for (auto &I : *BB) { 5525 // If there's no pointer operand, there's nothing to do. 5526 auto *Ptr = getLoadStorePointerOperand(&I); 5527 if (!Ptr) 5528 continue; 5529 5530 // A uniform memory op is itself uniform. We exclude uniform stores 5531 // here as they demand the last lane, not the first one. 5532 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5533 addToWorklistIfAllowed(&I); 5534 5535 if (isUniformDecision(&I, VF)) { 5536 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5537 HasUniformUse.insert(Ptr); 5538 } 5539 } 5540 5541 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5542 // demanding) users. Since loops are assumed to be in LCSSA form, this 5543 // disallows uses outside the loop as well. 5544 for (auto *V : HasUniformUse) { 5545 if (isOutOfScope(V)) 5546 continue; 5547 auto *I = cast<Instruction>(V); 5548 auto UsersAreMemAccesses = 5549 llvm::all_of(I->users(), [&](User *U) -> bool { 5550 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5551 }); 5552 if (UsersAreMemAccesses) 5553 addToWorklistIfAllowed(I); 5554 } 5555 5556 // Expand Worklist in topological order: whenever a new instruction 5557 // is added , its users should be already inside Worklist. It ensures 5558 // a uniform instruction will only be used by uniform instructions. 5559 unsigned idx = 0; 5560 while (idx != Worklist.size()) { 5561 Instruction *I = Worklist[idx++]; 5562 5563 for (auto OV : I->operand_values()) { 5564 // isOutOfScope operands cannot be uniform instructions. 5565 if (isOutOfScope(OV)) 5566 continue; 5567 // First order recurrence Phi's should typically be considered 5568 // non-uniform. 5569 auto *OP = dyn_cast<PHINode>(OV); 5570 if (OP && Legal->isFirstOrderRecurrence(OP)) 5571 continue; 5572 // If all the users of the operand are uniform, then add the 5573 // operand into the uniform worklist. 5574 auto *OI = cast<Instruction>(OV); 5575 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5576 auto *J = cast<Instruction>(U); 5577 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5578 })) 5579 addToWorklistIfAllowed(OI); 5580 } 5581 } 5582 5583 // For an instruction to be added into Worklist above, all its users inside 5584 // the loop should also be in Worklist. 
However, this condition cannot be 5585 // true for phi nodes that form a cyclic dependence. We must process phi 5586 // nodes separately. An induction variable will remain uniform if all users 5587 // of the induction variable and induction variable update remain uniform. 5588 // The code below handles both pointer and non-pointer induction variables. 5589 for (auto &Induction : Legal->getInductionVars()) { 5590 auto *Ind = Induction.first; 5591 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5592 5593 // Determine if all users of the induction variable are uniform after 5594 // vectorization. 5595 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5596 auto *I = cast<Instruction>(U); 5597 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5598 isVectorizedMemAccessUse(I, Ind); 5599 }); 5600 if (!UniformInd) 5601 continue; 5602 5603 // Determine if all users of the induction variable update instruction are 5604 // uniform after vectorization. 5605 auto UniformIndUpdate = 5606 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5607 auto *I = cast<Instruction>(U); 5608 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5609 isVectorizedMemAccessUse(I, IndUpdate); 5610 }); 5611 if (!UniformIndUpdate) 5612 continue; 5613 5614 // The induction variable and its update instruction will remain uniform. 5615 addToWorklistIfAllowed(Ind); 5616 addToWorklistIfAllowed(IndUpdate); 5617 } 5618 5619 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5620 } 5621 5622 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5623 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5624 5625 if (Legal->getRuntimePointerChecking()->Need) { 5626 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5627 "runtime pointer checks needed. Enable vectorization of this " 5628 "loop with '#pragma clang loop vectorize(enable)' when " 5629 "compiling with -Os/-Oz", 5630 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5631 return true; 5632 } 5633 5634 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5635 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5636 "runtime SCEV checks needed. Enable vectorization of this " 5637 "loop with '#pragma clang loop vectorize(enable)' when " 5638 "compiling with -Os/-Oz", 5639 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5640 return true; 5641 } 5642 5643 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5644 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5645 reportVectorizationFailure("Runtime stride check for small trip count", 5646 "runtime stride == 1 checks needed. 
Enable vectorization of " 5647 "this loop without such check by compiling with -Os/-Oz", 5648 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5649 return true; 5650 } 5651 5652 return false; 5653 } 5654 5655 ElementCount 5656 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5657 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5658 reportVectorizationInfo( 5659 "Disabling scalable vectorization, because target does not " 5660 "support scalable vectors.", 5661 "ScalableVectorsUnsupported", ORE, TheLoop); 5662 return ElementCount::getScalable(0); 5663 } 5664 5665 if (Hints->isScalableVectorizationDisabled()) { 5666 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5667 "ScalableVectorizationDisabled", ORE, TheLoop); 5668 return ElementCount::getScalable(0); 5669 } 5670 5671 auto MaxScalableVF = ElementCount::getScalable( 5672 std::numeric_limits<ElementCount::ScalarTy>::max()); 5673 5674 // Disable scalable vectorization if the loop contains unsupported reductions. 5675 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5676 // FIXME: While for scalable vectors this is currently sufficient, this should 5677 // be replaced by a more detailed mechanism that filters out specific VFs, 5678 // instead of invalidating vectorization for a whole set of VFs based on the 5679 // MaxVF. 5680 if (!canVectorizeReductions(MaxScalableVF)) { 5681 reportVectorizationInfo( 5682 "Scalable vectorization not supported for the reduction " 5683 "operations found in this loop.", 5684 "ScalableVFUnfeasible", ORE, TheLoop); 5685 return ElementCount::getScalable(0); 5686 } 5687 5688 if (Legal->isSafeForAnyVectorWidth()) 5689 return MaxScalableVF; 5690 5691 // Limit MaxScalableVF by the maximum safe dependence distance. 5692 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5693 MaxScalableVF = ElementCount::getScalable( 5694 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5695 if (!MaxScalableVF) 5696 reportVectorizationInfo( 5697 "Max legal vector width too small, scalable vectorization " 5698 "unfeasible.", 5699 "ScalableVFUnfeasible", ORE, TheLoop); 5700 5701 return MaxScalableVF; 5702 } 5703 5704 FixedScalableVFPair 5705 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5706 ElementCount UserVF) { 5707 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5708 unsigned SmallestType, WidestType; 5709 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5710 5711 // Get the maximum safe dependence distance in bits computed by LAA. 5712 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5713 // the memory accesses that is most restrictive (involved in the smallest 5714 // dependence distance). 5715 unsigned MaxSafeElements = 5716 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5717 5718 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5719 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5720 5721 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5722 << ".\n"); 5723 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5724 << ".\n"); 5725 5726 // First analyze the UserVF, fall back if the UserVF should be ignored. 5727 if (UserVF) { 5728 auto MaxSafeUserVF = 5729 UserVF.isScalable() ? 
MaxSafeScalableVF : MaxSafeFixedVF; 5730 5731 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) 5732 return UserVF; 5733 5734 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5735 5736 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5737 // is better to ignore the hint and let the compiler choose a suitable VF. 5738 if (!UserVF.isScalable()) { 5739 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5740 << " is unsafe, clamping to max safe VF=" 5741 << MaxSafeFixedVF << ".\n"); 5742 ORE->emit([&]() { 5743 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5744 TheLoop->getStartLoc(), 5745 TheLoop->getHeader()) 5746 << "User-specified vectorization factor " 5747 << ore::NV("UserVectorizationFactor", UserVF) 5748 << " is unsafe, clamping to maximum safe vectorization factor " 5749 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5750 }); 5751 return MaxSafeFixedVF; 5752 } 5753 5754 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5755 << " is unsafe. Ignoring scalable UserVF.\n"); 5756 ORE->emit([&]() { 5757 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5758 TheLoop->getStartLoc(), 5759 TheLoop->getHeader()) 5760 << "User-specified vectorization factor " 5761 << ore::NV("UserVectorizationFactor", UserVF) 5762 << " is unsafe. Ignoring the hint to let the compiler pick a " 5763 "suitable VF."; 5764 }); 5765 } 5766 5767 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5768 << " / " << WidestType << " bits.\n"); 5769 5770 FixedScalableVFPair Result(ElementCount::getFixed(1), 5771 ElementCount::getScalable(0)); 5772 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5773 WidestType, MaxSafeFixedVF)) 5774 Result.FixedVF = MaxVF; 5775 5776 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5777 WidestType, MaxSafeScalableVF)) 5778 if (MaxVF.isScalable()) { 5779 Result.ScalableVF = MaxVF; 5780 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5781 << "\n"); 5782 } 5783 5784 return Result; 5785 } 5786 5787 FixedScalableVFPair 5788 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5789 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5790 // TODO: It may by useful to do since it's still likely to be dynamically 5791 // uniform if the target can skip. 5792 reportVectorizationFailure( 5793 "Not inserting runtime ptr check for divergent target", 5794 "runtime pointer checks needed. 
Not enabled for divergent target", 5795 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5796 return FixedScalableVFPair::getNone(); 5797 } 5798 5799 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5800 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5801 if (TC == 1) { 5802 reportVectorizationFailure("Single iteration (non) loop", 5803 "loop trip count is one, irrelevant for vectorization", 5804 "SingleIterationLoop", ORE, TheLoop); 5805 return FixedScalableVFPair::getNone(); 5806 } 5807 5808 switch (ScalarEpilogueStatus) { 5809 case CM_ScalarEpilogueAllowed: 5810 return computeFeasibleMaxVF(TC, UserVF); 5811 case CM_ScalarEpilogueNotAllowedUsePredicate: 5812 LLVM_FALLTHROUGH; 5813 case CM_ScalarEpilogueNotNeededUsePredicate: 5814 LLVM_DEBUG( 5815 dbgs() << "LV: vector predicate hint/switch found.\n" 5816 << "LV: Not allowing scalar epilogue, creating predicated " 5817 << "vector loop.\n"); 5818 break; 5819 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5820 // fallthrough as a special case of OptForSize 5821 case CM_ScalarEpilogueNotAllowedOptSize: 5822 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5823 LLVM_DEBUG( 5824 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5825 else 5826 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5827 << "count.\n"); 5828 5829 // Bail if runtime checks are required, which are not good when optimising 5830 // for size. 5831 if (runtimeChecksRequired()) 5832 return FixedScalableVFPair::getNone(); 5833 5834 break; 5835 } 5836 5837 // The only loops we can vectorize without a scalar epilogue, are loops with 5838 // a bottom-test and a single exiting block. We'd have to handle the fact 5839 // that not every instruction executes on the last iteration. This will 5840 // require a lane mask which varies through the vector loop body. (TODO) 5841 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5842 // If there was a tail-folding hint/switch, but we can't fold the tail by 5843 // masking, fallback to a vectorization with a scalar epilogue. 5844 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5845 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5846 "scalar epilogue instead.\n"); 5847 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5848 return computeFeasibleMaxVF(TC, UserVF); 5849 } 5850 return FixedScalableVFPair::getNone(); 5851 } 5852 5853 // Now try the tail folding 5854 5855 // Invalidate interleave groups that require an epilogue if we can't mask 5856 // the interleave-group. 5857 if (!useMaskedInterleavedAccesses(TTI)) { 5858 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5859 "No decisions should have been taken at this point"); 5860 // Note: There is no need to invalidate any cost modeling decisions here, as 5861 // non where taken so far. 5862 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5863 } 5864 5865 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); 5866 // Avoid tail folding if the trip count is known to be a multiple of any VF 5867 // we chose. 5868 // FIXME: The condition below pessimises the case for fixed-width vectors, 5869 // when scalable VFs are also candidates for vectorization. 
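// (Worked example with illustrative numbers: for MaxFixedVF = 8 and
// UserIC = 2, MaxVFtimesIC is 16; if SCEV can prove the trip count is a
// multiple of 16, Rem below is zero and the loop is vectorized without tail
// folding.)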
5870 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5871 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5872 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5873 "MaxFixedVF must be a power of 2"); 5874 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5875 : MaxFixedVF.getFixedValue(); 5876 ScalarEvolution *SE = PSE.getSE(); 5877 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5878 const SCEV *ExitCount = SE->getAddExpr( 5879 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5880 const SCEV *Rem = SE->getURemExpr( 5881 SE->applyLoopGuards(ExitCount, TheLoop), 5882 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5883 if (Rem->isZero()) { 5884 // Accept MaxFixedVF if we do not have a tail. 5885 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5886 return MaxFactors; 5887 } 5888 } 5889 5890 // If we don't know the precise trip count, or if the trip count that we 5891 // found modulo the vectorization factor is not zero, try to fold the tail 5892 // by masking. 5893 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5894 if (Legal->prepareToFoldTailByMasking()) { 5895 FoldTailByMasking = true; 5896 return MaxFactors; 5897 } 5898 5899 // If there was a tail-folding hint/switch, but we can't fold the tail by 5900 // masking, fallback to a vectorization with a scalar epilogue. 5901 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5902 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5903 "scalar epilogue instead.\n"); 5904 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5905 return MaxFactors; 5906 } 5907 5908 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5909 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5910 return FixedScalableVFPair::getNone(); 5911 } 5912 5913 if (TC == 0) { 5914 reportVectorizationFailure( 5915 "Unable to calculate the loop count due to complex control flow", 5916 "unable to calculate the loop count due to complex control flow", 5917 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5918 return FixedScalableVFPair::getNone(); 5919 } 5920 5921 reportVectorizationFailure( 5922 "Cannot optimize for size and vectorize at the same time.", 5923 "cannot optimize for size and vectorize at the same time. " 5924 "Enable vectorization of this loop with '#pragma clang loop " 5925 "vectorize(enable)' when compiling with -Os/-Oz", 5926 "NoTailLoopWithOptForSize", ORE, TheLoop); 5927 return FixedScalableVFPair::getNone(); 5928 } 5929 5930 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5931 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5932 const ElementCount &MaxSafeVF) { 5933 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5934 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5935 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5936 : TargetTransformInfo::RGK_FixedWidthVector); 5937 5938 // Convenience function to return the minimum of two ElementCounts. 5939 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5940 assert((LHS.isScalable() == RHS.isScalable()) && 5941 "Scalable flags must match"); 5942 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5943 }; 5944 5945 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5946 // Note that both WidestRegister and WidestType may not be a powers of 2. 
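// (Worked example with illustrative numbers: a 128-bit widest register and a
// widest element type of 32 bits give PowerOf2Floor(128 / 32) = 4 lanes below;
// an unusual 24-bit widest type would give PowerOf2Floor(128 / 24) =
// PowerOf2Floor(5) = 4.)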
5947 auto MaxVectorElementCount = ElementCount::get( 5948 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5949 ComputeScalableMaxVF); 5950 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5951 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5952 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5953 5954 if (!MaxVectorElementCount) { 5955 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5956 return ElementCount::getFixed(1); 5957 } 5958 5959 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5960 if (ConstTripCount && 5961 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5962 isPowerOf2_32(ConstTripCount)) { 5963 // We need to clamp the VF to be the ConstTripCount. There is no point in 5964 // choosing a higher viable VF as done in the loop below. If 5965 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5966 // the TC is less than or equal to the known number of lanes. 5967 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5968 << ConstTripCount << "\n"); 5969 return TripCountEC; 5970 } 5971 5972 ElementCount MaxVF = MaxVectorElementCount; 5973 if (TTI.shouldMaximizeVectorBandwidth() || 5974 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5975 auto MaxVectorElementCountMaxBW = ElementCount::get( 5976 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5977 ComputeScalableMaxVF); 5978 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5979 5980 // Collect all viable vectorization factors larger than the default MaxVF 5981 // (i.e. MaxVectorElementCount). 5982 SmallVector<ElementCount, 8> VFs; 5983 for (ElementCount VS = MaxVectorElementCount * 2; 5984 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5985 VFs.push_back(VS); 5986 5987 // For each VF calculate its register usage. 5988 auto RUs = calculateRegisterUsage(VFs); 5989 5990 // Select the largest VF which doesn't require more registers than existing 5991 // ones. 5992 for (int i = RUs.size() - 1; i >= 0; --i) { 5993 bool Selected = true; 5994 for (auto &pair : RUs[i].MaxLocalUsers) { 5995 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5996 if (pair.second > TargetNumRegisters) 5997 Selected = false; 5998 } 5999 if (Selected) { 6000 MaxVF = VFs[i]; 6001 break; 6002 } 6003 } 6004 if (ElementCount MinVF = 6005 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 6006 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 6007 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 6008 << ") with target's minimum: " << MinVF << '\n'); 6009 MaxVF = MinVF; 6010 } 6011 } 6012 } 6013 return MaxVF; 6014 } 6015 6016 bool LoopVectorizationCostModel::isMoreProfitable( 6017 const VectorizationFactor &A, const VectorizationFactor &B) const { 6018 InstructionCost::CostType CostA = *A.Cost.getValue(); 6019 InstructionCost::CostType CostB = *B.Cost.getValue(); 6020 6021 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 6022 6023 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 6024 MaxTripCount) { 6025 // If we are folding the tail and the trip count is a known (possibly small) 6026 // constant, the trip count will be rounded up to an integer number of 6027 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6028 // which we compare directly. 
When not folding the tail, the total cost will 6029 // be PerIterationCost*floor(TC/VF) + the scalar remainder cost, and so is 6030 // approximated with the per-lane cost below instead of using the trip count 6031 // as here. 6032 int64_t RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6033 int64_t RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6034 return RTCostA < RTCostB; 6035 } 6036 6037 // To avoid the need for FP division: 6038 // (CostA / A.Width) < (CostB / B.Width) 6039 // <=> (CostA * B.Width) < (CostB * A.Width) 6040 return (CostA * B.Width.getKnownMinValue()) < 6041 (CostB * A.Width.getKnownMinValue()); 6042 } 6043 6044 VectorizationFactor 6045 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 6046 // FIXME: This can be fixed for scalable vectors later, because at this stage 6047 // the LoopVectorizer will only consider vectorizing a loop with scalable 6048 // vectors when the loop has a hint to enable vectorization for a given VF. 6049 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 6050 6051 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6052 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6053 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6054 6055 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6056 VectorizationFactor ChosenFactor = ScalarCost; 6057 6058 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6059 if (ForceVectorization && MaxVF.isVector()) { 6060 // Ignore scalar width, because the user explicitly wants vectorization. 6061 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6062 // evaluation. 6063 ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max(); 6064 } 6065 6066 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 6067 i *= 2) { 6068 // Notice that the vector loop needs to be executed fewer times, so 6069 // we need to divide the cost of the vector loop by the width of 6070 // the vector elements. 6071 VectorizationCostTy C = expectedCost(i); 6072 6073 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 6074 VectorizationFactor Candidate(i, C.first); 6075 LLVM_DEBUG( 6076 dbgs() << "LV: Vector loop of width " << i << " costs: " 6077 << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue()) 6078 << ".\n"); 6079 6080 if (!C.second && !ForceVectorization) { 6081 LLVM_DEBUG( 6082 dbgs() << "LV: Not considering vector loop of width " << i 6083 << " because it will not generate any vector instructions.\n"); 6084 continue; 6085 } 6086 6087 // If profitable, add it to the ProfitableVFs list.
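// (Illustrative numbers for the cross-multiplied comparison in
// isMoreProfitable() above: a candidate of width 4 with cost 10 beats a
// scalar cost of 4, since 10 * 1 < 4 * 4, i.e. 2.5 versus 4 per lane.)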
6088 if (isMoreProfitable(Candidate, ScalarCost)) 6089 ProfitableVFs.push_back(Candidate); 6090 6091 if (isMoreProfitable(Candidate, ChosenFactor)) 6092 ChosenFactor = Candidate; 6093 } 6094 6095 if (!EnableCondStoresVectorization && NumPredStores) { 6096 reportVectorizationFailure("There are conditional stores.", 6097 "store that is conditionally executed prevents vectorization", 6098 "ConditionalStore", ORE, TheLoop); 6099 ChosenFactor = ScalarCost; 6100 } 6101 6102 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6103 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue()) 6104 dbgs() 6105 << "LV: Vectorization seems to be not beneficial, " 6106 << "but was forced by a user.\n"); 6107 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6108 return ChosenFactor; 6109 } 6110 6111 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6112 const Loop &L, ElementCount VF) const { 6113 // Cross iteration phis such as reductions need special handling and are 6114 // currently unsupported. 6115 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6116 return Legal->isFirstOrderRecurrence(&Phi) || 6117 Legal->isReductionVariable(&Phi); 6118 })) 6119 return false; 6120 6121 // Phis with uses outside of the loop require special handling and are 6122 // currently unsupported. 6123 for (auto &Entry : Legal->getInductionVars()) { 6124 // Look for uses of the value of the induction at the last iteration. 6125 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6126 for (User *U : PostInc->users()) 6127 if (!L.contains(cast<Instruction>(U))) 6128 return false; 6129 // Look for uses of penultimate value of the induction. 6130 for (User *U : Entry.first->users()) 6131 if (!L.contains(cast<Instruction>(U))) 6132 return false; 6133 } 6134 6135 // Induction variables that are widened require special handling that is 6136 // currently not supported. 6137 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6138 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6139 this->isProfitableToScalarize(Entry.first, VF)); 6140 })) 6141 return false; 6142 6143 return true; 6144 } 6145 6146 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6147 const ElementCount VF) const { 6148 // FIXME: We need a much better cost-model to take different parameters such 6149 // as register pressure, code size increase and cost of extra branches into 6150 // account. For now we apply a very crude heuristic and only consider loops 6151 // with vectorization factors larger than a certain value. 6152 // We also consider epilogue vectorization unprofitable for targets that don't 6153 // consider interleaving beneficial (eg. MVE). 
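// (For illustration, assuming a threshold of 16: a main-loop VF of 8 would be
// rejected below and a VF of 16 accepted, provided the target reports a max
// interleave factor greater than 1.)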
6154 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6155 return false; 6156 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6157 return true; 6158 return false; 6159 } 6160 6161 VectorizationFactor 6162 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6163 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6164 VectorizationFactor Result = VectorizationFactor::Disabled(); 6165 if (!EnableEpilogueVectorization) { 6166 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6167 return Result; 6168 } 6169 6170 if (!isScalarEpilogueAllowed()) { 6171 LLVM_DEBUG( 6172 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6173 "allowed.\n";); 6174 return Result; 6175 } 6176 6177 // FIXME: This can be fixed for scalable vectors later, because at this stage 6178 // the LoopVectorizer will only consider vectorizing a loop with scalable 6179 // vectors when the loop has a hint to enable vectorization for a given VF. 6180 if (MainLoopVF.isScalable()) { 6181 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6182 "yet supported.\n"); 6183 return Result; 6184 } 6185 6186 // Not really a cost consideration, but check for unsupported cases here to 6187 // simplify the logic. 6188 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6189 LLVM_DEBUG( 6190 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6191 "not a supported candidate.\n";); 6192 return Result; 6193 } 6194 6195 if (EpilogueVectorizationForceVF > 1) { 6196 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6197 if (LVP.hasPlanWithVFs( 6198 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6199 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6200 else { 6201 LLVM_DEBUG( 6202 dbgs() 6203 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6204 return Result; 6205 } 6206 } 6207 6208 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6209 TheLoop->getHeader()->getParent()->hasMinSize()) { 6210 LLVM_DEBUG( 6211 dbgs() 6212 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6213 return Result; 6214 } 6215 6216 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6217 return Result; 6218 6219 for (auto &NextVF : ProfitableVFs) 6220 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6221 (Result.Width.getFixedValue() == 1 || 6222 isMoreProfitable(NextVF, Result)) && 6223 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6224 Result = NextVF; 6225 6226 if (Result != VectorizationFactor::Disabled()) 6227 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6228 << Result.Width.getFixedValue() << "\n";); 6229 return Result; 6230 } 6231 6232 std::pair<unsigned, unsigned> 6233 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6234 unsigned MinWidth = -1U; 6235 unsigned MaxWidth = 8; 6236 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6237 6238 // For each block. 6239 for (BasicBlock *BB : TheLoop->blocks()) { 6240 // For each instruction in the loop. 6241 for (Instruction &I : BB->instructionsWithoutDebug()) { 6242 Type *T = I.getType(); 6243 6244 // Skip ignored values. 6245 if (ValuesToIgnore.count(&I)) 6246 continue; 6247 6248 // Only examine Loads, Stores and PHINodes. 6249 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6250 continue; 6251 6252 // Examine PHI nodes that are reduction variables. Update the type to 6253 // account for the recurrence type. 
6254 if (auto *PN = dyn_cast<PHINode>(&I)) { 6255 if (!Legal->isReductionVariable(PN)) 6256 continue; 6257 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6258 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6259 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6260 RdxDesc.getRecurrenceType(), 6261 TargetTransformInfo::ReductionFlags())) 6262 continue; 6263 T = RdxDesc.getRecurrenceType(); 6264 } 6265 6266 // Examine the stored values. 6267 if (auto *ST = dyn_cast<StoreInst>(&I)) 6268 T = ST->getValueOperand()->getType(); 6269 6270 // Ignore loaded pointer types and stored pointer types that are not 6271 // vectorizable. 6272 // 6273 // FIXME: The check here attempts to predict whether a load or store will 6274 // be vectorized. We only know this for certain after a VF has 6275 // been selected. Here, we assume that if an access can be 6276 // vectorized, it will be. We should also look at extending this 6277 // optimization to non-pointer types. 6278 // 6279 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6280 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6281 continue; 6282 6283 MinWidth = std::min(MinWidth, 6284 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6285 MaxWidth = std::max(MaxWidth, 6286 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6287 } 6288 } 6289 6290 return {MinWidth, MaxWidth}; 6291 } 6292 6293 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6294 unsigned LoopCost) { 6295 // -- The interleave heuristics -- 6296 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6297 // There are many micro-architectural considerations that we can't predict 6298 // at this level. For example, frontend pressure (on decode or fetch) due to 6299 // code size, or the number and capabilities of the execution ports. 6300 // 6301 // We use the following heuristics to select the interleave count: 6302 // 1. If the code has reductions, then we interleave to break the cross 6303 // iteration dependency. 6304 // 2. If the loop is really small, then we interleave to reduce the loop 6305 // overhead. 6306 // 3. We don't interleave if we think that we will spill registers to memory 6307 // due to the increased register pressure. 6308 6309 if (!isScalarEpilogueAllowed()) 6310 return 1; 6311 6312 // We used the distance for the interleave count. 6313 if (Legal->getMaxSafeDepDistBytes() != -1U) 6314 return 1; 6315 6316 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6317 const bool HasReductions = !Legal->getReductionVars().empty(); 6318 // Do not interleave loops with a relatively small known or estimated trip 6319 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6320 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6321 // because with the above conditions interleaving can expose ILP and break 6322 // cross iteration dependences for reductions. 6323 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6324 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6325 return 1; 6326 6327 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6328 // We divide by these constants so assume that we have at least one 6329 // instruction that uses at least one register. 6330 for (auto& pair : R.MaxLocalUsers) { 6331 pair.second = std::max(pair.second, 1U); 6332 } 6333 6334 // We calculate the interleave count using the following formula. 
6335 // Subtract the number of loop invariants from the number of available 6336 // registers. These registers are used by all of the interleaved instances. 6337 // Next, divide the remaining registers by the number of registers that is 6338 // required by the loop, in order to estimate how many parallel instances 6339 // fit without causing spills. All of this is rounded down if necessary to be 6340 // a power of two. We want power of two interleave count to simplify any 6341 // addressing operations or alignment considerations. 6342 // We also want power of two interleave counts to ensure that the induction 6343 // variable of the vector loop wraps to zero, when tail is folded by masking; 6344 // this currently happens when OptForSize, in which case IC is set to 1 above. 6345 unsigned IC = UINT_MAX; 6346 6347 for (auto& pair : R.MaxLocalUsers) { 6348 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6349 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6350 << " registers of " 6351 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6352 if (VF.isScalar()) { 6353 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6354 TargetNumRegisters = ForceTargetNumScalarRegs; 6355 } else { 6356 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6357 TargetNumRegisters = ForceTargetNumVectorRegs; 6358 } 6359 unsigned MaxLocalUsers = pair.second; 6360 unsigned LoopInvariantRegs = 0; 6361 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6362 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6363 6364 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6365 // Don't count the induction variable as interleaved. 6366 if (EnableIndVarRegisterHeur) { 6367 TmpIC = 6368 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6369 std::max(1U, (MaxLocalUsers - 1))); 6370 } 6371 6372 IC = std::min(IC, TmpIC); 6373 } 6374 6375 // Clamp the interleave ranges to reasonable counts. 6376 unsigned MaxInterleaveCount = 6377 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6378 6379 // Check if the user has overridden the max. 6380 if (VF.isScalar()) { 6381 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6382 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6383 } else { 6384 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6385 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6386 } 6387 6388 // If trip count is known or estimated compile time constant, limit the 6389 // interleave count to be less than the trip count divided by VF, provided it 6390 // is at least 1. 6391 // 6392 // For scalable vectors we can't know if interleaving is beneficial. It may 6393 // not be beneficial for small loops if none of the lanes in the second vector 6394 // iterations is enabled. However, for larger loops, there is likely to be a 6395 // similar benefit as for fixed-width vectors. For now, we choose to leave 6396 // the InterleaveCount as if vscale is '1', although if some information about 6397 // the vector is known (e.g. min vector size), we can make a better decision. 6398 if (BestKnownTC) { 6399 MaxInterleaveCount = 6400 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6401 // Make sure MaxInterleaveCount is greater than 0. 
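// (Worked example with illustrative numbers: an estimated trip count of 12
// and VF = 4 clamp MaxInterleaveCount to 12 / 4 = 3, assuming the target
// maximum is at least 3; a trip count of 3 with VF = 4 yields 0, which the
// std::max below restores to the minimum of 1.)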
6402 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6403 } 6404 6405 assert(MaxInterleaveCount > 0 && 6406 "Maximum interleave count must be greater than 0"); 6407 6408 // Clamp the calculated IC to be between the 1 and the max interleave count 6409 // that the target and trip count allows. 6410 if (IC > MaxInterleaveCount) 6411 IC = MaxInterleaveCount; 6412 else 6413 // Make sure IC is greater than 0. 6414 IC = std::max(1u, IC); 6415 6416 assert(IC > 0 && "Interleave count must be greater than 0."); 6417 6418 // If we did not calculate the cost for VF (because the user selected the VF) 6419 // then we calculate the cost of VF here. 6420 if (LoopCost == 0) { 6421 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6422 LoopCost = *expectedCost(VF).first.getValue(); 6423 } 6424 6425 assert(LoopCost && "Non-zero loop cost expected"); 6426 6427 // Interleave if we vectorized this loop and there is a reduction that could 6428 // benefit from interleaving. 6429 if (VF.isVector() && HasReductions) { 6430 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6431 return IC; 6432 } 6433 6434 // Note that if we've already vectorized the loop we will have done the 6435 // runtime check and so interleaving won't require further checks. 6436 bool InterleavingRequiresRuntimePointerCheck = 6437 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6438 6439 // We want to interleave small loops in order to reduce the loop overhead and 6440 // potentially expose ILP opportunities. 6441 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6442 << "LV: IC is " << IC << '\n' 6443 << "LV: VF is " << VF << '\n'); 6444 const bool AggressivelyInterleaveReductions = 6445 TTI.enableAggressiveInterleaving(HasReductions); 6446 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6447 // We assume that the cost overhead is 1 and we use the cost model 6448 // to estimate the cost of the loop and interleave until the cost of the 6449 // loop overhead is about 5% of the cost of the loop. 6450 unsigned SmallIC = 6451 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6452 6453 // Interleave until store/load ports (estimated by max interleave count) are 6454 // saturated. 6455 unsigned NumStores = Legal->getNumStores(); 6456 unsigned NumLoads = Legal->getNumLoads(); 6457 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6458 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6459 6460 // If we have a scalar reduction (vector reductions are already dealt with 6461 // by this point), we can increase the critical path length if the loop 6462 // we're interleaving is inside another loop. Limit, by default to 2, so the 6463 // critical path only gets increased by one reduction operation. 6464 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6465 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6466 SmallIC = std::min(SmallIC, F); 6467 StoresIC = std::min(StoresIC, F); 6468 LoadsIC = std::min(LoadsIC, F); 6469 } 6470 6471 if (EnableLoadStoreRuntimeInterleave && 6472 std::max(StoresIC, LoadsIC) > SmallIC) { 6473 LLVM_DEBUG( 6474 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6475 return std::max(StoresIC, LoadsIC); 6476 } 6477 6478 // If there are scalar reductions and TTI has enabled aggressive 6479 // interleaving for reductions, we will interleave to expose ILP. 
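// (Illustrative numbers: with IC = 8 and SmallIC = 2, the reduction branch
// below returns max(8 / 2, 2) = 4, while the plain small-loop branch would
// settle for SmallIC = 2.)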
6480 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 6481 AggressivelyInterleaveReductions) { 6482 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6483 // Interleave no less than SmallIC but not as aggressively as the normal IC 6484 // to satisfy the rare situation when resources are too limited. 6485 return std::max(IC / 2, SmallIC); 6486 } else { 6487 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 6488 return SmallIC; 6489 } 6490 } 6491 6492 // Interleave if this is a large loop (small loops are already dealt with by 6493 // this point) that could benefit from interleaving. 6494 if (AggressivelyInterleaveReductions) { 6495 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 6496 return IC; 6497 } 6498 6499 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 6500 return 1; 6501 } 6502 6503 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 6504 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 6505 // This function calculates the register usage by measuring the highest number 6506 // of values that are alive at a single location. Obviously, this is a very 6507 // rough estimation. We scan the loop in topological order and 6508 // assign a number to each instruction. We use RPO to ensure that defs are 6509 // met before their users. We assume that each instruction that has in-loop 6510 // users starts an interval. We record every time that an in-loop value is 6511 // used, so we have a list of the first and last occurrences of each 6512 // instruction. Next, we transpose this data structure into a multi-map that 6513 // holds the list of intervals that *end* at a specific location. This 6514 // multi-map allows us to perform a linear search. We scan the instructions linearly 6515 // and record each time that a new interval starts, by placing it in a set. 6516 // If we find this value in the multi-map then we remove it from the set. 6517 // The max register usage is the maximum size of the set. 6518 // We also search for instructions that are defined outside the loop, but are 6519 // used inside the loop. We need this number separately from the max-interval 6520 // usage number because when we unroll, loop-invariant values do not take 6521 // more registers. 6522 LoopBlocksDFS DFS(TheLoop); 6523 DFS.perform(LI); 6524 6525 RegisterUsage RU; 6526 6527 // Each 'key' in the map opens a new interval. The values 6528 // of the map are the index of the 'last seen' usage of the 6529 // instruction that is the key. 6530 using IntervalMap = DenseMap<Instruction *, unsigned>; 6531 6532 // Maps instruction to its index. 6533 SmallVector<Instruction *, 64> IdxToInstr; 6534 // Marks the end of each interval. 6535 IntervalMap EndPoint; 6536 // Saves the list of instruction indices that are used in the loop. 6537 SmallPtrSet<Instruction *, 8> Ends; 6538 // Saves the list of values that are used in the loop but are 6539 // defined outside the loop, such as arguments and constants. 6540 SmallPtrSet<Value *, 8> LoopInvariants; 6541 6542 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6543 for (Instruction &I : BB->instructionsWithoutDebug()) { 6544 IdxToInstr.push_back(&I); 6545 6546 // Save the end location of each USE. 6547 for (Value *U : I.operands()) { 6548 auto *Instr = dyn_cast<Instruction>(U); 6549 6550 // Ignore non-instruction values such as arguments, constants, etc. 6551 if (!Instr) 6552 continue; 6553 6554 // If this instruction is outside the loop then record it and continue.
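// (Illustrative case: an address or bound computed once in the preheader,
// such as a hoisted loop-invariant GEP, is recorded in LoopInvariants here
// instead of opening an interval, since unrolling does not multiply its
// register cost.)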
6555 if (!TheLoop->contains(Instr)) { 6556 LoopInvariants.insert(Instr); 6557 continue; 6558 } 6559 6560 // Overwrite previous end points. 6561 EndPoint[Instr] = IdxToInstr.size(); 6562 Ends.insert(Instr); 6563 } 6564 } 6565 } 6566 6567 // Saves the list of intervals that end with the index in 'key'. 6568 using InstrList = SmallVector<Instruction *, 2>; 6569 DenseMap<unsigned, InstrList> TransposeEnds; 6570 6571 // Transpose the EndPoints to a list of values that end at each index. 6572 for (auto &Interval : EndPoint) 6573 TransposeEnds[Interval.second].push_back(Interval.first); 6574 6575 SmallPtrSet<Instruction *, 8> OpenIntervals; 6576 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6577 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6578 6579 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6580 6581 // A lambda that gets the register usage for the given type and VF. 6582 const auto &TTICapture = TTI; 6583 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6584 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6585 return 0; 6586 return *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6587 }; 6588 6589 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6590 Instruction *I = IdxToInstr[i]; 6591 6592 // Remove all of the instructions that end at this location. 6593 InstrList &List = TransposeEnds[i]; 6594 for (Instruction *ToRemove : List) 6595 OpenIntervals.erase(ToRemove); 6596 6597 // Ignore instructions that are never used within the loop. 6598 if (!Ends.count(I)) 6599 continue; 6600 6601 // Skip ignored values. 6602 if (ValuesToIgnore.count(I)) 6603 continue; 6604 6605 // For each VF find the maximum usage of registers. 6606 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6607 // Count the number of live intervals. 6608 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6609 6610 if (VFs[j].isScalar()) { 6611 for (auto Inst : OpenIntervals) { 6612 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6613 if (RegUsage.find(ClassID) == RegUsage.end()) 6614 RegUsage[ClassID] = 1; 6615 else 6616 RegUsage[ClassID] += 1; 6617 } 6618 } else { 6619 collectUniformsAndScalars(VFs[j]); 6620 for (auto Inst : OpenIntervals) { 6621 // Skip ignored values for VF > 1. 6622 if (VecValuesToIgnore.count(Inst)) 6623 continue; 6624 if (isScalarAfterVectorization(Inst, VFs[j])) { 6625 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6626 if (RegUsage.find(ClassID) == RegUsage.end()) 6627 RegUsage[ClassID] = 1; 6628 else 6629 RegUsage[ClassID] += 1; 6630 } else { 6631 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6632 if (RegUsage.find(ClassID) == RegUsage.end()) 6633 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6634 else 6635 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6636 } 6637 } 6638 } 6639 6640 for (auto& pair : RegUsage) { 6641 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6642 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6643 else 6644 MaxUsages[j][pair.first] = pair.second; 6645 } 6646 } 6647 6648 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6649 << OpenIntervals.size() << '\n'); 6650 6651 // Add the current instruction to the list of open intervals. 
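// (Illustrative walk-through for a hypothetical block 'a = load p; b = a + x;
// store b, q': a's interval opens when a is visited and is erased again once
// the scan passes the index of its last use, so the size of OpenIntervals
// tracks how many in-loop values are live at each point.)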
6652 OpenIntervals.insert(I); 6653 } 6654 6655 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6656 SmallMapVector<unsigned, unsigned, 4> Invariant; 6657 6658 for (auto Inst : LoopInvariants) { 6659 unsigned Usage = 6660 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6661 unsigned ClassID = 6662 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6663 if (Invariant.find(ClassID) == Invariant.end()) 6664 Invariant[ClassID] = Usage; 6665 else 6666 Invariant[ClassID] += Usage; 6667 } 6668 6669 LLVM_DEBUG({ 6670 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6671 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6672 << " item\n"; 6673 for (const auto &pair : MaxUsages[i]) { 6674 dbgs() << "LV(REG): RegisterClass: " 6675 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6676 << " registers\n"; 6677 } 6678 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6679 << " item\n"; 6680 for (const auto &pair : Invariant) { 6681 dbgs() << "LV(REG): RegisterClass: " 6682 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6683 << " registers\n"; 6684 } 6685 }); 6686 6687 RU.LoopInvariantRegs = Invariant; 6688 RU.MaxLocalUsers = MaxUsages[i]; 6689 RUs[i] = RU; 6690 } 6691 6692 return RUs; 6693 } 6694 6695 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6696 // TODO: Cost model for emulated masked load/store is completely 6697 // broken. This hack guides the cost model to use an artificially 6698 // high enough value to practically disable vectorization with such 6699 // operations, except where previously deployed legality hack allowed 6700 // using very low cost values. This is to avoid regressions coming simply 6701 // from moving "masked load/store" check from legality to cost model. 6702 // Masked Load/Gather emulation was previously never allowed. 6703 // Limited number of Masked Store/Scatter emulation was allowed. 6704 assert(isPredicatedInst(I) && 6705 "Expecting a scalar emulated instruction"); 6706 return isa<LoadInst>(I) || 6707 (isa<StoreInst>(I) && 6708 NumPredStores > NumberOfStoresToPredicate); 6709 } 6710 6711 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6712 // If we aren't vectorizing the loop, or if we've already collected the 6713 // instructions to scalarize, there's nothing to do. Collection may already 6714 // have occurred if we have a user-selected VF and are now computing the 6715 // expected cost for interleaving. 6716 if (VF.isScalar() || VF.isZero() || 6717 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6718 return; 6719 6720 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6721 // not profitable to scalarize any instructions, the presence of VF in the 6722 // map will indicate that we've analyzed it already. 6723 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6724 6725 // Find all the instructions that are scalar with predication in the loop and 6726 // determine if it would be better to not if-convert the blocks they are in. 6727 // If so, we also record the instructions to scalarize. 6728 for (BasicBlock *BB : TheLoop->blocks()) { 6729 if (!blockNeedsPredication(BB)) 6730 continue; 6731 for (Instruction &I : *BB) 6732 if (isScalarWithPredication(&I)) { 6733 ScalarCostsTy ScalarCosts; 6734 // Do not apply discount logic if hacked cost is needed 6735 // for emulated masked memrefs. 
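// (Illustrative costs: if the vector form of a predicated chain costs 12 and
// its scalarized form, already scaled by block probability, costs 8, the
// discount of 4 is non-negative and the chain's scalar costs are recorded in
// ScalarCostsVF.)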
6736 if (!useEmulatedMaskMemRefHack(&I) && 6737 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6738 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6739 // Remember that BB will remain after vectorization. 6740 PredicatedBBsAfterVectorization.insert(BB); 6741 } 6742 } 6743 } 6744 6745 int LoopVectorizationCostModel::computePredInstDiscount( 6746 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6747 assert(!isUniformAfterVectorization(PredInst, VF) && 6748 "Instruction marked uniform-after-vectorization will be predicated"); 6749 6750 // Initialize the discount to zero, meaning that the scalar version and the 6751 // vector version cost the same. 6752 InstructionCost Discount = 0; 6753 6754 // Holds instructions to analyze. The instructions we visit are mapped in 6755 // ScalarCosts. Those instructions are the ones that would be scalarized if 6756 // we find that the scalar version costs less. 6757 SmallVector<Instruction *, 8> Worklist; 6758 6759 // Returns true if the given instruction can be scalarized. 6760 auto canBeScalarized = [&](Instruction *I) -> bool { 6761 // We only attempt to scalarize instructions forming a single-use chain 6762 // from the original predicated block that would otherwise be vectorized. 6763 // Although not strictly necessary, we give up on instructions we know will 6764 // already be scalar to avoid traversing chains that are unlikely to be 6765 // beneficial. 6766 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6767 isScalarAfterVectorization(I, VF)) 6768 return false; 6769 6770 // If the instruction is scalar with predication, it will be analyzed 6771 // separately. We ignore it within the context of PredInst. 6772 if (isScalarWithPredication(I)) 6773 return false; 6774 6775 // If any of the instruction's operands are uniform after vectorization, 6776 // the instruction cannot be scalarized. This prevents, for example, a 6777 // masked load from being scalarized. 6778 // 6779 // We assume we will only emit a value for lane zero of an instruction 6780 // marked uniform after vectorization, rather than VF identical values. 6781 // Thus, if we scalarize an instruction that uses a uniform, we would 6782 // create uses of values corresponding to the lanes we aren't emitting code 6783 // for. This behavior can be changed by allowing getScalarValue to clone 6784 // the lane zero values for uniforms rather than asserting. 6785 for (Use &U : I->operands()) 6786 if (auto *J = dyn_cast<Instruction>(U.get())) 6787 if (isUniformAfterVectorization(J, VF)) 6788 return false; 6789 6790 // Otherwise, we can scalarize the instruction. 6791 return true; 6792 }; 6793 6794 // Compute the expected cost discount from scalarizing the entire expression 6795 // feeding the predicated instruction. We currently only consider expressions 6796 // that are single-use instruction chains. 6797 Worklist.push_back(PredInst); 6798 while (!Worklist.empty()) { 6799 Instruction *I = Worklist.pop_back_val(); 6800 6801 // If we've already analyzed the instruction, there's nothing to do. 6802 if (ScalarCosts.find(I) != ScalarCosts.end()) 6803 continue; 6804 6805 // Compute the cost of the vector instruction. Note that this cost already 6806 // includes the scalarization overhead of the predicated instruction. 6807 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6808 6809 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6810 // the instruction as if it wasn't if-converted and instead remained in the 6811 // predicated block. We will scale this cost by block probability after 6812 // computing the scalarization overhead. 6813 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6814 InstructionCost ScalarCost = 6815 VF.getKnownMinValue() * 6816 getInstructionCost(I, ElementCount::getFixed(1)).first; 6817 6818 // Compute the scalarization overhead of needed insertelement instructions 6819 // and phi nodes. 6820 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6821 ScalarCost += TTI.getScalarizationOverhead( 6822 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6823 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6824 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6825 ScalarCost += 6826 VF.getKnownMinValue() * 6827 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6828 } 6829 6830 // Compute the scalarization overhead of needed extractelement 6831 // instructions. For each of the instruction's operands, if the operand can 6832 // be scalarized, add it to the worklist; otherwise, account for the 6833 // overhead. 6834 for (Use &U : I->operands()) 6835 if (auto *J = dyn_cast<Instruction>(U.get())) { 6836 assert(VectorType::isValidElementType(J->getType()) && 6837 "Instruction has non-scalar type"); 6838 if (canBeScalarized(J)) 6839 Worklist.push_back(J); 6840 else if (needsExtract(J, VF)) { 6841 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6842 ScalarCost += TTI.getScalarizationOverhead( 6843 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6844 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6845 } 6846 } 6847 6848 // Scale the total scalar cost by block probability. 6849 ScalarCost /= getReciprocalPredBlockProb(); 6850 6851 // Compute the discount. A non-negative discount means the vector version 6852 // of the instruction costs more, and scalarizing would be beneficial. 6853 Discount += VectorCost - ScalarCost; 6854 ScalarCosts[I] = ScalarCost; 6855 } 6856 6857 return *Discount.getValue(); 6858 } 6859 6860 LoopVectorizationCostModel::VectorizationCostTy 6861 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6862 VectorizationCostTy Cost; 6863 6864 // For each block. 6865 for (BasicBlock *BB : TheLoop->blocks()) { 6866 VectorizationCostTy BlockCost; 6867 6868 // For each instruction in the old loop. 6869 for (Instruction &I : BB->instructionsWithoutDebug()) { 6870 // Skip ignored values. 6871 if (ValuesToIgnore.count(&I) || 6872 (VF.isVector() && VecValuesToIgnore.count(&I))) 6873 continue; 6874 6875 VectorizationCostTy C = getInstructionCost(&I, VF); 6876 6877 // Check if we should override the cost. 6878 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6879 C.first = InstructionCost(ForceTargetInstructionCost); 6880 6881 BlockCost.first += C.first; 6882 BlockCost.second |= C.second; 6883 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6884 << " for VF " << VF << " For instruction: " << I 6885 << '\n'); 6886 } 6887 6888 // If we are vectorizing a predicated block, it will have been 6889 // if-converted. This means that the block's instructions (aside from 6890 // stores and instructions that may divide by zero) will now be 6891 // unconditionally executed. For the scalar case, we may not always execute 6892 // the predicated block, if it is an if-else block. Thus, scale the block's 6893 // cost by the probability of executing it. 
blockNeedsPredication from 6894 // Legal is used so as to not include all blocks in tail folded loops. 6895 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6896 BlockCost.first /= getReciprocalPredBlockProb(); 6897 6898 Cost.first += BlockCost.first; 6899 Cost.second |= BlockCost.second; 6900 } 6901 6902 return Cost; 6903 } 6904 6905 /// Gets Address Access SCEV after verifying that the access pattern 6906 /// is loop invariant except the induction variable dependence. 6907 /// 6908 /// This SCEV can be sent to the Target in order to estimate the address 6909 /// calculation cost. 6910 static const SCEV *getAddressAccessSCEV( 6911 Value *Ptr, 6912 LoopVectorizationLegality *Legal, 6913 PredicatedScalarEvolution &PSE, 6914 const Loop *TheLoop) { 6915 6916 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6917 if (!Gep) 6918 return nullptr; 6919 6920 // We are looking for a gep with all loop invariant indices except for one 6921 // which should be an induction variable. 6922 auto SE = PSE.getSE(); 6923 unsigned NumOperands = Gep->getNumOperands(); 6924 for (unsigned i = 1; i < NumOperands; ++i) { 6925 Value *Opd = Gep->getOperand(i); 6926 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6927 !Legal->isInductionVariable(Opd)) 6928 return nullptr; 6929 } 6930 6931 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6932 return PSE.getSCEV(Ptr); 6933 } 6934 6935 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6936 return Legal->hasStride(I->getOperand(0)) || 6937 Legal->hasStride(I->getOperand(1)); 6938 } 6939 6940 InstructionCost 6941 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6942 ElementCount VF) { 6943 assert(VF.isVector() && 6944 "Scalarization cost of instruction implies vectorization."); 6945 if (VF.isScalable()) 6946 return InstructionCost::getInvalid(); 6947 6948 Type *ValTy = getMemInstValueType(I); 6949 auto SE = PSE.getSE(); 6950 6951 unsigned AS = getLoadStoreAddressSpace(I); 6952 Value *Ptr = getLoadStorePointerOperand(I); 6953 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6954 6955 // Figure out whether the access is strided and get the stride value 6956 // if it's known in compile time 6957 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6958 6959 // Get the cost of the scalar memory instruction and address computation. 6960 InstructionCost Cost = 6961 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6962 6963 // Don't pass *I here, since it is scalar but will actually be part of a 6964 // vectorized loop where the user of it is a vectorized instruction. 6965 const Align Alignment = getLoadStoreAlignment(I); 6966 Cost += VF.getKnownMinValue() * 6967 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6968 AS, TTI::TCK_RecipThroughput); 6969 6970 // Get the overhead of the extractelement and insertelement instructions 6971 // we might create due to scalarization. 6972 Cost += getScalarizationOverhead(I, VF); 6973 6974 // If we have a predicated load/store, it will need extra i1 extracts and 6975 // conditional branches, but may not be executed for each vector lane. Scale 6976 // the cost by the probability of executing the predicated block. 
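// For example, assuming getReciprocalPredBlockProb() returns its usual value
// of 2 (a 50% block-execution heuristic), a scalarization cost of C becomes
// roughly C/2 here, before the per-lane i1 extract and branch costs below are
// added back unconditionally.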
6977 if (isPredicatedInst(I)) { 6978 Cost /= getReciprocalPredBlockProb(); 6979 6980 // Add the cost of an i1 extract and a branch 6981 auto *Vec_i1Ty = 6982 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6983 Cost += TTI.getScalarizationOverhead( 6984 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6985 /*Insert=*/false, /*Extract=*/true); 6986 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6987 6988 if (useEmulatedMaskMemRefHack(I)) 6989 // Artificially setting to a high enough value to practically disable 6990 // vectorization with such operations. 6991 Cost = 3000000; 6992 } 6993 6994 return Cost; 6995 } 6996 6997 InstructionCost 6998 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6999 ElementCount VF) { 7000 Type *ValTy = getMemInstValueType(I); 7001 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7002 Value *Ptr = getLoadStorePointerOperand(I); 7003 unsigned AS = getLoadStoreAddressSpace(I); 7004 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 7005 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7006 7007 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7008 "Stride should be 1 or -1 for consecutive memory access"); 7009 const Align Alignment = getLoadStoreAlignment(I); 7010 InstructionCost Cost = 0; 7011 if (Legal->isMaskRequired(I)) 7012 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7013 CostKind); 7014 else 7015 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7016 CostKind, I); 7017 7018 bool Reverse = ConsecutiveStride < 0; 7019 if (Reverse) 7020 Cost += 7021 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7022 return Cost; 7023 } 7024 7025 InstructionCost 7026 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7027 ElementCount VF) { 7028 assert(Legal->isUniformMemOp(*I)); 7029 7030 Type *ValTy = getMemInstValueType(I); 7031 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7032 const Align Alignment = getLoadStoreAlignment(I); 7033 unsigned AS = getLoadStoreAddressSpace(I); 7034 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7035 if (isa<LoadInst>(I)) { 7036 return TTI.getAddressComputationCost(ValTy) + 7037 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7038 CostKind) + 7039 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7040 } 7041 StoreInst *SI = cast<StoreInst>(I); 7042 7043 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7044 return TTI.getAddressComputationCost(ValTy) + 7045 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7046 CostKind) + 7047 (isLoopInvariantStoreValue 7048 ? 
0 7049 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7050 VF.getKnownMinValue() - 1)); 7051 } 7052 7053 InstructionCost 7054 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7055 ElementCount VF) { 7056 Type *ValTy = getMemInstValueType(I); 7057 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7058 const Align Alignment = getLoadStoreAlignment(I); 7059 const Value *Ptr = getLoadStorePointerOperand(I); 7060 7061 return TTI.getAddressComputationCost(VectorTy) + 7062 TTI.getGatherScatterOpCost( 7063 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7064 TargetTransformInfo::TCK_RecipThroughput, I); 7065 } 7066 7067 InstructionCost 7068 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7069 ElementCount VF) { 7070 // TODO: Once we have support for interleaving with scalable vectors 7071 // we can calculate the cost properly here. 7072 if (VF.isScalable()) 7073 return InstructionCost::getInvalid(); 7074 7075 Type *ValTy = getMemInstValueType(I); 7076 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7077 unsigned AS = getLoadStoreAddressSpace(I); 7078 7079 auto Group = getInterleavedAccessGroup(I); 7080 assert(Group && "Fail to get an interleaved access group."); 7081 7082 unsigned InterleaveFactor = Group->getFactor(); 7083 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7084 7085 // Holds the indices of existing members in an interleaved load group. 7086 // An interleaved store group doesn't need this as it doesn't allow gaps. 7087 SmallVector<unsigned, 4> Indices; 7088 if (isa<LoadInst>(I)) { 7089 for (unsigned i = 0; i < InterleaveFactor; i++) 7090 if (Group->getMember(i)) 7091 Indices.push_back(i); 7092 } 7093 7094 // Calculate the cost of the whole interleaved group. 7095 bool UseMaskForGaps = 7096 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 7097 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7098 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7099 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7100 7101 if (Group->isReverse()) { 7102 // TODO: Add support for reversed masked interleaved access. 7103 assert(!Legal->isMaskRequired(I) && 7104 "Reverse masked interleaved access not supported."); 7105 Cost += 7106 Group->getNumMembers() * 7107 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7108 } 7109 return Cost; 7110 } 7111 7112 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 7113 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7114 // Early exit for no inloop reductions 7115 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7116 return InstructionCost::getInvalid(); 7117 auto *VectorTy = cast<VectorType>(Ty); 7118 7119 // We are looking for one of the following patterns, and for its minimal acceptable cost: 7120 // reduce(mul(ext(A), ext(B))) or 7121 // reduce(mul(A, B)) or 7122 // reduce(ext(A)) or 7123 // reduce(A). 7124 // The basic idea is that we walk down the tree to do that, finding the root 7125 // reduction instruction in InLoopReductionImmediateChains. From there we find 7126 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7127 // of the components. If the reduction cost is lower, we return it for the 7128 // reduction instruction and 0 for the other instructions in the pattern. If 7129 // it is not, we return an invalid cost specifying that the original cost method 7130 // should be used.
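// Illustrative walk (IR names assumed, not taken from a particular test):
//   %e = sext i8 %x to i32
//   %m = mul i32 %e, %f
//   %r = add i32 %m, %red.phi
// Starting at %e, RetI is advanced through %m to %r below, and %r is the
// instruction that must appear in InLoopReductionImmediateChains for the
// pattern to be treated as an in-loop reduction.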
7131 Instruction *RetI = I; 7132 if ((RetI->getOpcode() == Instruction::SExt || 7133 RetI->getOpcode() == Instruction::ZExt)) { 7134 if (!RetI->hasOneUser()) 7135 return InstructionCost::getInvalid(); 7136 RetI = RetI->user_back(); 7137 } 7138 if (RetI->getOpcode() == Instruction::Mul && 7139 RetI->user_back()->getOpcode() == Instruction::Add) { 7140 if (!RetI->hasOneUser()) 7141 return InstructionCost::getInvalid(); 7142 RetI = RetI->user_back(); 7143 } 7144 7145 // Test if the found instruction is a reduction, and if not return an invalid 7146 // cost specifying the parent to use the original cost modelling. 7147 if (!InLoopReductionImmediateChains.count(RetI)) 7148 return InstructionCost::getInvalid(); 7149 7150 // Find the reduction this chain is a part of and calculate the basic cost of 7151 // the reduction on its own. 7152 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7153 Instruction *ReductionPhi = LastChain; 7154 while (!isa<PHINode>(ReductionPhi)) 7155 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7156 7157 RecurrenceDescriptor RdxDesc = 7158 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7159 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7160 RdxDesc.getOpcode(), VectorTy, false, CostKind); 7161 7162 // Get the operand that was not the reduction chain and match it to one of the 7163 // patterns, returning the better cost if it is found. 7164 Instruction *RedOp = RetI->getOperand(1) == LastChain 7165 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7166 : dyn_cast<Instruction>(RetI->getOperand(1)); 7167 7168 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7169 7170 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 7171 !TheLoop->isLoopInvariant(RedOp)) { 7172 bool IsUnsigned = isa<ZExtInst>(RedOp); 7173 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7174 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7175 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7176 CostKind); 7177 7178 InstructionCost ExtCost = 7179 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7180 TTI::CastContextHint::None, CostKind, RedOp); 7181 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7182 return I == RetI ? *RedCost.getValue() : 0; 7183 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7184 Instruction *Mul = RedOp; 7185 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7186 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7187 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7188 Op0->getOpcode() == Op1->getOpcode() && 7189 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7190 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7191 bool IsUnsigned = isa<ZExtInst>(Op0); 7192 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7193 // reduce(mul(ext, ext)) 7194 InstructionCost ExtCost = 7195 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7196 TTI::CastContextHint::None, CostKind, Op0); 7197 InstructionCost MulCost = 7198 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7199 7200 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7201 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7202 CostKind); 7203 7204 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7205 return I == RetI ? 
*RedCost.getValue() : 0; 7206 } else { 7207 InstructionCost MulCost = 7208 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7209 7210 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7211 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7212 CostKind); 7213 7214 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7215 return I == RetI ? *RedCost.getValue() : 0; 7216 } 7217 } 7218 7219 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7220 } 7221 7222 InstructionCost 7223 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7224 ElementCount VF) { 7225 // Calculate scalar cost only. Vectorization cost should be ready at this 7226 // moment. 7227 if (VF.isScalar()) { 7228 Type *ValTy = getMemInstValueType(I); 7229 const Align Alignment = getLoadStoreAlignment(I); 7230 unsigned AS = getLoadStoreAddressSpace(I); 7231 7232 return TTI.getAddressComputationCost(ValTy) + 7233 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7234 TTI::TCK_RecipThroughput, I); 7235 } 7236 return getWideningCost(I, VF); 7237 } 7238 7239 LoopVectorizationCostModel::VectorizationCostTy 7240 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7241 ElementCount VF) { 7242 // If we know that this instruction will remain uniform, check the cost of 7243 // the scalar version. 7244 if (isUniformAfterVectorization(I, VF)) 7245 VF = ElementCount::getFixed(1); 7246 7247 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7248 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7249 7250 // Forced scalars do not have any scalarization overhead. 7251 auto ForcedScalar = ForcedScalars.find(VF); 7252 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7253 auto InstSet = ForcedScalar->second; 7254 if (InstSet.count(I)) 7255 return VectorizationCostTy( 7256 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7257 VF.getKnownMinValue()), 7258 false); 7259 } 7260 7261 Type *VectorTy; 7262 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7263 7264 bool TypeNotScalarized = 7265 VF.isVector() && VectorTy->isVectorTy() && 7266 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7267 return VectorizationCostTy(C, TypeNotScalarized); 7268 } 7269 7270 InstructionCost 7271 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7272 ElementCount VF) const { 7273 7274 if (VF.isScalable()) 7275 return InstructionCost::getInvalid(); 7276 7277 if (VF.isScalar()) 7278 return 0; 7279 7280 InstructionCost Cost = 0; 7281 Type *RetTy = ToVectorTy(I->getType(), VF); 7282 if (!RetTy->isVoidTy() && 7283 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7284 Cost += TTI.getScalarizationOverhead( 7285 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7286 true, false); 7287 7288 // Some targets keep addresses scalar. 7289 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7290 return Cost; 7291 7292 // Some targets support efficient element stores. 7293 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7294 return Cost; 7295 7296 // Collect operands to consider. 7297 CallInst *CI = dyn_cast<CallInst>(I); 7298 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7299 7300 // Skip operands that do not require extraction/scalarization and do not incur 7301 // any overhead. 
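// Roughly, operands that are constants, live-ins, or already scalar after
// vectorization need no extractelement and are filtered out below; only the
// remaining operands are priced as lane extracts.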
7302 SmallVector<Type *> Tys; 7303 for (auto *V : filterExtractingOperands(Ops, VF)) 7304 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7305 return Cost + TTI.getOperandsScalarizationOverhead( 7306 filterExtractingOperands(Ops, VF), Tys); 7307 } 7308 7309 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7310 if (VF.isScalar()) 7311 return; 7312 NumPredStores = 0; 7313 for (BasicBlock *BB : TheLoop->blocks()) { 7314 // For each instruction in the old loop. 7315 for (Instruction &I : *BB) { 7316 Value *Ptr = getLoadStorePointerOperand(&I); 7317 if (!Ptr) 7318 continue; 7319 7320 // TODO: We should generate better code and update the cost model for 7321 // predicated uniform stores. Today they are treated as any other 7322 // predicated store (see added test cases in 7323 // invariant-store-vectorization.ll). 7324 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7325 NumPredStores++; 7326 7327 if (Legal->isUniformMemOp(I)) { 7328 // TODO: Avoid replicating loads and stores instead of 7329 // relying on instcombine to remove them. 7330 // Load: Scalar load + broadcast 7331 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7332 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7333 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7334 continue; 7335 } 7336 7337 // We assume that widening is the best solution when possible. 7338 if (memoryInstructionCanBeWidened(&I, VF)) { 7339 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7340 int ConsecutiveStride = 7341 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7342 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7343 "Expected consecutive stride."); 7344 InstWidening Decision = 7345 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7346 setWideningDecision(&I, VF, Decision, Cost); 7347 continue; 7348 } 7349 7350 // Choose between Interleaving, Gather/Scatter or Scalarization. 7351 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7352 unsigned NumAccesses = 1; 7353 if (isAccessInterleaved(&I)) { 7354 auto Group = getInterleavedAccessGroup(&I); 7355 assert(Group && "Fail to get an interleaved access group."); 7356 7357 // Make one decision for the whole group. 7358 if (getWideningDecision(&I, VF) != CM_Unknown) 7359 continue; 7360 7361 NumAccesses = Group->getNumMembers(); 7362 if (interleavedAccessCanBeWidened(&I, VF)) 7363 InterleaveCost = getInterleaveGroupCost(&I, VF); 7364 } 7365 7366 InstructionCost GatherScatterCost = 7367 isLegalGatherOrScatter(&I) 7368 ? getGatherScatterCost(&I, VF) * NumAccesses 7369 : InstructionCost::getInvalid(); 7370 7371 InstructionCost ScalarizationCost = 7372 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7373 7374 // Choose better solution for the current VF, 7375 // write down this decision and use it during vectorization. 7376 InstructionCost Cost; 7377 InstWidening Decision; 7378 if (InterleaveCost <= GatherScatterCost && 7379 InterleaveCost < ScalarizationCost) { 7380 Decision = CM_Interleave; 7381 Cost = InterleaveCost; 7382 } else if (GatherScatterCost < ScalarizationCost) { 7383 Decision = CM_GatherScatter; 7384 Cost = GatherScatterCost; 7385 } else { 7386 assert(!VF.isScalable() && 7387 "We cannot yet scalarise for scalable vectors"); 7388 Decision = CM_Scalarize; 7389 Cost = ScalarizationCost; 7390 } 7391 // If the instructions belongs to an interleave group, the whole group 7392 // receives the same decision. 
The whole group receives the cost, but 7393 // the cost will actually be assigned to one instruction. 7394 if (auto Group = getInterleavedAccessGroup(&I)) 7395 setWideningDecision(Group, VF, Decision, Cost); 7396 else 7397 setWideningDecision(&I, VF, Decision, Cost); 7398 } 7399 } 7400 7401 // Make sure that any load of address and any other address computation 7402 // remains scalar unless there is gather/scatter support. This avoids 7403 // inevitable extracts into address registers, and also has the benefit of 7404 // activating LSR more, since that pass can't optimize vectorized 7405 // addresses. 7406 if (TTI.prefersVectorizedAddressing()) 7407 return; 7408 7409 // Start with all scalar pointer uses. 7410 SmallPtrSet<Instruction *, 8> AddrDefs; 7411 for (BasicBlock *BB : TheLoop->blocks()) 7412 for (Instruction &I : *BB) { 7413 Instruction *PtrDef = 7414 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7415 if (PtrDef && TheLoop->contains(PtrDef) && 7416 getWideningDecision(&I, VF) != CM_GatherScatter) 7417 AddrDefs.insert(PtrDef); 7418 } 7419 7420 // Add all instructions used to generate the addresses. 7421 SmallVector<Instruction *, 4> Worklist; 7422 append_range(Worklist, AddrDefs); 7423 while (!Worklist.empty()) { 7424 Instruction *I = Worklist.pop_back_val(); 7425 for (auto &Op : I->operands()) 7426 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7427 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7428 AddrDefs.insert(InstOp).second) 7429 Worklist.push_back(InstOp); 7430 } 7431 7432 for (auto *I : AddrDefs) { 7433 if (isa<LoadInst>(I)) { 7434 // Setting the desired widening decision should ideally be handled in 7435 // by cost functions, but since this involves the task of finding out 7436 // if the loaded register is involved in an address computation, it is 7437 // instead changed here when we know this is the case. 7438 InstWidening Decision = getWideningDecision(I, VF); 7439 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7440 // Scalarize a widened load of address. 7441 setWideningDecision( 7442 I, VF, CM_Scalarize, 7443 (VF.getKnownMinValue() * 7444 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7445 else if (auto Group = getInterleavedAccessGroup(I)) { 7446 // Scalarize an interleave group of address loads. 7447 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7448 if (Instruction *Member = Group->getMember(I)) 7449 setWideningDecision( 7450 Member, VF, CM_Scalarize, 7451 (VF.getKnownMinValue() * 7452 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7453 } 7454 } 7455 } else 7456 // Make sure I gets scalarized and a cost estimate without 7457 // scalarization overhead. 
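// Instructions placed in ForcedScalars are later priced by getInstructionCost
// as VF copies of their scalar cost, with no insert/extract overhead added.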
7458 ForcedScalars[VF].insert(I); 7459 } 7460 } 7461 7462 InstructionCost 7463 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7464 Type *&VectorTy) { 7465 Type *RetTy = I->getType(); 7466 if (canTruncateToMinimalBitwidth(I, VF)) 7467 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7468 auto SE = PSE.getSE(); 7469 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7470 7471 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7472 ElementCount VF) -> bool { 7473 if (VF.isScalar()) 7474 return true; 7475 7476 auto Scalarized = InstsToScalarize.find(VF); 7477 assert(Scalarized != InstsToScalarize.end() && 7478 "VF not yet analyzed for scalarization profitability"); 7479 return !Scalarized->second.count(I) && 7480 llvm::all_of(I->users(), [&](User *U) { 7481 auto *UI = cast<Instruction>(U); 7482 return !Scalarized->second.count(UI); 7483 }); 7484 }; 7485 (void) hasSingleCopyAfterVectorization; 7486 7487 if (isScalarAfterVectorization(I, VF)) { 7488 // With the exception of GEPs and PHIs, after scalarization there should 7489 // only be one copy of the instruction generated in the loop. This is 7490 // because the VF is either 1, or any instructions that need scalarizing 7491 // have already been dealt with by the the time we get here. As a result, 7492 // it means we don't have to multiply the instruction cost by VF. 7493 assert(I->getOpcode() == Instruction::GetElementPtr || 7494 I->getOpcode() == Instruction::PHI || 7495 (I->getOpcode() == Instruction::BitCast && 7496 I->getType()->isPointerTy()) || 7497 hasSingleCopyAfterVectorization(I, VF)); 7498 VectorTy = RetTy; 7499 } else 7500 VectorTy = ToVectorTy(RetTy, VF); 7501 7502 // TODO: We need to estimate the cost of intrinsic calls. 7503 switch (I->getOpcode()) { 7504 case Instruction::GetElementPtr: 7505 // We mark this instruction as zero-cost because the cost of GEPs in 7506 // vectorized code depends on whether the corresponding memory instruction 7507 // is scalarized or not. Therefore, we handle GEPs with the memory 7508 // instruction cost. 7509 return 0; 7510 case Instruction::Br: { 7511 // In cases of scalarized and predicated instructions, there will be VF 7512 // predicated blocks in the vectorized loop. Each branch around these 7513 // blocks requires also an extract of its vector compare i1 element. 7514 bool ScalarPredicatedBB = false; 7515 BranchInst *BI = cast<BranchInst>(I); 7516 if (VF.isVector() && BI->isConditional() && 7517 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7518 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7519 ScalarPredicatedBB = true; 7520 7521 if (ScalarPredicatedBB) { 7522 // Return cost for branches around scalarized and predicated blocks. 7523 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7524 auto *Vec_i1Ty = 7525 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7526 return (TTI.getScalarizationOverhead( 7527 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7528 false, true) + 7529 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7530 VF.getKnownMinValue())); 7531 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7532 // The back-edge branch will remain, as will all scalar branches. 7533 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7534 else 7535 // This branch will be eliminated by if-conversion. 
7536 return 0; 7537 // Note: We currently assume zero cost for an unconditional branch inside 7538 // a predicated block since it will become a fall-through, although we 7539 // may decide in the future to call TTI for all branches. 7540 } 7541 case Instruction::PHI: { 7542 auto *Phi = cast<PHINode>(I); 7543 7544 // First-order recurrences are replaced by vector shuffles inside the loop. 7545 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7546 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7547 return TTI.getShuffleCost( 7548 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7549 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7550 7551 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7552 // converted into select instructions. We require N - 1 selects per phi 7553 // node, where N is the number of incoming values. 7554 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7555 return (Phi->getNumIncomingValues() - 1) * 7556 TTI.getCmpSelInstrCost( 7557 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7558 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7559 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7560 7561 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7562 } 7563 case Instruction::UDiv: 7564 case Instruction::SDiv: 7565 case Instruction::URem: 7566 case Instruction::SRem: 7567 // If we have a predicated instruction, it may not be executed for each 7568 // vector lane. Get the scalarization cost and scale this amount by the 7569 // probability of executing the predicated block. If the instruction is not 7570 // predicated, we fall through to the next case. 7571 if (VF.isVector() && isScalarWithPredication(I)) { 7572 InstructionCost Cost = 0; 7573 7574 // These instructions have a non-void type, so account for the phi nodes 7575 // that we will create. This cost is likely to be zero. The phi node 7576 // cost, if any, should be scaled by the block probability because it 7577 // models a copy at the end of each predicated block. 7578 Cost += VF.getKnownMinValue() * 7579 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7580 7581 // The cost of the non-predicated instruction. 7582 Cost += VF.getKnownMinValue() * 7583 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7584 7585 // The cost of insertelement and extractelement instructions needed for 7586 // scalarization. 7587 Cost += getScalarizationOverhead(I, VF); 7588 7589 // Scale the cost by the probability of executing the predicated blocks. 7590 // This assumes the predicated block for each vector lane is equally 7591 // likely. 7592 return Cost / getReciprocalPredBlockProb(); 7593 } 7594 LLVM_FALLTHROUGH; 7595 case Instruction::Add: 7596 case Instruction::FAdd: 7597 case Instruction::Sub: 7598 case Instruction::FSub: 7599 case Instruction::Mul: 7600 case Instruction::FMul: 7601 case Instruction::FDiv: 7602 case Instruction::FRem: 7603 case Instruction::Shl: 7604 case Instruction::LShr: 7605 case Instruction::AShr: 7606 case Instruction::And: 7607 case Instruction::Or: 7608 case Instruction::Xor: { 7609 // Since we will replace the stride by 1 the multiplication should go away. 
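// For instance, if the loop was versioned on a symbolic stride being one, a
// multiply such as "%off = mul i64 %i, %stride" (hypothetical IR) folds away
// once %stride is replaced by the constant 1, so it is given zero cost here.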
7610 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7611 return 0; 7612 7613 // Detect reduction patterns 7614 InstructionCost RedCost; 7615 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7616 .isValid()) 7617 return RedCost; 7618 7619 // Certain instructions can be cheaper to vectorize if they have a constant 7620 // second vector operand. One example of this are shifts on x86. 7621 Value *Op2 = I->getOperand(1); 7622 TargetTransformInfo::OperandValueProperties Op2VP; 7623 TargetTransformInfo::OperandValueKind Op2VK = 7624 TTI.getOperandInfo(Op2, Op2VP); 7625 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7626 Op2VK = TargetTransformInfo::OK_UniformValue; 7627 7628 SmallVector<const Value *, 4> Operands(I->operand_values()); 7629 return TTI.getArithmeticInstrCost( 7630 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7631 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7632 } 7633 case Instruction::FNeg: { 7634 return TTI.getArithmeticInstrCost( 7635 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7636 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7637 TargetTransformInfo::OP_None, I->getOperand(0), I); 7638 } 7639 case Instruction::Select: { 7640 SelectInst *SI = cast<SelectInst>(I); 7641 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7642 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7643 7644 const Value *Op0, *Op1; 7645 using namespace llvm::PatternMatch; 7646 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7647 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7648 // select x, y, false --> x & y 7649 // select x, true, y --> x | y 7650 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7651 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7652 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7653 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7654 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7655 Op1->getType()->getScalarSizeInBits() == 1); 7656 7657 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7658 return TTI.getArithmeticInstrCost( 7659 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7660 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7661 } 7662 7663 Type *CondTy = SI->getCondition()->getType(); 7664 if (!ScalarCond) 7665 CondTy = VectorType::get(CondTy, VF); 7666 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7667 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7668 } 7669 case Instruction::ICmp: 7670 case Instruction::FCmp: { 7671 Type *ValTy = I->getOperand(0)->getType(); 7672 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7673 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7674 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7675 VectorTy = ToVectorTy(ValTy, VF); 7676 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7677 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7678 } 7679 case Instruction::Store: 7680 case Instruction::Load: { 7681 ElementCount Width = VF; 7682 if (Width.isVector()) { 7683 InstWidening Decision = getWideningDecision(I, Width); 7684 assert(Decision != CM_Unknown && 7685 "CM decision should be taken at this point"); 7686 if (Decision == CM_Scalarize) 7687 Width = ElementCount::getFixed(1); 7688 } 7689 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7690 return getMemoryInstructionCost(I, VF); 7691 } 7692 case Instruction::BitCast: 7693 if (I->getType()->isPointerTy()) 7694 return 0; 7695 LLVM_FALLTHROUGH; 7696 case Instruction::ZExt: 7697 case Instruction::SExt: 7698 case Instruction::FPToUI: 7699 case Instruction::FPToSI: 7700 case Instruction::FPExt: 7701 case Instruction::PtrToInt: 7702 case Instruction::IntToPtr: 7703 case Instruction::SIToFP: 7704 case Instruction::UIToFP: 7705 case Instruction::Trunc: 7706 case Instruction::FPTrunc: { 7707 // Computes the CastContextHint from a Load/Store instruction. 7708 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7709 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7710 "Expected a load or a store!"); 7711 7712 if (VF.isScalar() || !TheLoop->contains(I)) 7713 return TTI::CastContextHint::Normal; 7714 7715 switch (getWideningDecision(I, VF)) { 7716 case LoopVectorizationCostModel::CM_GatherScatter: 7717 return TTI::CastContextHint::GatherScatter; 7718 case LoopVectorizationCostModel::CM_Interleave: 7719 return TTI::CastContextHint::Interleave; 7720 case LoopVectorizationCostModel::CM_Scalarize: 7721 case LoopVectorizationCostModel::CM_Widen: 7722 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7723 : TTI::CastContextHint::Normal; 7724 case LoopVectorizationCostModel::CM_Widen_Reverse: 7725 return TTI::CastContextHint::Reversed; 7726 case LoopVectorizationCostModel::CM_Unknown: 7727 llvm_unreachable("Instr did not go through cost modelling?"); 7728 } 7729 7730 llvm_unreachable("Unhandled case!"); 7731 }; 7732 7733 unsigned Opcode = I->getOpcode(); 7734 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7735 // For Trunc, the context is the only user, which must be a StoreInst. 7736 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7737 if (I->hasOneUse()) 7738 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7739 CCH = ComputeCCH(Store); 7740 } 7741 // For Z/Sext, the context is the operand, which must be a LoadInst. 
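// As an example of the ComputeCCH mapping above: a zext fed by a load that
// was chosen as CM_Widen_Reverse is costed with CastContextHint::Reversed,
// while a CM_Widen or CM_Scalarize load yields Masked or Normal depending on
// whether a mask is required.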
7742 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7743 Opcode == Instruction::FPExt) { 7744 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7745 CCH = ComputeCCH(Load); 7746 } 7747 7748 // We optimize the truncation of induction variables having constant 7749 // integer steps. The cost of these truncations is the same as the scalar 7750 // operation. 7751 if (isOptimizableIVTruncate(I, VF)) { 7752 auto *Trunc = cast<TruncInst>(I); 7753 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7754 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7755 } 7756 7757 // Detect reduction patterns 7758 InstructionCost RedCost; 7759 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7760 .isValid()) 7761 return RedCost; 7762 7763 Type *SrcScalarTy = I->getOperand(0)->getType(); 7764 Type *SrcVecTy = 7765 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7766 if (canTruncateToMinimalBitwidth(I, VF)) { 7767 // This cast is going to be shrunk. This may remove the cast or it might 7768 // turn it into slightly different cast. For example, if MinBW == 16, 7769 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7770 // 7771 // Calculate the modified src and dest types. 7772 Type *MinVecTy = VectorTy; 7773 if (Opcode == Instruction::Trunc) { 7774 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7775 VectorTy = 7776 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7777 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7778 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7779 VectorTy = 7780 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7781 } 7782 } 7783 7784 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7785 } 7786 case Instruction::Call: { 7787 bool NeedToScalarize; 7788 CallInst *CI = cast<CallInst>(I); 7789 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7790 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7791 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7792 return std::min(CallCost, IntrinsicCost); 7793 } 7794 return CallCost; 7795 } 7796 case Instruction::ExtractValue: 7797 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7798 default: 7799 // This opcode is unknown. Assume that it is the same as 'mul'. 7800 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7801 } // end of switch. 
7802 } 7803 7804 char LoopVectorize::ID = 0; 7805 7806 static const char lv_name[] = "Loop Vectorization"; 7807 7808 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7809 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7810 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7811 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7812 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7813 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7814 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7815 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7816 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7817 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7818 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7819 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7820 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7821 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7822 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7823 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7824 7825 namespace llvm { 7826 7827 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7828 7829 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7830 bool VectorizeOnlyWhenForced) { 7831 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7832 } 7833 7834 } // end namespace llvm 7835 7836 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7837 // Check if the pointer operand of a load or store instruction is 7838 // consecutive. 7839 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7840 return Legal->isConsecutivePtr(Ptr); 7841 return false; 7842 } 7843 7844 void LoopVectorizationCostModel::collectValuesToIgnore() { 7845 // Ignore ephemeral values. 7846 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7847 7848 // Ignore type-promoting instructions we identified during reduction 7849 // detection. 7850 for (auto &Reduction : Legal->getReductionVars()) { 7851 RecurrenceDescriptor &RedDes = Reduction.second; 7852 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7853 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7854 } 7855 // Ignore type-casting instructions we identified during induction 7856 // detection. 7857 for (auto &Induction : Legal->getInductionVars()) { 7858 InductionDescriptor &IndDes = Induction.second; 7859 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7860 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7861 } 7862 } 7863 7864 void LoopVectorizationCostModel::collectInLoopReductions() { 7865 for (auto &Reduction : Legal->getReductionVars()) { 7866 PHINode *Phi = Reduction.first; 7867 RecurrenceDescriptor &RdxDesc = Reduction.second; 7868 7869 // We don't collect reductions that are type promoted (yet). 7870 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7871 continue; 7872 7873 // If the target would prefer this reduction to happen "in-loop", then we 7874 // want to record it as such. 7875 unsigned Opcode = RdxDesc.getOpcode(); 7876 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7877 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7878 TargetTransformInfo::ReductionFlags())) 7879 continue; 7880 7881 // Check that we can correctly put the reductions into the loop, by 7882 // finding the chain of operations that leads from the phi to the loop 7883 // exit value. 
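// Sketch of the bookkeeping below (IR names assumed): for a sum reduction
//   %red = phi i32 [ 0, %ph ], [ %add, %latch ]
//   %add = add i32 %red, %val
// getReductionOpChain returns {%add}, and InLoopReductionImmediateChains maps
// %add back to %red, which is the link getReductionPatternCost follows from an
// arithmetic instruction to its reduction phi.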
7884 SmallVector<Instruction *, 4> ReductionOperations = 7885 RdxDesc.getReductionOpChain(Phi, TheLoop); 7886 bool InLoop = !ReductionOperations.empty(); 7887 if (InLoop) { 7888 InLoopReductionChains[Phi] = ReductionOperations; 7889 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7890 Instruction *LastChain = Phi; 7891 for (auto *I : ReductionOperations) { 7892 InLoopReductionImmediateChains[I] = LastChain; 7893 LastChain = I; 7894 } 7895 } 7896 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7897 << " reduction for phi: " << *Phi << "\n"); 7898 } 7899 } 7900 7901 // TODO: we could return a pair of values that specify the max VF and 7902 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7903 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7904 // doesn't have a cost model that can choose which plan to execute if 7905 // more than one is generated. 7906 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7907 LoopVectorizationCostModel &CM) { 7908 unsigned WidestType; 7909 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7910 return WidestVectorRegBits / WidestType; 7911 } 7912 7913 VectorizationFactor 7914 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7915 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7916 ElementCount VF = UserVF; 7917 // Outer loop handling: They may require CFG and instruction level 7918 // transformations before even evaluating whether vectorization is profitable. 7919 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7920 // the vectorization pipeline. 7921 if (!OrigLoop->isInnermost()) { 7922 // If the user doesn't provide a vectorization factor, determine a 7923 // reasonable one. 7924 if (UserVF.isZero()) { 7925 VF = ElementCount::getFixed(determineVPlanVF( 7926 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7927 .getFixedSize(), 7928 CM)); 7929 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7930 7931 // Make sure we have a VF > 1 for stress testing. 7932 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7933 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7934 << "overriding computed VF.\n"); 7935 VF = ElementCount::getFixed(4); 7936 } 7937 } 7938 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7939 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7940 "VF needs to be a power of two"); 7941 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7942 << "VF " << VF << " to build VPlans.\n"); 7943 buildVPlans(VF, VF); 7944 7945 // For VPlan build stress testing, we bail out after VPlan construction. 7946 if (VPlanBuildStressTest) 7947 return VectorizationFactor::Disabled(); 7948 7949 return {VF, 0 /*Cost*/}; 7950 } 7951 7952 LLVM_DEBUG( 7953 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7954 "VPlan-native path.\n"); 7955 return VectorizationFactor::Disabled(); 7956 } 7957 7958 Optional<VectorizationFactor> 7959 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7960 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7961 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7962 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7963 return None; 7964 7965 // Invalidate interleave groups if all blocks of loop will be predicated. 
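// (The header is only expected to need predication when the tail is folded by
// masking, so this effectively asks whether tail folding is in effect while
// the target lacks masked-interleaved support.)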
7966 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7967 !useMaskedInterleavedAccesses(*TTI)) { 7968 LLVM_DEBUG( 7969 dbgs() 7970 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7971 "which requires masked-interleaved support.\n"); 7972 if (CM.InterleaveInfo.invalidateGroups()) 7973 // Invalidating interleave groups also requires invalidating all decisions 7974 // based on them, which includes widening decisions and uniform and scalar 7975 // values. 7976 CM.invalidateCostModelingDecisions(); 7977 } 7978 7979 ElementCount MaxUserVF = 7980 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7981 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7982 if (!UserVF.isZero() && UserVFIsLegal) { 7983 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7984 << " VF " << UserVF << ".\n"); 7985 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7986 "VF needs to be a power of two"); 7987 // Collect the instructions (and their associated costs) that will be more 7988 // profitable to scalarize. 7989 CM.selectUserVectorizationFactor(UserVF); 7990 CM.collectInLoopReductions(); 7991 buildVPlansWithVPRecipes({UserVF}, {UserVF}); 7992 LLVM_DEBUG(printPlans(dbgs())); 7993 return {{UserVF, 0}}; 7994 } 7995 7996 ElementCount MaxVF = MaxFactors.FixedVF; 7997 assert(!MaxVF.isScalable() && 7998 "Scalable vectors not yet supported beyond this point"); 7999 8000 for (ElementCount VF = ElementCount::getFixed(1); 8001 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 8002 // Collect Uniform and Scalar instructions after vectorization with VF. 8003 CM.collectUniformsAndScalars(VF); 8004 8005 // Collect the instructions (and their associated costs) that will be more 8006 // profitable to scalarize. 8007 if (VF.isVector()) 8008 CM.collectInstsToScalarize(VF); 8009 } 8010 8011 CM.collectInLoopReductions(); 8012 8013 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 8014 LLVM_DEBUG(printPlans(dbgs())); 8015 if (!MaxFactors.hasVector()) 8016 return VectorizationFactor::Disabled(); 8017 8018 // Select the optimal vectorization factor. 8019 auto SelectedVF = CM.selectVectorizationFactor(MaxVF); 8020 8021 // Check if it is profitable to vectorize with runtime checks. 
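// Rough shape of the decision below: with N required runtime pointer checks,
// vectorization is abandoned when N exceeds the pragma threshold, or when N
// exceeds VectorizerParams::RuntimeMemoryCheckThreshold and the hints do not
// allow reordering; otherwise the selected factor stands.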
8022 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8023 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8024 bool PragmaThresholdReached = 8025 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8026 bool ThresholdReached = 8027 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8028 if ((ThresholdReached && !Hints.allowReordering()) || 8029 PragmaThresholdReached) { 8030 ORE->emit([&]() { 8031 return OptimizationRemarkAnalysisAliasing( 8032 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8033 OrigLoop->getHeader()) 8034 << "loop not vectorized: cannot prove it is safe to reorder " 8035 "memory operations"; 8036 }); 8037 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8038 Hints.emitRemarkWithHints(); 8039 return VectorizationFactor::Disabled(); 8040 } 8041 } 8042 return SelectedVF; 8043 } 8044 8045 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 8046 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 8047 << '\n'); 8048 BestVF = VF; 8049 BestUF = UF; 8050 8051 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 8052 return !Plan->hasVF(VF); 8053 }); 8054 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 8055 } 8056 8057 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 8058 DominatorTree *DT) { 8059 // Perform the actual loop transformation. 8060 8061 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8062 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 8063 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 8064 8065 VPTransformState State{ 8066 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 8067 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8068 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8069 State.CanonicalIV = ILV.Induction; 8070 8071 ILV.printDebugTracesAtStart(); 8072 8073 //===------------------------------------------------===// 8074 // 8075 // Notice: any optimization or new instruction that go 8076 // into the code below should also be implemented in 8077 // the cost-model. 8078 // 8079 //===------------------------------------------------===// 8080 8081 // 2. Copy and widen instructions from the old loop into the new loop. 8082 VPlans.front()->execute(&State); 8083 8084 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8085 // predication, updating analyses. 
8086 ILV.fixVectorizedLoop(State); 8087 8088 ILV.printDebugTracesAtEnd(); 8089 } 8090 8091 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8092 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8093 for (const auto &Plan : VPlans) 8094 if (PrintVPlansInDotFormat) 8095 Plan->printDOT(O); 8096 else 8097 Plan->print(O); 8098 } 8099 #endif 8100 8101 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8102 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8103 8104 // We create new control-flow for the vectorized loop, so the original exit 8105 // conditions will be dead after vectorization if they are only used by the 8106 // terminator. 8107 SmallVector<BasicBlock*> ExitingBlocks; 8108 OrigLoop->getExitingBlocks(ExitingBlocks); 8109 for (auto *BB : ExitingBlocks) { 8110 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8111 if (!Cmp || !Cmp->hasOneUse()) 8112 continue; 8113 8114 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8115 if (!DeadInstructions.insert(Cmp).second) 8116 continue; 8117 8118 // The operands of the icmp are often dead truncs, used by IndUpdate. 8119 // TODO: can recurse through operands in general 8120 for (Value *Op : Cmp->operands()) { 8121 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8122 DeadInstructions.insert(cast<Instruction>(Op)); 8123 } 8124 } 8125 8126 // We create new "steps" for induction variable updates to which the original 8127 // induction variables map. An original update instruction will be dead if 8128 // all its users except the induction variable are dead. 8129 auto *Latch = OrigLoop->getLoopLatch(); 8130 for (auto &Induction : Legal->getInductionVars()) { 8131 PHINode *Ind = Induction.first; 8132 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8133 8134 // If the tail is to be folded by masking, the primary induction variable, 8135 // if it exists, isn't dead: it will be used for masking. Don't kill it. 8136 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8137 continue; 8138 8139 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8140 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8141 })) 8142 DeadInstructions.insert(IndUpdate); 8143 8144 // We also record as "Dead" the type-casting instructions we had identified 8145 // during induction analysis. We don't need any handling for them in the 8146 // vectorized loop because we have proven that, under a proper runtime 8147 // test guarding the vectorized loop, the value of the phi, and the casted 8148 // value of the phi, are the same. The last instruction in this casting chain 8149 // will get its scalar/vector/widened def from the scalar/vector/widened def 8150 // of the respective phi node. Any other casts in the induction def-use chain 8151 // have no other uses outside the phi update chain, and will be ignored. 8152 InductionDescriptor &IndDes = Induction.second; 8153 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8154 DeadInstructions.insert(Casts.begin(), Casts.end()); 8155 } 8156 } 8157 8158 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8159 8160 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8161 8162 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 8163 Instruction::BinaryOps BinOp) { 8164 // When unrolling and the VF is 1, we only need to add a simple scalar.
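// For example, with StartIdx == 2 and an integer step %step, the code below
// emits "add %val, (2 * %step)" (illustrative names) instead of materializing
// a vector step; the floating-point path does the same via FMul and BinOp.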
8165 Type *Ty = Val->getType(); 8166 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8167 8168 if (Ty->isFloatingPointTy()) { 8169 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8170 8171 // Floating-point operations inherit FMF via the builder's flags. 8172 Value *MulOp = Builder.CreateFMul(C, Step); 8173 return Builder.CreateBinOp(BinOp, Val, MulOp); 8174 } 8175 Constant *C = ConstantInt::get(Ty, StartIdx); 8176 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8177 } 8178 8179 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8180 SmallVector<Metadata *, 4> MDs; 8181 // Reserve first location for self reference to the LoopID metadata node. 8182 MDs.push_back(nullptr); 8183 bool IsUnrollMetadata = false; 8184 MDNode *LoopID = L->getLoopID(); 8185 if (LoopID) { 8186 // First find existing loop unrolling disable metadata. 8187 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8188 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8189 if (MD) { 8190 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8191 IsUnrollMetadata = 8192 S && S->getString().startswith("llvm.loop.unroll.disable"); 8193 } 8194 MDs.push_back(LoopID->getOperand(i)); 8195 } 8196 } 8197 8198 if (!IsUnrollMetadata) { 8199 // Add runtime unroll disable metadata. 8200 LLVMContext &Context = L->getHeader()->getContext(); 8201 SmallVector<Metadata *, 1> DisableOperands; 8202 DisableOperands.push_back( 8203 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8204 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8205 MDs.push_back(DisableNode); 8206 MDNode *NewLoopID = MDNode::get(Context, MDs); 8207 // Set operand 0 to refer to the loop id itself. 8208 NewLoopID->replaceOperandWith(0, NewLoopID); 8209 L->setLoopID(NewLoopID); 8210 } 8211 } 8212 8213 //===--------------------------------------------------------------------===// 8214 // EpilogueVectorizerMainLoop 8215 //===--------------------------------------------------------------------===// 8216 8217 /// This function is partially responsible for generating the control flow 8218 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8219 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8220 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8221 Loop *Lp = createVectorLoopSkeleton(""); 8222 8223 // Generate the code to check the minimum iteration count of the vector 8224 // epilogue (see below). 8225 EPI.EpilogueIterationCountCheck = 8226 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8227 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8228 8229 // Generate the code to check any assumptions that we've made for SCEV 8230 // expressions. 8231 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8232 8233 // Generate the code that checks at runtime if arrays overlap. We put the 8234 // checks into a separate block to make the more common case of few elements 8235 // faster. 8236 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8237 8238 // Generate the iteration count check for the main loop, *after* the check 8239 // for the epilogue loop, so that the path-length is shorter for the case 8240 // that goes directly through the vector epilogue. The longer-path length for 8241 // the main loop is compensated for, by the gain from vectorizing the larger 8242 // trip count. Note: the branch will get updated later on when we vectorize 8243 // the epilogue. 
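// Rough resulting layout (see the epilogue-vectorization figure linked above):
// "iter.check" first tests the trip count against the epilogue VF * UF, then
// "vector.main.loop.iter.check" tests it against the main VF * UF, so the case
// that only runs the vector epilogue takes the shorter path.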
8244 EPI.MainLoopIterationCountCheck = 8245 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8246 8247 // Generate the induction variable. 8248 OldInduction = Legal->getPrimaryInduction(); 8249 Type *IdxTy = Legal->getWidestInductionType(); 8250 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8251 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8252 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8253 EPI.VectorTripCount = CountRoundDown; 8254 Induction = 8255 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8256 getDebugLocFromInstOrOperands(OldInduction)); 8257 8258 // Skip induction resume value creation here because they will be created in 8259 // the second pass. If we created them here, they wouldn't be used anyway, 8260 // because the vplan in the second pass still contains the inductions from the 8261 // original loop. 8262 8263 return completeLoopSkeleton(Lp, OrigLoopID); 8264 } 8265 8266 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8267 LLVM_DEBUG({ 8268 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8269 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8270 << ", Main Loop UF:" << EPI.MainLoopUF 8271 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8272 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8273 }); 8274 } 8275 8276 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8277 DEBUG_WITH_TYPE(VerboseDebug, { 8278 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8279 }); 8280 } 8281 8282 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8283 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8284 assert(L && "Expected valid Loop."); 8285 assert(Bypass && "Expected valid bypass basic block."); 8286 unsigned VFactor = 8287 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8288 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8289 Value *Count = getOrCreateTripCount(L); 8290 // Reuse existing vector loop preheader for TC checks. 8291 // Note that new preheader block is generated for vector loop. 8292 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8293 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8294 8295 // Generate code to check if the loop's trip count is less than VF * UF of the 8296 // main vector loop. 8297 auto P = 8298 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8299 8300 Value *CheckMinIters = Builder.CreateICmp( 8301 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8302 "min.iters.check"); 8303 8304 if (!ForEpilogue) 8305 TCCheckBlock->setName("vector.main.loop.iter.check"); 8306 8307 // Create new preheader for vector loop. 8308 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8309 DT, LI, nullptr, "vector.ph"); 8310 8311 if (ForEpilogue) { 8312 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8313 DT->getNode(Bypass)->getIDom()) && 8314 "TC check is expected to dominate Bypass"); 8315 8316 // Update dominator for Bypass & LoopExit. 8317 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8318 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8319 8320 LoopBypassBlocks.push_back(TCCheckBlock); 8321 8322 // Save the trip count so we don't have to regenerate it in the 8323 // vec.epilog.iter.check. This is safe to do because the trip count 8324 // generated here dominates the vector epilog iter check. 
8325 EPI.TripCount = Count; 8326 } 8327 8328 ReplaceInstWithInst( 8329 TCCheckBlock->getTerminator(), 8330 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8331 8332 return TCCheckBlock; 8333 } 8334 8335 //===--------------------------------------------------------------------===// 8336 // EpilogueVectorizerEpilogueLoop 8337 //===--------------------------------------------------------------------===// 8338 8339 /// This function is partially responsible for generating the control flow 8340 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8341 BasicBlock * 8342 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8343 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8344 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8345 8346 // Now, compare the remaining count and if there aren't enough iterations to 8347 // execute the vectorized epilogue skip to the scalar part. 8348 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8349 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8350 LoopVectorPreHeader = 8351 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8352 LI, nullptr, "vec.epilog.ph"); 8353 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8354 VecEpilogueIterationCountCheck); 8355 8356 // Adjust the control flow taking the state info from the main loop 8357 // vectorization into account. 8358 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8359 "expected this to be saved from the previous pass."); 8360 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8361 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8362 8363 DT->changeImmediateDominator(LoopVectorPreHeader, 8364 EPI.MainLoopIterationCountCheck); 8365 8366 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8367 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8368 8369 if (EPI.SCEVSafetyCheck) 8370 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8371 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8372 if (EPI.MemSafetyCheck) 8373 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8374 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8375 8376 DT->changeImmediateDominator( 8377 VecEpilogueIterationCountCheck, 8378 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8379 8380 DT->changeImmediateDominator(LoopScalarPreHeader, 8381 EPI.EpilogueIterationCountCheck); 8382 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8383 8384 // Keep track of bypass blocks, as they feed start values to the induction 8385 // phis in the scalar loop preheader. 8386 if (EPI.SCEVSafetyCheck) 8387 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8388 if (EPI.MemSafetyCheck) 8389 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8390 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8391 8392 // Generate a resume induction for the vector epilogue and put it in the 8393 // vector epilogue preheader 8394 Type *IdxTy = Legal->getWidestInductionType(); 8395 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8396 LoopVectorPreHeader->getFirstNonPHI()); 8397 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8398 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8399 EPI.MainLoopIterationCountCheck); 8400 8401 // Generate the induction variable. 
8402 OldInduction = Legal->getPrimaryInduction(); 8403 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8404 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8405 Value *StartIdx = EPResumeVal; 8406 Induction = 8407 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8408 getDebugLocFromInstOrOperands(OldInduction)); 8409 8410 // Generate induction resume values. These variables save the new starting 8411 // indexes for the scalar loop. They are used to test if there are any tail 8412 // iterations left once the vector loop has completed. 8413 // Note that when the vectorized epilogue is skipped due to the iteration count 8414 // check, the resume value for the induction variable comes from 8415 // the trip count of the main vector loop, hence passing the AdditionalBypass 8416 // argument. 8417 createInductionResumeValues(Lp, CountRoundDown, 8418 {VecEpilogueIterationCountCheck, 8419 EPI.VectorTripCount} /* AdditionalBypass */); 8420 8421 AddRuntimeUnrollDisableMetaData(Lp); 8422 return completeLoopSkeleton(Lp, OrigLoopID); 8423 } 8424 8425 BasicBlock * 8426 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8427 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8428 8429 assert(EPI.TripCount && 8430 "Expected trip count to have been saved in the first pass."); 8431 assert( 8432 (!isa<Instruction>(EPI.TripCount) || 8433 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8434 "saved trip count does not dominate insertion point."); 8435 Value *TC = EPI.TripCount; 8436 IRBuilder<> Builder(Insert->getTerminator()); 8437 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8438 8439 // Generate code to check if the loop's trip count is less than VF * UF of the 8440 // vector epilogue loop. 8441 auto P = 8442 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8443 8444 Value *CheckMinIters = Builder.CreateICmp( 8445 P, Count, 8446 ConstantInt::get(Count->getType(), 8447 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8448 "min.epilog.iters.check"); 8449 8450 ReplaceInstWithInst( 8451 Insert->getTerminator(), 8452 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8453 8454 LoopBypassBlocks.push_back(Insert); 8455 return Insert; 8456 } 8457 8458 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8459 LLVM_DEBUG({ 8460 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8461 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8462 << ", Main Loop UF:" << EPI.MainLoopUF 8463 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8464 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8465 }); 8466 } 8467 8468 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8469 DEBUG_WITH_TYPE(VerboseDebug, { 8470 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8471 }); 8472 } 8473 8474 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8475 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8476 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8477 bool PredicateAtRangeStart = Predicate(Range.Start); 8478 8479 for (ElementCount TmpVF = Range.Start * 2; 8480 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8481 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8482 Range.End = TmpVF; 8483 break; 8484 } 8485 8486 return PredicateAtRangeStart; 8487 } 8488 8489 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8490 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8491 /// of VF's starting at a given VF and extending it as much as possible. Each 8492 /// vectorization decision can potentially shorten this sub-range during 8493 /// buildVPlan(). 8494 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8495 ElementCount MaxVF) { 8496 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8497 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8498 VFRange SubRange = {VF, MaxVFPlusOne}; 8499 VPlans.push_back(buildVPlan(SubRange)); 8500 VF = SubRange.End; 8501 } 8502 } 8503 8504 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8505 VPlanPtr &Plan) { 8506 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8507 8508 // Look for cached value. 8509 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8510 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8511 if (ECEntryIt != EdgeMaskCache.end()) 8512 return ECEntryIt->second; 8513 8514 VPValue *SrcMask = createBlockInMask(Src, Plan); 8515 8516 // The terminator has to be a branch inst! 8517 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8518 assert(BI && "Unexpected terminator found"); 8519 8520 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8521 return EdgeMaskCache[Edge] = SrcMask; 8522 8523 // If source is an exiting block, we know the exit edge is dynamically dead 8524 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8525 // adding uses of an otherwise potentially dead instruction. 
8526 if (OrigLoop->isLoopExiting(Src)) 8527 return EdgeMaskCache[Edge] = SrcMask; 8528 8529 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8530 assert(EdgeMask && "No Edge Mask found for condition"); 8531 8532 if (BI->getSuccessor(0) != Dst) 8533 EdgeMask = Builder.createNot(EdgeMask); 8534 8535 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8536 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8537 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8538 // The select version does not introduce new UB if SrcMask is false and 8539 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8540 VPValue *False = Plan->getOrAddVPValue( 8541 ConstantInt::getFalse(BI->getCondition()->getType())); 8542 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8543 } 8544 8545 return EdgeMaskCache[Edge] = EdgeMask; 8546 } 8547 8548 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8549 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8550 8551 // Look for cached value. 8552 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8553 if (BCEntryIt != BlockMaskCache.end()) 8554 return BCEntryIt->second; 8555 8556 // All-one mask is modelled as no-mask following the convention for masked 8557 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8558 VPValue *BlockMask = nullptr; 8559 8560 if (OrigLoop->getHeader() == BB) { 8561 if (!CM.blockNeedsPredication(BB)) 8562 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8563 8564 // Create the block in mask as the first non-phi instruction in the block. 8565 VPBuilder::InsertPointGuard Guard(Builder); 8566 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8567 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8568 8569 // Introduce the early-exit compare IV <= BTC to form header block mask. 8570 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8571 // Start by constructing the desired canonical IV. 8572 VPValue *IV = nullptr; 8573 if (Legal->getPrimaryInduction()) 8574 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8575 else { 8576 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8577 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8578 IV = IVRecipe->getVPSingleValue(); 8579 } 8580 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8581 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8582 8583 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8584 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8585 // as a second argument, we only pass the IV here and extract the 8586 // tripcount from the transform state where codegen of the VP instructions 8587 // happen. 8588 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8589 } else { 8590 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8591 } 8592 return BlockMaskCache[BB] = BlockMask; 8593 } 8594 8595 // This is the block mask. We OR all incoming edges. 8596 for (auto *Predecessor : predecessors(BB)) { 8597 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8598 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8599 return BlockMaskCache[BB] = EdgeMask; 8600 8601 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8602 BlockMask = EdgeMask; 8603 continue; 8604 } 8605 8606 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8607 } 8608 8609 return BlockMaskCache[BB] = BlockMask; 8610 } 8611 8612 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8613 ArrayRef<VPValue *> Operands, 8614 VFRange &Range, 8615 VPlanPtr &Plan) { 8616 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8617 "Must be called with either a load or store"); 8618 8619 auto willWiden = [&](ElementCount VF) -> bool { 8620 if (VF.isScalar()) 8621 return false; 8622 LoopVectorizationCostModel::InstWidening Decision = 8623 CM.getWideningDecision(I, VF); 8624 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8625 "CM decision should be taken at this point."); 8626 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8627 return true; 8628 if (CM.isScalarAfterVectorization(I, VF) || 8629 CM.isProfitableToScalarize(I, VF)) 8630 return false; 8631 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8632 }; 8633 8634 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8635 return nullptr; 8636 8637 VPValue *Mask = nullptr; 8638 if (Legal->isMaskRequired(I)) 8639 Mask = createBlockInMask(I->getParent(), Plan); 8640 8641 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8642 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8643 8644 StoreInst *Store = cast<StoreInst>(I); 8645 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8646 Mask); 8647 } 8648 8649 VPWidenIntOrFpInductionRecipe * 8650 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8651 ArrayRef<VPValue *> Operands) const { 8652 // Check if this is an integer or fp induction. If so, build the recipe that 8653 // produces its scalar and vector values. 8654 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8655 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8656 II.getKind() == InductionDescriptor::IK_FpInduction) { 8657 assert(II.getStartValue() == 8658 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8659 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8660 return new VPWidenIntOrFpInductionRecipe( 8661 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8662 } 8663 8664 return nullptr; 8665 } 8666 8667 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8668 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8669 VPlan &Plan) const { 8670 // Optimize the special case where the source is a constant integer 8671 // induction variable. Notice that we can only optimize the 'trunc' case 8672 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8673 // (c) other casts depend on pointer size. 8674 8675 // Determine whether \p K is a truncation based on an induction variable that 8676 // can be optimized. 
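// For example (roughly): for an i64 induction %i whose only interesting use
// is %t = trunc i64 %i to i32, the truncate can be recognized here and a
// widened i32 induction can be generated for %t directly, avoiding both the
// wide i64 vector IV and the vector truncs.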
8677 auto isOptimizableIVTruncate = 8678 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8679 return [=](ElementCount VF) -> bool { 8680 return CM.isOptimizableIVTruncate(K, VF); 8681 }; 8682 }; 8683 8684 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8685 isOptimizableIVTruncate(I), Range)) { 8686 8687 InductionDescriptor II = 8688 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8689 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8690 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8691 Start, nullptr, I); 8692 } 8693 return nullptr; 8694 } 8695 8696 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8697 ArrayRef<VPValue *> Operands, 8698 VPlanPtr &Plan) { 8699 // If all incoming values are equal, the incoming VPValue can be used directly 8700 // instead of creating a new VPBlendRecipe. 8701 VPValue *FirstIncoming = Operands[0]; 8702 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8703 return FirstIncoming == Inc; 8704 })) { 8705 return Operands[0]; 8706 } 8707 8708 // We know that all PHIs in non-header blocks are converted into selects, so 8709 // we don't have to worry about the insertion order and we can just use the 8710 // builder. At this point we generate the predication tree. There may be 8711 // duplications since this is a simple recursive scan, but future 8712 // optimizations will clean it up. 8713 SmallVector<VPValue *, 2> OperandsWithMask; 8714 unsigned NumIncoming = Phi->getNumIncomingValues(); 8715 8716 for (unsigned In = 0; In < NumIncoming; In++) { 8717 VPValue *EdgeMask = 8718 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8719 assert((EdgeMask || NumIncoming == 1) && 8720 "Multiple predecessors with one having a full mask"); 8721 OperandsWithMask.push_back(Operands[In]); 8722 if (EdgeMask) 8723 OperandsWithMask.push_back(EdgeMask); 8724 } 8725 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8726 } 8727 8728 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8729 ArrayRef<VPValue *> Operands, 8730 VFRange &Range) const { 8731 8732 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8733 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8734 Range); 8735 8736 if (IsPredicated) 8737 return nullptr; 8738 8739 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8740 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8741 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8742 ID == Intrinsic::pseudoprobe || 8743 ID == Intrinsic::experimental_noalias_scope_decl)) 8744 return nullptr; 8745 8746 auto willWiden = [&](ElementCount VF) -> bool { 8747 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8748 // The following case may be scalarized depending on the VF. 8749 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8750 // version of the instruction. 8751 // Is it beneficial to perform intrinsic call compared to lib call? 8752 bool NeedToScalarize = false; 8753 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8754 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8755 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8756 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 8757 "Either the intrinsic cost or vector call cost must be valid"); 8758 return UseVectorIntrinsic || !NeedToScalarize; 8759 }; 8760 8761 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8762 return nullptr; 8763 8764 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8765 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8766 } 8767 8768 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8769 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8770 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8771 // Instruction should be widened, unless it is scalar after vectorization, 8772 // scalarization is profitable or it is predicated. 8773 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8774 return CM.isScalarAfterVectorization(I, VF) || 8775 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8776 }; 8777 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8778 Range); 8779 } 8780 8781 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8782 ArrayRef<VPValue *> Operands) const { 8783 auto IsVectorizableOpcode = [](unsigned Opcode) { 8784 switch (Opcode) { 8785 case Instruction::Add: 8786 case Instruction::And: 8787 case Instruction::AShr: 8788 case Instruction::BitCast: 8789 case Instruction::FAdd: 8790 case Instruction::FCmp: 8791 case Instruction::FDiv: 8792 case Instruction::FMul: 8793 case Instruction::FNeg: 8794 case Instruction::FPExt: 8795 case Instruction::FPToSI: 8796 case Instruction::FPToUI: 8797 case Instruction::FPTrunc: 8798 case Instruction::FRem: 8799 case Instruction::FSub: 8800 case Instruction::ICmp: 8801 case Instruction::IntToPtr: 8802 case Instruction::LShr: 8803 case Instruction::Mul: 8804 case Instruction::Or: 8805 case Instruction::PtrToInt: 8806 case Instruction::SDiv: 8807 case Instruction::Select: 8808 case Instruction::SExt: 8809 case Instruction::Shl: 8810 case Instruction::SIToFP: 8811 case Instruction::SRem: 8812 case Instruction::Sub: 8813 case Instruction::Trunc: 8814 case Instruction::UDiv: 8815 case Instruction::UIToFP: 8816 case Instruction::URem: 8817 case Instruction::Xor: 8818 case Instruction::ZExt: 8819 return true; 8820 } 8821 return false; 8822 }; 8823 8824 if (!IsVectorizableOpcode(I->getOpcode())) 8825 return nullptr; 8826 8827 // Success: widen this instruction. 
8828 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8829 } 8830 8831 void VPRecipeBuilder::fixHeaderPhis() { 8832 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8833 for (VPWidenPHIRecipe *R : PhisToFix) { 8834 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8835 VPRecipeBase *IncR = 8836 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8837 R->addOperand(IncR->getVPSingleValue()); 8838 } 8839 } 8840 8841 VPBasicBlock *VPRecipeBuilder::handleReplication( 8842 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8843 VPlanPtr &Plan) { 8844 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8845 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8846 Range); 8847 8848 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8849 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 8850 8851 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8852 IsUniform, IsPredicated); 8853 setRecipe(I, Recipe); 8854 Plan->addVPValue(I, Recipe); 8855 8856 // Find if I uses a predicated instruction. If so, it will use its scalar 8857 // value. Avoid hoisting the insert-element which packs the scalar value into 8858 // a vector value, as that happens iff all users use the vector value. 8859 for (VPValue *Op : Recipe->operands()) { 8860 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8861 if (!PredR) 8862 continue; 8863 auto *RepR = 8864 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8865 assert(RepR->isPredicated() && 8866 "expected Replicate recipe to be predicated"); 8867 RepR->setAlsoPack(false); 8868 } 8869 8870 // Finalize the recipe for Instr, first if it is not predicated. 8871 if (!IsPredicated) { 8872 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8873 VPBB->appendRecipe(Recipe); 8874 return VPBB; 8875 } 8876 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8877 assert(VPBB->getSuccessors().empty() && 8878 "VPBB has successors when handling predicated replication."); 8879 // Record predicated instructions for above packing optimizations. 8880 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8881 VPBlockUtils::insertBlockAfter(Region, VPBB); 8882 auto *RegSucc = new VPBasicBlock(); 8883 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8884 return RegSucc; 8885 } 8886 8887 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8888 VPRecipeBase *PredRecipe, 8889 VPlanPtr &Plan) { 8890 // Instructions marked for predication are replicated and placed under an 8891 // if-then construct to prevent side-effects. 8892 8893 // Generate recipes to compute the block mask for this region. 8894 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8895 8896 // Build the triangular if-then region. 8897 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8898 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8899 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8900 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8901 auto *PHIRecipe = Instr->getType()->isVoidTy() 8902 ? 
nullptr 8903 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8904 if (PHIRecipe) { 8905 Plan->removeVPValueFor(Instr); 8906 Plan->addVPValue(Instr, PHIRecipe); 8907 } 8908 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8909 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8910 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8911 8912 // Note: first set Entry as region entry and then connect successors starting 8913 // from it in order, to propagate the "parent" of each VPBasicBlock. 8914 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8915 VPBlockUtils::connectBlocks(Pred, Exit); 8916 8917 return Region; 8918 } 8919 8920 VPRecipeOrVPValueTy 8921 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8922 ArrayRef<VPValue *> Operands, 8923 VFRange &Range, VPlanPtr &Plan) { 8924 // First, check for specific widening recipes that deal with calls, memory 8925 // operations, inductions and Phi nodes. 8926 if (auto *CI = dyn_cast<CallInst>(Instr)) 8927 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8928 8929 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8930 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8931 8932 VPRecipeBase *Recipe; 8933 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8934 if (Phi->getParent() != OrigLoop->getHeader()) 8935 return tryToBlend(Phi, Operands, Plan); 8936 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8937 return toVPRecipeResult(Recipe); 8938 8939 if (Legal->isReductionVariable(Phi)) { 8940 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8941 assert(RdxDesc.getRecurrenceStartValue() == 8942 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8943 VPValue *StartV = Operands[0]; 8944 8945 auto *PhiRecipe = new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); 8946 PhisToFix.push_back(PhiRecipe); 8947 // Record the incoming value from the backedge, so we can add the incoming 8948 // value from the backedge after all recipes have been created. 8949 recordRecipeOf(cast<Instruction>( 8950 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8951 return toVPRecipeResult(PhiRecipe); 8952 } 8953 8954 return toVPRecipeResult(new VPWidenPHIRecipe(Phi)); 8955 } 8956 8957 if (isa<TruncInst>(Instr) && 8958 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8959 Range, *Plan))) 8960 return toVPRecipeResult(Recipe); 8961 8962 if (!shouldWiden(Instr, Range)) 8963 return nullptr; 8964 8965 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8966 return toVPRecipeResult(new VPWidenGEPRecipe( 8967 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8968 8969 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8970 bool InvariantCond = 8971 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8972 return toVPRecipeResult(new VPWidenSelectRecipe( 8973 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8974 } 8975 8976 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8977 } 8978 8979 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8980 ElementCount MaxVF) { 8981 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8982 8983 // Collect instructions from the original loop that will become trivially dead 8984 // in the vectorized loop. We don't need to vectorize these instructions. 
For 8985 // example, original induction update instructions can become dead because we 8986 // separately emit induction "steps" when generating code for the new loop. 8987 // Similarly, we create a new latch condition when setting up the structure 8988 // of the new loop, so the old one can become dead. 8989 SmallPtrSet<Instruction *, 4> DeadInstructions; 8990 collectTriviallyDeadInstructions(DeadInstructions); 8991 8992 // Add assume instructions we need to drop to DeadInstructions, to prevent 8993 // them from being added to the VPlan. 8994 // TODO: We only need to drop assumes in blocks that get flattened. If the 8995 // control flow is preserved, we should keep them. 8996 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8997 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8998 8999 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 9000 // Dead instructions do not need sinking. Remove them from SinkAfter. 9001 for (Instruction *I : DeadInstructions) 9002 SinkAfter.erase(I); 9003 9004 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 9005 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 9006 VFRange SubRange = {VF, MaxVFPlusOne}; 9007 VPlans.push_back( 9008 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 9009 VF = SubRange.End; 9010 } 9011 } 9012 9013 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 9014 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 9015 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 9016 9017 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 9018 9019 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 9020 9021 // --------------------------------------------------------------------------- 9022 // Pre-construction: record ingredients whose recipes we'll need to further 9023 // process after constructing the initial VPlan. 9024 // --------------------------------------------------------------------------- 9025 9026 // Mark instructions we'll need to sink later and their targets as 9027 // ingredients whose recipe we'll need to record. 9028 for (auto &Entry : SinkAfter) { 9029 RecipeBuilder.recordRecipeOf(Entry.first); 9030 RecipeBuilder.recordRecipeOf(Entry.second); 9031 } 9032 for (auto &Reduction : CM.getInLoopReductionChains()) { 9033 PHINode *Phi = Reduction.first; 9034 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 9035 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9036 9037 RecipeBuilder.recordRecipeOf(Phi); 9038 for (auto &R : ReductionOperations) { 9039 RecipeBuilder.recordRecipeOf(R); 9040 // For min/max reductions, where we have a pair of icmp/select, we also 9041 // need to record the ICmp recipe, so it can be removed later. 9042 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9043 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9044 } 9045 } 9046 9047 // For each interleave group which is relevant for this (possibly trimmed) 9048 // Range, add it to the set of groups to be later applied to the VPlan and add 9049 // placeholders for its members' Recipes which we'll be replacing with a 9050 // single VPInterleaveRecipe.
9051 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9052 auto applyIG = [IG, this](ElementCount VF) -> bool { 9053 return (VF.isVector() && // Query is illegal for VF == 1 9054 CM.getWideningDecision(IG->getInsertPos(), VF) == 9055 LoopVectorizationCostModel::CM_Interleave); 9056 }; 9057 if (!getDecisionAndClampRange(applyIG, Range)) 9058 continue; 9059 InterleaveGroups.insert(IG); 9060 for (unsigned i = 0; i < IG->getFactor(); i++) 9061 if (Instruction *Member = IG->getMember(i)) 9062 RecipeBuilder.recordRecipeOf(Member); 9063 } 9064 9065 // --------------------------------------------------------------------------- 9066 // Build initial VPlan: Scan the body of the loop in a topological order to 9067 // visit each basic block after having visited its predecessor basic blocks. 9068 // --------------------------------------------------------------------------- 9069 9070 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9071 auto Plan = std::make_unique<VPlan>(); 9072 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 9073 Plan->setEntry(VPBB); 9074 9075 // Scan the body of the loop in a topological order to visit each basic block 9076 // after having visited its predecessor basic blocks. 9077 LoopBlocksDFS DFS(OrigLoop); 9078 DFS.perform(LI); 9079 9080 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9081 // Relevant instructions from basic block BB will be grouped into VPRecipe 9082 // ingredients and fill a new VPBasicBlock. 9083 unsigned VPBBsForBB = 0; 9084 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9085 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9086 VPBB = FirstVPBBForBB; 9087 Builder.setInsertPoint(VPBB); 9088 9089 // Introduce each ingredient into VPlan. 9090 // TODO: Model and preserve debug intrinsics in VPlan. 9091 for (Instruction &I : BB->instructionsWithoutDebug()) { 9092 Instruction *Instr = &I; 9093 9094 // First filter out irrelevant instructions, to ensure no recipes are 9095 // built for them. 9096 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9097 continue; 9098 9099 SmallVector<VPValue *, 4> Operands; 9100 auto *Phi = dyn_cast<PHINode>(Instr); 9101 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9102 Operands.push_back(Plan->getOrAddVPValue( 9103 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9104 } else { 9105 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9106 Operands = {OpRange.begin(), OpRange.end()}; 9107 } 9108 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9109 Instr, Operands, Range, Plan)) { 9110 // If Instr can be simplified to an existing VPValue, use it. 9111 if (RecipeOrValue.is<VPValue *>()) { 9112 auto *VPV = RecipeOrValue.get<VPValue *>(); 9113 Plan->addVPValue(Instr, VPV); 9114 // If the re-used value is a recipe, register the recipe for the 9115 // instruction, in case the recipe for Instr needs to be recorded. 9116 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9117 RecipeBuilder.setRecipe(Instr, R); 9118 continue; 9119 } 9120 // Otherwise, add the new recipe. 9121 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9122 for (auto *Def : Recipe->definedValues()) { 9123 auto *UV = Def->getUnderlyingValue(); 9124 Plan->addVPValue(UV, Def); 9125 } 9126 9127 RecipeBuilder.setRecipe(Instr, Recipe); 9128 VPBB->appendRecipe(Recipe); 9129 continue; 9130 } 9131 9132 // Otherwise, if all widening options failed, the instruction is to be 9133 // replicated. This may create a successor for VPBB.
9134 VPBasicBlock *NextVPBB = 9135 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9136 if (NextVPBB != VPBB) { 9137 VPBB = NextVPBB; 9138 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9139 : ""); 9140 } 9141 } 9142 } 9143 9144 RecipeBuilder.fixHeaderPhis(); 9145 9146 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9147 // may also be empty, such as the last one VPBB, reflecting original 9148 // basic-blocks with no recipes. 9149 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9150 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9151 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9152 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9153 delete PreEntry; 9154 9155 // --------------------------------------------------------------------------- 9156 // Transform initial VPlan: Apply previously taken decisions, in order, to 9157 // bring the VPlan to its final state. 9158 // --------------------------------------------------------------------------- 9159 9160 // Apply Sink-After legal constraints. 9161 for (auto &Entry : SinkAfter) { 9162 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9163 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9164 9165 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9166 auto *Region = 9167 dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9168 if (Region && Region->isReplicator()) 9169 return Region; 9170 return nullptr; 9171 }; 9172 9173 // If the target is in a replication region, make sure to move Sink to the 9174 // block after it, not into the replication region itself. 9175 if (auto *TargetRegion = GetReplicateRegion(Target)) { 9176 assert(TargetRegion->getNumSuccessors() == 1 && "Expected SESE region!"); 9177 assert(!GetReplicateRegion(Sink) && 9178 "cannot sink a region into another region yet"); 9179 VPBasicBlock *NextBlock = 9180 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9181 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9182 continue; 9183 } 9184 9185 auto *SinkRegion = GetReplicateRegion(Sink); 9186 // Unless the sink source is in a replicate region, sink the recipe 9187 // directly. 9188 if (!SinkRegion) { 9189 Sink->moveAfter(Target); 9190 continue; 9191 } 9192 9193 // If the sink source is in a replicate region, we need to move the whole 9194 // replicate region, which should only contain a single recipe in the main 9195 // block. 9196 assert(Sink->getParent()->size() == 1 && 9197 "parent must be a replicator with a single recipe"); 9198 auto *SplitBlock = 9199 Target->getParent()->splitAt(std::next(Target->getIterator())); 9200 9201 auto *Pred = SinkRegion->getSinglePredecessor(); 9202 auto *Succ = SinkRegion->getSingleSuccessor(); 9203 VPBlockUtils::disconnectBlocks(Pred, SinkRegion); 9204 VPBlockUtils::disconnectBlocks(SinkRegion, Succ); 9205 VPBlockUtils::connectBlocks(Pred, Succ); 9206 9207 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9208 9209 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9210 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9211 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9212 if (VPBB == SplitPred) 9213 VPBB = SplitBlock; 9214 } 9215 9216 // Interleave memory: for each Interleave Group we marked earlier as relevant 9217 // for this VPlan, replace the Recipes widening its memory instructions with a 9218 // single VPInterleaveRecipe at its insertion point. 
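// For example (roughly): a factor-2 group of loads {A[2*i], A[2*i+1]} becomes
// one VPInterleaveRecipe that emits a single wide load of A[2*i .. 2*i+2*VF-1]
// followed by shuffles de-interleaving the even and odd elements; the members'
// original widened memory recipes are erased below.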
9219 for (auto IG : InterleaveGroups) { 9220 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9221 RecipeBuilder.getRecipe(IG->getInsertPos())); 9222 SmallVector<VPValue *, 4> StoredValues; 9223 for (unsigned i = 0; i < IG->getFactor(); ++i) 9224 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 9225 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 9226 9227 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9228 Recipe->getMask()); 9229 VPIG->insertBefore(Recipe); 9230 unsigned J = 0; 9231 for (unsigned i = 0; i < IG->getFactor(); ++i) 9232 if (Instruction *Member = IG->getMember(i)) { 9233 if (!Member->getType()->isVoidTy()) { 9234 VPValue *OriginalV = Plan->getVPValue(Member); 9235 Plan->removeVPValueFor(Member); 9236 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9237 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9238 J++; 9239 } 9240 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9241 } 9242 } 9243 9244 // Adjust the recipes for any inloop reductions. 9245 if (Range.Start.isVector()) 9246 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 9247 9248 // Finally, if tail is folded by masking, introduce selects between the phi 9249 // and the live-out instruction of each reduction, at the end of the latch. 9250 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 9251 Builder.setInsertPoint(VPBB); 9252 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9253 for (auto &Reduction : Legal->getReductionVars()) { 9254 if (CM.isInLoopReduction(Reduction.first)) 9255 continue; 9256 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 9257 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 9258 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 9259 } 9260 } 9261 9262 VPlanTransforms::sinkScalarOperands(*Plan); 9263 9264 std::string PlanName; 9265 raw_string_ostream RSO(PlanName); 9266 ElementCount VF = Range.Start; 9267 Plan->addVF(VF); 9268 RSO << "Initial VPlan for VF={" << VF; 9269 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9270 Plan->addVF(VF); 9271 RSO << "," << VF; 9272 } 9273 RSO << "},UF>=1"; 9274 RSO.flush(); 9275 Plan->setName(PlanName); 9276 9277 return Plan; 9278 } 9279 9280 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9281 // Outer loop handling: They may require CFG and instruction level 9282 // transformations before even evaluating whether vectorization is profitable. 9283 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9284 // the vectorization pipeline. 9285 assert(!OrigLoop->isInnermost()); 9286 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9287 9288 // Create new empty VPlan 9289 auto Plan = std::make_unique<VPlan>(); 9290 9291 // Build hierarchical CFG 9292 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9293 HCFGBuilder.buildHierarchicalCFG(); 9294 9295 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9296 VF *= 2) 9297 Plan->addVF(VF); 9298 9299 if (EnableVPlanPredication) { 9300 VPlanPredicator VPP(*Plan); 9301 VPP.predicate(); 9302 9303 // Avoid running transformation to recipes until masked code generation in 9304 // VPlan-native path is in place. 
9305 return Plan; 9306 } 9307 9308 SmallPtrSet<Instruction *, 1> DeadInstructions; 9309 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9310 Legal->getInductionVars(), 9311 DeadInstructions, *PSE.getSE()); 9312 return Plan; 9313 } 9314 9315 // Adjust the recipes for any inloop reductions. The chain of instructions 9316 // leading from the loop exit instr to the phi needs to be converted to 9317 // reductions, with one operand being vector and the other being the scalar 9318 // reduction chain. 9319 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 9320 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 9321 for (auto &Reduction : CM.getInLoopReductionChains()) { 9322 PHINode *Phi = Reduction.first; 9323 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9324 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9325 9326 // ReductionOperations are ordered top-down from the phi's use to the 9327 // LoopExitValue. We keep track of the previous item (the Chain) to tell 9328 // which of the two operands will remain scalar and which will be reduced. 9329 // For minmax the chain will be the select instructions. 9330 Instruction *Chain = Phi; 9331 for (Instruction *R : ReductionOperations) { 9332 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9333 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9334 9335 VPValue *ChainOp = Plan->getVPValue(Chain); 9336 unsigned FirstOpId; 9337 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9338 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9339 "Expected to replace a VPWidenSelectSC"); 9340 FirstOpId = 1; 9341 } else { 9342 assert(isa<VPWidenRecipe>(WidenRecipe) && 9343 "Expected to replace a VPWidenSC"); 9344 FirstOpId = 0; 9345 } 9346 unsigned VecOpId = 9347 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9348 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9349 9350 auto *CondOp = CM.foldTailByMasking() 9351 ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9352 : nullptr; 9353 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9354 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9355 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9356 Plan->removeVPValueFor(R); 9357 Plan->addVPValue(R, RedRecipe); 9358 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9359 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9360 WidenRecipe->eraseFromParent(); 9361 9362 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9363 VPRecipeBase *CompareRecipe = 9364 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9365 assert(isa<VPWidenRecipe>(CompareRecipe) && 9366 "Expected to replace a VPWidenSC"); 9367 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9368 "Expected no remaining users"); 9369 CompareRecipe->eraseFromParent(); 9370 } 9371 Chain = R; 9372 } 9373 } 9374 } 9375 9376 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9377 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9378 VPSlotTracker &SlotTracker) const { 9379 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9380 IG->getInsertPos()->printAsOperand(O, false); 9381 O << ", "; 9382 getAddr()->printAsOperand(O, SlotTracker); 9383 VPValue *Mask = getMask(); 9384 if (Mask) { 9385 O << ", "; 9386 Mask->printAsOperand(O, SlotTracker); 9387 } 9388 for (unsigned i = 0; i < IG->getFactor(); ++i) 9389 if (Instruction *I = IG->getMember(i)) 9390 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9391 } 9392 #endif 9393 9394 void VPWidenCallRecipe::execute(VPTransformState &State) { 9395 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9396 *this, State); 9397 } 9398 9399 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9400 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9401 this, *this, InvariantCond, State); 9402 } 9403 9404 void VPWidenRecipe::execute(VPTransformState &State) { 9405 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9406 } 9407 9408 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9409 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9410 *this, State.UF, State.VF, IsPtrLoopInvariant, 9411 IsIndexLoopInvariant, State); 9412 } 9413 9414 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9415 assert(!State.Instance && "Int or FP induction being replicated."); 9416 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9417 getTruncInst(), getVPValue(0), 9418 getCastValue(), State); 9419 } 9420 9421 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9422 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9423 this, State); 9424 } 9425 9426 void VPBlendRecipe::execute(VPTransformState &State) { 9427 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9428 // We know that all PHIs in non-header blocks are converted into 9429 // selects, so we don't have to worry about the insertion order and we 9430 // can just use the builder. 9431 // At this point we generate the predication tree. There may be 9432 // duplications since this is a simple recursive scan, but future 9433 // optimizations will clean it up. 
9434 9435 unsigned NumIncoming = getNumIncomingValues(); 9436 9437 // Generate a sequence of selects of the form: 9438 // SELECT(Mask3, In3, 9439 // SELECT(Mask2, In2, 9440 // SELECT(Mask1, In1, 9441 // In0))) 9442 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9443 // are essentially undef are taken from In0. 9444 InnerLoopVectorizer::VectorParts Entry(State.UF); 9445 for (unsigned In = 0; In < NumIncoming; ++In) { 9446 for (unsigned Part = 0; Part < State.UF; ++Part) { 9447 // We might have single edge PHIs (blocks) - use an identity 9448 // 'select' for the first PHI operand. 9449 Value *In0 = State.get(getIncomingValue(In), Part); 9450 if (In == 0) 9451 Entry[Part] = In0; // Initialize with the first incoming value. 9452 else { 9453 // Select between the current value and the previous incoming edge 9454 // based on the incoming mask. 9455 Value *Cond = State.get(getMask(In), Part); 9456 Entry[Part] = 9457 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9458 } 9459 } 9460 } 9461 for (unsigned Part = 0; Part < State.UF; ++Part) 9462 State.set(this, Entry[Part], Part); 9463 } 9464 9465 void VPInterleaveRecipe::execute(VPTransformState &State) { 9466 assert(!State.Instance && "Interleave group being replicated."); 9467 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9468 getStoredValues(), getMask()); 9469 } 9470 9471 void VPReductionRecipe::execute(VPTransformState &State) { 9472 assert(!State.Instance && "Reduction being replicated."); 9473 Value *PrevInChain = State.get(getChainOp(), 0); 9474 for (unsigned Part = 0; Part < State.UF; ++Part) { 9475 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9476 bool IsOrdered = useOrderedReductions(*RdxDesc); 9477 Value *NewVecOp = State.get(getVecOp(), Part); 9478 if (VPValue *Cond = getCondOp()) { 9479 Value *NewCond = State.get(Cond, Part); 9480 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9481 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9482 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9483 Constant *IdenVec = 9484 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9485 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9486 NewVecOp = Select; 9487 } 9488 Value *NewRed; 9489 Value *NextInChain; 9490 if (IsOrdered) { 9491 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9492 PrevInChain); 9493 PrevInChain = NewRed; 9494 } else { 9495 PrevInChain = State.get(getChainOp(), Part); 9496 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9497 } 9498 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9499 NextInChain = 9500 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9501 NewRed, PrevInChain); 9502 } else if (IsOrdered) 9503 NextInChain = NewRed; 9504 else { 9505 NextInChain = State.Builder.CreateBinOp( 9506 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9507 PrevInChain); 9508 } 9509 State.set(this, NextInChain, Part); 9510 } 9511 } 9512 9513 void VPReplicateRecipe::execute(VPTransformState &State) { 9514 if (State.Instance) { // Generate a single instance. 9515 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9516 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9517 *State.Instance, IsPredicated, State); 9518 // Insert scalar instance packing it into a vector. 9519 if (AlsoPack && State.VF.isVector()) { 9520 // If we're constructing lane 0, initialize to start from poison. 
9521 if (State.Instance->Lane.isFirstLane()) { 9522 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9523 Value *Poison = PoisonValue::get( 9524 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9525 State.set(this, Poison, State.Instance->Part); 9526 } 9527 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9528 } 9529 return; 9530 } 9531 9532 // Generate scalar instances for all VF lanes of all UF parts, unless the 9533 // instruction is uniform, in which case generate only the first lane for each 9534 // of the UF parts. 9535 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); 9536 assert((!State.VF.isScalable() || IsUniform) && 9537 "Can't scalarize a scalable vector"); 9538 for (unsigned Part = 0; Part < State.UF; ++Part) 9539 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9540 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9541 VPIteration(Part, Lane), IsPredicated, 9542 State); 9543 } 9544 9545 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9546 assert(State.Instance && "Branch on Mask works only on single instance."); 9547 9548 unsigned Part = State.Instance->Part; 9549 unsigned Lane = State.Instance->Lane.getKnownLane(); 9550 9551 Value *ConditionBit = nullptr; 9552 VPValue *BlockInMask = getMask(); 9553 if (BlockInMask) { 9554 ConditionBit = State.get(BlockInMask, Part); 9555 if (ConditionBit->getType()->isVectorTy()) 9556 ConditionBit = State.Builder.CreateExtractElement( 9557 ConditionBit, State.Builder.getInt32(Lane)); 9558 } else // Block in mask is all-one. 9559 ConditionBit = State.Builder.getTrue(); 9560 9561 // Replace the temporary unreachable terminator with a new conditional branch, 9562 // whose two destinations will be set later when they are created. 9563 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9564 assert(isa<UnreachableInst>(CurrentTerminator) && 9565 "Expected to replace unreachable terminator with conditional branch."); 9566 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9567 CondBr->setSuccessor(0, nullptr); 9568 ReplaceInstWithInst(CurrentTerminator, CondBr); 9569 } 9570 9571 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9572 assert(State.Instance && "Predicated instruction PHI works per instance."); 9573 Instruction *ScalarPredInst = 9574 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9575 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9576 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9577 assert(PredicatingBB && "Predicated block has no single predecessor."); 9578 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9579 "operand must be VPReplicateRecipe"); 9580 9581 // By current pack/unpack logic we need to generate only a single phi node: if 9582 // a vector value for the predicated instruction exists at this point it means 9583 // the instruction has vector users only, and a phi for the vector value is 9584 // needed. In this case the recipe of the predicated instruction is marked to 9585 // also do that packing, thereby "hoisting" the insert-element sequence. 9586 // Otherwise, a phi node for the scalar value is needed.
9587 unsigned Part = State.Instance->Part; 9588 if (State.hasVectorValue(getOperand(0), Part)) { 9589 Value *VectorValue = State.get(getOperand(0), Part); 9590 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9591 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9592 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9593 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9594 if (State.hasVectorValue(this, Part)) 9595 State.reset(this, VPhi, Part); 9596 else 9597 State.set(this, VPhi, Part); 9598 // NOTE: Currently we need to update the value of the operand, so the next 9599 // predicated iteration inserts its generated value in the correct vector. 9600 State.reset(getOperand(0), VPhi, Part); 9601 } else { 9602 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9603 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9604 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9605 PredicatingBB); 9606 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9607 if (State.hasScalarValue(this, *State.Instance)) 9608 State.reset(this, Phi, *State.Instance); 9609 else 9610 State.set(this, Phi, *State.Instance); 9611 // NOTE: Currently we need to update the value of the operand, so the next 9612 // predicated iteration inserts its generated value in the correct vector. 9613 State.reset(getOperand(0), Phi, *State.Instance); 9614 } 9615 } 9616 9617 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9618 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9619 State.ILV->vectorizeMemoryInstruction( 9620 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9621 StoredValue, getMask()); 9622 } 9623 9624 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9625 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9626 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9627 // for predication. 9628 static ScalarEpilogueLowering getScalarEpilogueLowering( 9629 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9630 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9631 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9632 LoopVectorizationLegality &LVL) { 9633 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9634 // don't look at hints or options, and don't request a scalar epilogue. 9635 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9636 // LoopAccessInfo (due to code dependency and not being able to reliably get 9637 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9638 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9639 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9640 // back to the old way and vectorize with versioning when forced. See D81345.) 
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives.
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    }
  }

  // 3) If set, obey the hints.
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  }

  // 4) If the TTI hook indicates this is profitable, request predication.
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If values have been set for this Def, return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));

  // Set the insert point after the last scalarized instruction. This
  // ensures the insertelement sequence will directly follow the scalar
  // definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP = std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from poison.
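    // A rough sketch of the shape of the IR built by the loop below, with
    // placeholder names and a fixed VF:
    //   %pack0 = insertelement <VF x Ty> poison, Ty %lane0, i32 0
    //   %pack1 = insertelement <VF x Ty> %pack0, Ty %lane1, i32 1
    //   ...one insertelement per lane...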
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated using a wider floating
// point type, there will be a performance penalty from the conversion
// overhead and the change in the vector width.
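// For instance, a loop along these lines (illustrative only; any float store
// fed by an fpext-widened computation would behave similarly):
//   void scale(float *A, double D, int N) {
//     for (int i = 0; i < N; ++i)
//       A[i] = A[i] * D; // A[i] is extended to double, then truncated back.
//   }
// would trigger the remark, since the fpext/fptrunc pair forces the vector
// code to operate at the wider element width.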
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem to be correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!LVL.canVectorizeFPMath(EnableStrictReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided not to vectorize the loop, then interleave it.
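      // (InnerLoopUnroller is a thin wrapper around InnerLoopVectorizer that
      // pins the vectorization factor to a scalar VF, so interleaving-only
      // codegen reuses the same VPlan execution path.)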
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // The second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}