//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
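// For illustration only (example not taken from this file): with a
// vectorization factor VF = 4 and no interleaving, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually turned into a vector loop whose single 'wide' iteration
// handles four consecutive elements, with the leftover n % 4 iterations
// running in the scalar epilogue loop:
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)   // vector body: index advances by VF = 4
//     /* <4 x i32> loads of b[i..i+3] and c[i..i+3], vector add, store */;
//   for (; i < n; ++i)           // scalar epilogue handles the remainder
//     a[i] = b[i] + c[i];
//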
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization "
                          "if tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a "
             "loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));
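// For illustration only (worked example, not from this file): interleaving
// unrolls the vector body by the interleave count IC on top of the
// vectorization factor VF. With VF = 4 and IC = 2, each vector-loop iteration
// advances the induction variable by VF * IC = 8 and issues two independent
// 4-wide copies of each widened instruction, which helps hide latency and
// keep multiple execution ports busy. For a trip count of 100, the vector
// loop then runs 100 / 8 = 12 iterations covering 96 elements, and the
// remaining 4 iterations execute in the scalar epilogue.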
static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

// FIXME: When loop hints are passed which allow reordering of FP operations,
// we still choose to use strict reductions with this flag. We should instead
// use the default behaviour of vectorizing with unordered reductions if
// reordering is allowed.
cl::opt<bool> EnableStrictReductions(
    "enable-strict-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
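// For illustration only (example not from this file): on a typical x86-64
// data layout, i32 is regular (allocation size and type size are both 32
// bits), whereas x86_fp80 is irregular (allocation size 128 bits vs. type
// size 80 bits), so an array of x86_fp80 cannot be reinterpreted as a vector
// without accounting for the padding between elements.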
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead
  /// of \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);
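  // For illustration only (example not from this file): with Val being a
  // <4 x i32> zeroinitializer, StartIdx = 2 and Step = 1, getStepVector
  // returns <2, 3, 4, 5>; with Step = 3 it returns <6, 9, 12, 15>.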
  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  /// Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
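// For illustration only (sketch, not from this file): once both vectorization
// passes have run, the generated control flow looks roughly like
//
//   iteration-count & safety checks
//     -> main vector loop (MainLoopVF x MainLoopUF)
//     -> epilogue iteration-count check
//     -> epilogue vector loop (EpilogueVF x EpilogueUF)
//     -> scalar remainder loop
//
// with the blocks recorded in EpilogueLoopVectorizationInfo above so that the
// second pass can reuse the checks created by the first.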
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
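// For illustration only (example not from this file): with Step = 2, the
// helper createStepForVF below folds to the constant 8 for a fixed VF of 4,
// whereas for a scalable VF of <vscale x 4> it emits the runtime value
// 8 * vscale (materialized via IRBuilder::CreateVScale).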
/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step,
                              ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1280 void setCostBasedWideningDecision(ElementCount VF); 1281 1282 /// A struct that represents some properties of the register usage 1283 /// of a loop. 1284 struct RegisterUsage { 1285 /// Holds the number of loop invariant values that are used in the loop. 1286 /// The key is ClassID of target-provided register class. 1287 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1288 /// Holds the maximum number of concurrent live intervals in the loop. 1289 /// The key is ClassID of target-provided register class. 1290 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1291 }; 1292 1293 /// \return Returns information about the register usages of the loop for the 1294 /// given vectorization factors. 1295 SmallVector<RegisterUsage, 8> 1296 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1297 1298 /// Collect values we want to ignore in the cost model. 1299 void collectValuesToIgnore(); 1300 1301 /// Split reductions into those that happen in the loop, and those that happen 1302 /// outside. In loop reductions are collected into InLoopReductionChains. 1303 void collectInLoopReductions(); 1304 1305 /// \returns The smallest bitwidth each instruction can be represented with. 1306 /// The vector equivalents of these instructions should be truncated to this 1307 /// type. 1308 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1309 return MinBWs; 1310 } 1311 1312 /// \returns True if it is more profitable to scalarize instruction \p I for 1313 /// vectorization factor \p VF. 1314 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1315 assert(VF.isVector() && 1316 "Profitable to scalarize relevant only for VF > 1."); 1317 1318 // Cost model is not run in the VPlan-native path - return conservative 1319 // result until this changes. 1320 if (EnableVPlanNativePath) 1321 return false; 1322 1323 auto Scalars = InstsToScalarize.find(VF); 1324 assert(Scalars != InstsToScalarize.end() && 1325 "VF not yet analyzed for scalarization profitability"); 1326 return Scalars->second.find(I) != Scalars->second.end(); 1327 } 1328 1329 /// Returns true if \p I is known to be uniform after vectorization. 1330 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1331 if (VF.isScalar()) 1332 return true; 1333 1334 // Cost model is not run in the VPlan-native path - return conservative 1335 // result until this changes. 1336 if (EnableVPlanNativePath) 1337 return false; 1338 1339 auto UniformsPerVF = Uniforms.find(VF); 1340 assert(UniformsPerVF != Uniforms.end() && 1341 "VF not yet analyzed for uniformity"); 1342 return UniformsPerVF->second.count(I); 1343 } 1344 1345 /// Returns true if \p I is known to be scalar after vectorization. 1346 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1347 if (VF.isScalar()) 1348 return true; 1349 1350 // Cost model is not run in the VPlan-native path - return conservative 1351 // result until this changes. 1352 if (EnableVPlanNativePath) 1353 return false; 1354 1355 auto ScalarsPerVF = Scalars.find(VF); 1356 assert(ScalarsPerVF != Scalars.end() && 1357 "Scalar values are not calculated for VF"); 1358 return ScalarsPerVF->second.count(I); 1359 } 1360 1361 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1362 /// for vectorization factor \p VF. 
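/// As an illustrative example (not an exhaustive rule): if MinBWs records
/// that a 32-bit add only ever feeds users that need its low 8 bits, the
/// widened add may be performed on a <VF x i8> vector and extended only where
/// a wider value is actually required.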
1363 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1364 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1365 !isProfitableToScalarize(I, VF) &&
1366 !isScalarAfterVectorization(I, VF);
1367 }
1368
1369 /// Decision that was taken during cost calculation for memory instruction.
1370 enum InstWidening {
1371 CM_Unknown,
1372 CM_Widen, // For consecutive accesses with stride +1.
1373 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1374 CM_Interleave,
1375 CM_GatherScatter,
1376 CM_Scalarize
1377 };
1378
1379 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1380 /// instruction \p I and vector width \p VF.
1381 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1382 InstructionCost Cost) {
1383 assert(VF.isVector() && "Expected VF >=2");
1384 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1385 }
1386
1387 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1388 /// interleaving group \p Grp and vector width \p VF.
1389 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1390 ElementCount VF, InstWidening W,
1391 InstructionCost Cost) {
1392 assert(VF.isVector() && "Expected VF >=2");
1393 /// Broadcast this decision to all instructions inside the group.
1394 /// But the cost will be assigned to one instruction only.
1395 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1396 if (auto *I = Grp->getMember(i)) {
1397 if (Grp->getInsertPos() == I)
1398 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1399 else
1400 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1401 }
1402 }
1403 }
1404
1405 /// Return the cost model decision for the given instruction \p I and vector
1406 /// width \p VF. Return CM_Unknown if this instruction did not pass
1407 /// through the cost modeling.
1408 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1409 assert(VF.isVector() && "Expected VF to be a vector VF");
1410 // Cost model is not run in the VPlan-native path - return conservative
1411 // result until this changes.
1412 if (EnableVPlanNativePath)
1413 return CM_GatherScatter;
1414
1415 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1416 auto Itr = WideningDecisions.find(InstOnVF);
1417 if (Itr == WideningDecisions.end())
1418 return CM_Unknown;
1419 return Itr->second.first;
1420 }
1421
1422 /// Return the vectorization cost for the given instruction \p I and vector
1423 /// width \p VF.
1424 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1425 assert(VF.isVector() && "Expected VF >=2");
1426 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1427 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1428 "The cost is not calculated");
1429 return WideningDecisions[InstOnVF].second;
1430 }
1431
1432 /// Return true if instruction \p I is an optimizable truncate whose operand
1433 /// is an induction variable. Such a truncate will be removed by adding a new
1434 /// induction variable with the destination type.
1435 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1436 // If the instruction is not a truncate, return false.
1437 auto *Trunc = dyn_cast<TruncInst>(I);
1438 if (!Trunc)
1439 return false;
1440
1441 // Get the source and destination types of the truncate.
1442 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1443 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1444 1445 // If the truncate is free for the given types, return false. Replacing a 1446 // free truncate with an induction variable would add an induction variable 1447 // update instruction to each iteration of the loop. We exclude from this 1448 // check the primary induction variable since it will need an update 1449 // instruction regardless. 1450 Value *Op = Trunc->getOperand(0); 1451 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1452 return false; 1453 1454 // If the truncated value is not an induction variable, return false. 1455 return Legal->isInductionPhi(Op); 1456 } 1457 1458 /// Collects the instructions to scalarize for each predicated instruction in 1459 /// the loop. 1460 void collectInstsToScalarize(ElementCount VF); 1461 1462 /// Collect Uniform and Scalar values for the given \p VF. 1463 /// The sets depend on CM decision for Load/Store instructions 1464 /// that may be vectorized as interleave, gather-scatter or scalarized. 1465 void collectUniformsAndScalars(ElementCount VF) { 1466 // Do the analysis once. 1467 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1468 return; 1469 setCostBasedWideningDecision(VF); 1470 collectLoopUniforms(VF); 1471 collectLoopScalars(VF); 1472 } 1473 1474 /// Returns true if the target machine supports masked store operation 1475 /// for the given \p DataType and kind of access to \p Ptr. 1476 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1477 return Legal->isConsecutivePtr(Ptr) && 1478 TTI.isLegalMaskedStore(DataType, Alignment); 1479 } 1480 1481 /// Returns true if the target machine supports masked load operation 1482 /// for the given \p DataType and kind of access to \p Ptr. 1483 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1484 return Legal->isConsecutivePtr(Ptr) && 1485 TTI.isLegalMaskedLoad(DataType, Alignment); 1486 } 1487 1488 /// Returns true if the target machine supports masked scatter operation 1489 /// for the given \p DataType. 1490 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { 1491 return TTI.isLegalMaskedScatter(DataType, Alignment); 1492 } 1493 1494 /// Returns true if the target machine supports masked gather operation 1495 /// for the given \p DataType. 1496 bool isLegalMaskedGather(Type *DataType, Align Alignment) const { 1497 return TTI.isLegalMaskedGather(DataType, Alignment); 1498 } 1499 1500 /// Returns true if the target machine can represent \p V as a masked gather 1501 /// or scatter operation. 1502 bool isLegalGatherOrScatter(Value *V) { 1503 bool LI = isa<LoadInst>(V); 1504 bool SI = isa<StoreInst>(V); 1505 if (!LI && !SI) 1506 return false; 1507 auto *Ty = getMemInstValueType(V); 1508 Align Align = getLoadStoreAlignment(V); 1509 return (LI && isLegalMaskedGather(Ty, Align)) || 1510 (SI && isLegalMaskedScatter(Ty, Align)); 1511 } 1512 1513 /// Returns true if the target machine supports all of the reduction 1514 /// variables found for the given VF. 1515 bool canVectorizeReductions(ElementCount VF) { 1516 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1517 RecurrenceDescriptor RdxDesc = Reduction.second; 1518 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1519 })); 1520 } 1521 1522 /// Returns true if \p I is an instruction that will be scalarized with 1523 /// predication. 
Such instructions include conditional stores and
1524 /// instructions that may divide by zero.
1525 /// If a non-zero VF has been calculated, we check if I will be scalarized
1526 /// with predication for that VF.
1527 bool isScalarWithPredication(Instruction *I) const;
1528
1529 // Returns true if \p I is an instruction that will be predicated either
1530 // through scalar predication or masked load/store or masked gather/scatter.
1531 // Superset of instructions that return true for isScalarWithPredication.
1532 bool isPredicatedInst(Instruction *I) {
1533 if (!blockNeedsPredication(I->getParent()))
1534 return false;
1535 // Loads and stores that need some form of masked operation are predicated
1536 // instructions.
1537 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1538 return Legal->isMaskRequired(I);
1539 return isScalarWithPredication(I);
1540 }
1541
1542 /// Returns true if \p I is a memory instruction with consecutive memory
1543 /// access that can be widened.
1544 bool
1545 memoryInstructionCanBeWidened(Instruction *I,
1546 ElementCount VF = ElementCount::getFixed(1));
1547
1548 /// Returns true if \p I is a memory instruction in an interleaved-group
1549 /// of memory accesses that can be vectorized with wide vector loads/stores
1550 /// and shuffles.
1551 bool
1552 interleavedAccessCanBeWidened(Instruction *I,
1553 ElementCount VF = ElementCount::getFixed(1));
1554
1555 /// Check if \p Instr belongs to any interleaved access group.
1556 bool isAccessInterleaved(Instruction *Instr) {
1557 return InterleaveInfo.isInterleaved(Instr);
1558 }
1559
1560 /// Get the interleaved access group that \p Instr belongs to.
1561 const InterleaveGroup<Instruction> *
1562 getInterleavedAccessGroup(Instruction *Instr) {
1563 return InterleaveInfo.getInterleaveGroup(Instr);
1564 }
1565
1566 /// Returns true if we're required to use a scalar epilogue for at least
1567 /// the final iteration of the original loop.
1568 bool requiresScalarEpilogue() const {
1569 if (!isScalarEpilogueAllowed())
1570 return false;
1571 // If we might exit from anywhere but the latch, must run the exiting
1572 // iteration in scalar form.
1573 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1574 return true;
1575 return InterleaveInfo.requiresScalarEpilogue();
1576 }
1577
1578 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1579 /// disallowed due to optsize or a loop hint annotation.
1580 bool isScalarEpilogueAllowed() const {
1581 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1582 }
1583
1584 /// Returns true if all loop blocks should be masked to fold the tail loop.
1585 bool foldTailByMasking() const { return FoldTailByMasking; }
1586
1587 bool blockNeedsPredication(BasicBlock *BB) const {
1588 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1589 }
1590
1591 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1592 /// nodes to the chain of instructions representing the reductions. Uses a
1593 /// MapVector to ensure deterministic iteration order.
1594 using ReductionChainMap =
1595 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1596
1597 /// Return the chain of instructions representing an inloop reduction.
1598 const ReductionChainMap &getInLoopReductionChains() const {
1599 return InLoopReductionChains;
1600 }
1601
1602 /// Returns true if the Phi is part of an inloop reduction.
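/// Illustrative example: for a loop computing `sum += a[i]`, the `sum` PHI is
/// an in-loop reduction exactly when collectInLoopReductions() decided to
/// keep the reduction arithmetic inside the loop (recorded in
/// InLoopReductionChains) instead of reducing a vector accumulator once after
/// the loop.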
1603 bool isInLoopReduction(PHINode *Phi) const {
1604 return InLoopReductionChains.count(Phi);
1605 }
1606
1607 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1608 /// with factor VF. Return the cost of the instruction, including
1609 /// scalarization overhead if it's needed.
1610 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1611
1612 /// Estimate cost of a call instruction CI if it were vectorized with factor
1613 /// VF. Return the cost of the instruction, including scalarization overhead
1614 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1615 /// scalarized, i.e. either no vector version of the call is available or it
1616 /// is too expensive.
1617 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1618 bool &NeedToScalarize) const;
1619
1620 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1621 /// that of B.
1622 bool isMoreProfitable(const VectorizationFactor &A,
1623 const VectorizationFactor &B) const;
1624
1625 /// Invalidates decisions already taken by the cost model.
1626 void invalidateCostModelingDecisions() {
1627 WideningDecisions.clear();
1628 Uniforms.clear();
1629 Scalars.clear();
1630 }
1631
1632 private:
1633 unsigned NumPredStores = 0;
1634
1635 /// \return An upper bound for the vectorization factors for both
1636 /// fixed and scalable vectorization, where the minimum-known number of
1637 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1638 /// disabled or unsupported, then the scalable part will be equal to
1639 /// ElementCount::getScalable(0).
1640 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1641 ElementCount UserVF);
1642
1643 /// \return the maximized element count based on the target's vector
1644 /// registers and the loop trip-count, but limited to a maximum safe VF.
1645 /// This is a helper function of computeFeasibleMaxVF.
1646 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1647 /// issue that occurred on one of the buildbots which cannot be reproduced
1648 /// without having access to the proprietary compiler (see comments on
1649 /// D98509). The issue is currently under investigation and this workaround
1650 /// will be removed as soon as possible.
1651 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1652 unsigned SmallestType,
1653 unsigned WidestType,
1654 const ElementCount &MaxSafeVF);
1655
1656 /// \return the maximum legal scalable VF, based on the safe max number
1657 /// of elements.
1658 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1659
1660 /// The vectorization cost is a combination of the cost itself and a boolean
1661 /// indicating whether any of the contributing operations will actually
1662 /// operate on vector values after type legalization in the backend. If this
1663 /// latter value is false, then all operations will be scalarized (i.e. no
1664 /// vectorization has actually taken place).
1667 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1668
1669 /// Returns the expected execution cost. The unit of the cost does
1670 /// not matter because we use the 'cost' units to compare different
1671 /// vector widths. The cost that is returned is *not* normalized by
1672 /// the factor width.
1673 VectorizationCostTy expectedCost(ElementCount VF);
1674
1675 /// Returns the execution time cost of an instruction for a given vector
1676 /// width. Vector width of one means scalar.
1677 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1678
1679 /// The cost-computation logic from getInstructionCost which provides
1680 /// the vector type as an output parameter.
1681 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1682 Type *&VectorTy);
1683
1684 /// Return the cost of instructions in an inloop reduction pattern, if I is
1685 /// part of that pattern.
1686 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1687 Type *VectorTy,
1688 TTI::TargetCostKind CostKind);
1689
1690 /// Calculate vectorization cost of memory instruction \p I.
1691 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1692
1693 /// The cost computation for a scalarized memory instruction.
1694 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1695
1696 /// The cost computation for an interleave group of memory instructions.
1697 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1698
1699 /// The cost computation for a gather/scatter instruction.
1700 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1701
1702 /// The cost computation for widening instruction \p I with consecutive
1703 /// memory access.
1704 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1705
1706 /// The cost calculation for Load/Store instruction \p I with a uniform pointer:
1707 /// Load: scalar load + broadcast.
1708 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1709 /// element)
1710 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1711
1712 /// Estimate the overhead of scalarizing an instruction. This is a
1713 /// convenience wrapper for the type-based getScalarizationOverhead API.
1714 InstructionCost getScalarizationOverhead(Instruction *I,
1715 ElementCount VF) const;
1716
1717 /// Returns whether the instruction is a load or store and will be emitted
1718 /// as a vector operation.
1719 bool isConsecutiveLoadOrStore(Instruction *I);
1720
1721 /// Returns true if an artificially high cost for emulated masked memrefs
1722 /// should be used.
1723 bool useEmulatedMaskMemRefHack(Instruction *I);
1724
1725 /// Map of scalar integer values to the smallest bitwidth they can be legally
1726 /// represented as. The vector equivalents of these values should be truncated
1727 /// to this type.
1728 MapVector<Instruction *, uint64_t> MinBWs;
1729
1730 /// A type representing the costs for instructions if they were to be
1731 /// scalarized rather than vectorized. The entries are Instruction-Cost
1732 /// pairs.
1733 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1734
1735 /// A set containing all BasicBlocks that are known to be present after
1736 /// vectorization as a predicated block.
1737 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1738
1739 /// Records whether it is allowed to have the original scalar loop execute at
1740 /// least once. This may be needed as a fallback loop in case runtime
1741 /// aliasing/dependence checks fail, or to handle the tail/remainder
1742 /// iterations when the trip count is unknown or doesn't divide by the VF,
1743 /// or as a peel-loop to handle gaps in interleave-groups.
1744 /// Under optsize and when the trip count is very small we don't allow any
1745 /// iterations to execute in the scalar loop.
1746 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1747 1748 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1749 bool FoldTailByMasking = false; 1750 1751 /// A map holding scalar costs for different vectorization factors. The 1752 /// presence of a cost for an instruction in the mapping indicates that the 1753 /// instruction will be scalarized when vectorizing with the associated 1754 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1755 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1756 1757 /// Holds the instructions known to be uniform after vectorization. 1758 /// The data is collected per VF. 1759 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1760 1761 /// Holds the instructions known to be scalar after vectorization. 1762 /// The data is collected per VF. 1763 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1764 1765 /// Holds the instructions (address computations) that are forced to be 1766 /// scalarized. 1767 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1768 1769 /// PHINodes of the reductions that should be expanded in-loop along with 1770 /// their associated chains of reduction operations, in program order from top 1771 /// (PHI) to bottom 1772 ReductionChainMap InLoopReductionChains; 1773 1774 /// A Map of inloop reduction operations and their immediate chain operand. 1775 /// FIXME: This can be removed once reductions can be costed correctly in 1776 /// vplan. This was added to allow quick lookup to the inloop operations, 1777 /// without having to loop through InLoopReductionChains. 1778 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1779 1780 /// Returns the expected difference in cost from scalarizing the expression 1781 /// feeding a predicated instruction \p PredInst. The instructions to 1782 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1783 /// non-negative return value implies the expression will be scalarized. 1784 /// Currently, only single-use chains are considered for scalarization. 1785 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1786 ElementCount VF); 1787 1788 /// Collect the instructions that are uniform after vectorization. An 1789 /// instruction is uniform if we represent it with a single scalar value in 1790 /// the vectorized loop corresponding to each vector iteration. Examples of 1791 /// uniform instructions include pointer operands of consecutive or 1792 /// interleaved memory accesses. Note that although uniformity implies an 1793 /// instruction will be scalar, the reverse is not true. In general, a 1794 /// scalarized instruction will be represented by VF scalar values in the 1795 /// vectorized loop, each corresponding to an iteration of the original 1796 /// scalar loop. 1797 void collectLoopUniforms(ElementCount VF); 1798 1799 /// Collect the instructions that are scalar after vectorization. An 1800 /// instruction is scalar if it is known to be uniform or will be scalarized 1801 /// during vectorization. Non-uniform scalarized instructions will be 1802 /// represented by VF values in the vectorized loop, each corresponding to an 1803 /// iteration of the original scalar loop. 1804 void collectLoopScalars(ElementCount VF); 1805 1806 /// Keeps cost model vectorization decision and cost for instructions. 1807 /// Right now it is used for memory instructions only. 
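/// Illustrative entry: ((store %v, %p), VF=4) -> (CM_Widen, Cost) records
/// that, for a vectorization factor of 4, this store is widened into a single
/// vector store with the given cost.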
1808 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1809 std::pair<InstWidening, InstructionCost>>; 1810 1811 DecisionList WideningDecisions; 1812 1813 /// Returns true if \p V is expected to be vectorized and it needs to be 1814 /// extracted. 1815 bool needsExtract(Value *V, ElementCount VF) const { 1816 Instruction *I = dyn_cast<Instruction>(V); 1817 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1818 TheLoop->isLoopInvariant(I)) 1819 return false; 1820 1821 // Assume we can vectorize V (and hence we need extraction) if the 1822 // scalars are not computed yet. This can happen, because it is called 1823 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1824 // the scalars are collected. That should be a safe assumption in most 1825 // cases, because we check if the operands have vectorizable types 1826 // beforehand in LoopVectorizationLegality. 1827 return Scalars.find(VF) == Scalars.end() || 1828 !isScalarAfterVectorization(I, VF); 1829 }; 1830 1831 /// Returns a range containing only operands needing to be extracted. 1832 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1833 ElementCount VF) const { 1834 return SmallVector<Value *, 4>(make_filter_range( 1835 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1836 } 1837 1838 /// Determines if we have the infrastructure to vectorize loop \p L and its 1839 /// epilogue, assuming the main loop is vectorized by \p VF. 1840 bool isCandidateForEpilogueVectorization(const Loop &L, 1841 const ElementCount VF) const; 1842 1843 /// Returns true if epilogue vectorization is considered profitable, and 1844 /// false otherwise. 1845 /// \p VF is the vectorization factor chosen for the original loop. 1846 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1847 1848 public: 1849 /// The loop that we evaluate. 1850 Loop *TheLoop; 1851 1852 /// Predicated scalar evolution analysis. 1853 PredicatedScalarEvolution &PSE; 1854 1855 /// Loop Info analysis. 1856 LoopInfo *LI; 1857 1858 /// Vectorization legality. 1859 LoopVectorizationLegality *Legal; 1860 1861 /// Vector target information. 1862 const TargetTransformInfo &TTI; 1863 1864 /// Target Library Info. 1865 const TargetLibraryInfo *TLI; 1866 1867 /// Demanded bits analysis. 1868 DemandedBits *DB; 1869 1870 /// Assumption cache. 1871 AssumptionCache *AC; 1872 1873 /// Interface to emit optimization remarks. 1874 OptimizationRemarkEmitter *ORE; 1875 1876 const Function *TheFunction; 1877 1878 /// Loop Vectorize Hint. 1879 const LoopVectorizeHints *Hints; 1880 1881 /// The interleave access information contains groups of interleaved accesses 1882 /// with the same stride and close to each other. 1883 InterleavedAccessInfo &InterleaveInfo; 1884 1885 /// Values to ignore in the cost model. 1886 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1887 1888 /// Values to ignore in the cost model when VF > 1. 1889 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1890 1891 /// Profitable vector factors. 1892 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1893 }; 1894 } // end namespace llvm 1895 1896 /// Helper struct to manage generating runtime checks for vectorization. 1897 /// 1898 /// The runtime checks are created up-front in temporary blocks to allow better 1899 /// estimating the cost and un-linked from the existing IR. After deciding to 1900 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1901 /// temporary blocks are completely removed. 
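/// Rough usage sketch (illustrative only; SE, DT, LI, DL, L, LAI, UnionPred,
/// ScalarPH, VectorPH and ExitBB stand for values the real call sites below
/// already have at hand):
///
///   GeneratedRTChecks Checks(SE, DT, LI, DL);
///   Checks.Create(L, LAI, UnionPred);   // build check blocks for costing
///   // ...decide whether to vectorize, accounting for the check cost...
///   Checks.emitSCEVChecks(L, ScalarPH, VectorPH, ExitBB);
///   Checks.emitMemRuntimeChecks(L, ScalarPH, VectorPH);
///   // If we bail out instead, the destructor erases the unused blocks.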
1902 class GeneratedRTChecks {
1903 /// Basic block which contains the generated SCEV checks, if any.
1904 BasicBlock *SCEVCheckBlock = nullptr;
1905
1906 /// The value representing the result of the generated SCEV checks. If it is
1907 /// nullptr, either no SCEV checks have been generated or they have been used.
1908 Value *SCEVCheckCond = nullptr;
1909
1910 /// Basic block which contains the generated memory runtime checks, if any.
1911 BasicBlock *MemCheckBlock = nullptr;
1912
1913 /// The value representing the result of the generated memory runtime checks.
1914 /// If it is nullptr, either no memory runtime checks have been generated or
1915 /// they have been used.
1916 Instruction *MemRuntimeCheckCond = nullptr;
1917
1918 DominatorTree *DT;
1919 LoopInfo *LI;
1920
1921 SCEVExpander SCEVExp;
1922 SCEVExpander MemCheckExp;
1923
1924 public:
1925 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1926 const DataLayout &DL)
1927 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1928 MemCheckExp(SE, DL, "scev.check") {}
1929
1930 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1931 /// accurately estimate the cost of the runtime checks. The blocks are
1932 /// un-linked from the IR and are added back during vector code generation. If
1933 /// there is no vector code generation, the check blocks are removed
1934 /// completely.
1935 void Create(Loop *L, const LoopAccessInfo &LAI,
1936 const SCEVUnionPredicate &UnionPred) {
1937
1938 BasicBlock *LoopHeader = L->getHeader();
1939 BasicBlock *Preheader = L->getLoopPreheader();
1940
1941 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1942 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1943 // may be used by SCEVExpander. The blocks will be un-linked from their
1944 // predecessors and removed from LI & DT at the end of the function.
1945 if (!UnionPred.isAlwaysTrue()) {
1946 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1947 nullptr, "vector.scevcheck");
1948
1949 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1950 &UnionPred, SCEVCheckBlock->getTerminator());
1951 }
1952
1953 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1954 if (RtPtrChecking.Need) {
1955 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1956 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1957 "vector.memcheck");
1958
1959 std::tie(std::ignore, MemRuntimeCheckCond) =
1960 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1961 RtPtrChecking.getChecks(), MemCheckExp);
1962 assert(MemRuntimeCheckCond &&
1963 "no RT checks generated although RtPtrChecking "
1964 "claimed checks are required");
1965 }
1966
1967 if (!MemCheckBlock && !SCEVCheckBlock)
1968 return;
1969
1970 // Unhook the temporary blocks with the checks, update various places
1971 // accordingly.
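// Illustratively, the SplitBlock calls above left a straight-line chain
//   Preheader -> vector.scevcheck -> vector.memcheck -> LoopHeader
// (either check block may be absent). The code below takes the check blocks
// back out of that chain so that Preheader again branches directly to
// LoopHeader, while the check blocks are kept around, terminated by
// 'unreachable' and removed from LI/DT, until emitSCEVChecks /
// emitMemRuntimeChecks re-insert them or the destructor deletes them.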
1972 if (SCEVCheckBlock) 1973 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1974 if (MemCheckBlock) 1975 MemCheckBlock->replaceAllUsesWith(Preheader); 1976 1977 if (SCEVCheckBlock) { 1978 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1979 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1980 Preheader->getTerminator()->eraseFromParent(); 1981 } 1982 if (MemCheckBlock) { 1983 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1984 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1985 Preheader->getTerminator()->eraseFromParent(); 1986 } 1987 1988 DT->changeImmediateDominator(LoopHeader, Preheader); 1989 if (MemCheckBlock) { 1990 DT->eraseNode(MemCheckBlock); 1991 LI->removeBlock(MemCheckBlock); 1992 } 1993 if (SCEVCheckBlock) { 1994 DT->eraseNode(SCEVCheckBlock); 1995 LI->removeBlock(SCEVCheckBlock); 1996 } 1997 } 1998 1999 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2000 /// unused. 2001 ~GeneratedRTChecks() { 2002 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 2003 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 2004 if (!SCEVCheckCond) 2005 SCEVCleaner.markResultUsed(); 2006 2007 if (!MemRuntimeCheckCond) 2008 MemCheckCleaner.markResultUsed(); 2009 2010 if (MemRuntimeCheckCond) { 2011 auto &SE = *MemCheckExp.getSE(); 2012 // Memory runtime check generation creates compares that use expanded 2013 // values. Remove them before running the SCEVExpanderCleaners. 2014 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2015 if (MemCheckExp.isInsertedInstruction(&I)) 2016 continue; 2017 SE.forgetValue(&I); 2018 SE.eraseValueFromMap(&I); 2019 I.eraseFromParent(); 2020 } 2021 } 2022 MemCheckCleaner.cleanup(); 2023 SCEVCleaner.cleanup(); 2024 2025 if (SCEVCheckCond) 2026 SCEVCheckBlock->eraseFromParent(); 2027 if (MemRuntimeCheckCond) 2028 MemCheckBlock->eraseFromParent(); 2029 } 2030 2031 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2032 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2033 /// depending on the generated condition. 2034 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2035 BasicBlock *LoopVectorPreHeader, 2036 BasicBlock *LoopExitBlock) { 2037 if (!SCEVCheckCond) 2038 return nullptr; 2039 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2040 if (C->isZero()) 2041 return nullptr; 2042 2043 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2044 2045 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2046 // Create new preheader for vector loop. 2047 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2048 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2049 2050 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2051 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2052 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2053 SCEVCheckBlock); 2054 2055 DT->addNewBlock(SCEVCheckBlock, Pred); 2056 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2057 2058 ReplaceInstWithInst( 2059 SCEVCheckBlock->getTerminator(), 2060 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2061 // Mark the check as used, to prevent it from being removed during cleanup. 2062 SCEVCheckCond = nullptr; 2063 return SCEVCheckBlock; 2064 } 2065 2066 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2067 /// the branches to branch to the vector preheader or \p Bypass, depending on 2068 /// the generated condition. 
2069 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2070 BasicBlock *LoopVectorPreHeader) { 2071 // Check if we generated code that checks in runtime if arrays overlap. 2072 if (!MemRuntimeCheckCond) 2073 return nullptr; 2074 2075 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2076 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2077 MemCheckBlock); 2078 2079 DT->addNewBlock(MemCheckBlock, Pred); 2080 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2081 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2082 2083 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2084 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2085 2086 ReplaceInstWithInst( 2087 MemCheckBlock->getTerminator(), 2088 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2089 MemCheckBlock->getTerminator()->setDebugLoc( 2090 Pred->getTerminator()->getDebugLoc()); 2091 2092 // Mark the check as used, to prevent it from being removed during cleanup. 2093 MemRuntimeCheckCond = nullptr; 2094 return MemCheckBlock; 2095 } 2096 }; 2097 2098 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2099 // vectorization. The loop needs to be annotated with #pragma omp simd 2100 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2101 // vector length information is not provided, vectorization is not considered 2102 // explicit. Interleave hints are not allowed either. These limitations will be 2103 // relaxed in the future. 2104 // Please, note that we are currently forced to abuse the pragma 'clang 2105 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2106 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2107 // provides *explicit vectorization hints* (LV can bypass legal checks and 2108 // assume that vectorization is legal). However, both hints are implemented 2109 // using the same metadata (llvm.loop.vectorize, processed by 2110 // LoopVectorizeHints). This will be fixed in the future when the native IR 2111 // representation for pragma 'omp simd' is introduced. 2112 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2113 OptimizationRemarkEmitter *ORE) { 2114 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2115 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2116 2117 // Only outer loops with an explicit vectorization hint are supported. 2118 // Unannotated outer loops are ignored. 2119 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2120 return false; 2121 2122 Function *Fn = OuterLp->getHeader()->getParent(); 2123 if (!Hints.allowVectorization(Fn, OuterLp, 2124 true /*VectorizeOnlyWhenForced*/)) { 2125 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2126 return false; 2127 } 2128 2129 if (Hints.getInterleave() > 1) { 2130 // TODO: Interleave support is future work. 2131 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2132 "outer loops.\n"); 2133 Hints.emitRemarkWithHints(); 2134 return false; 2135 } 2136 2137 return true; 2138 } 2139 2140 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2141 OptimizationRemarkEmitter *ORE, 2142 SmallVectorImpl<Loop *> &V) { 2143 // Collect inner loops and outer loops without irreducible control flow. For 2144 // now, only collect outer loops that have explicit vectorization hints. If we 2145 // are stress testing the VPlan H-CFG construction, we collect the outermost 2146 // loop of every loop nest. 
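// For example (illustrative): in a two-deep nest whose outer loop carries an
// explicit '#pragma omp simd' hint and with the VPlan-native path enabled,
// only the outer loop is collected (assuming its CFG is reducible) and we
// return; without such a hint we recurse and eventually collect the
// innermost loop.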
2147 if (L.isInnermost() || VPlanBuildStressTest || 2148 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2149 LoopBlocksRPO RPOT(&L); 2150 RPOT.perform(LI); 2151 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2152 V.push_back(&L); 2153 // TODO: Collect inner loops inside marked outer loops in case 2154 // vectorization fails for the outer loop. Do not invoke 2155 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2156 // already known to be reducible. We can use an inherited attribute for 2157 // that. 2158 return; 2159 } 2160 } 2161 for (Loop *InnerL : L) 2162 collectSupportedLoops(*InnerL, LI, ORE, V); 2163 } 2164 2165 namespace { 2166 2167 /// The LoopVectorize Pass. 2168 struct LoopVectorize : public FunctionPass { 2169 /// Pass identification, replacement for typeid 2170 static char ID; 2171 2172 LoopVectorizePass Impl; 2173 2174 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2175 bool VectorizeOnlyWhenForced = false) 2176 : FunctionPass(ID), 2177 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2178 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2179 } 2180 2181 bool runOnFunction(Function &F) override { 2182 if (skipFunction(F)) 2183 return false; 2184 2185 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2186 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2187 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2188 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2189 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2190 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2191 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2192 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2193 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2194 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2195 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2196 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2197 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2198 2199 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2200 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2201 2202 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2203 GetLAA, *ORE, PSI).MadeAnyChange; 2204 } 2205 2206 void getAnalysisUsage(AnalysisUsage &AU) const override { 2207 AU.addRequired<AssumptionCacheTracker>(); 2208 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2209 AU.addRequired<DominatorTreeWrapperPass>(); 2210 AU.addRequired<LoopInfoWrapperPass>(); 2211 AU.addRequired<ScalarEvolutionWrapperPass>(); 2212 AU.addRequired<TargetTransformInfoWrapperPass>(); 2213 AU.addRequired<AAResultsWrapperPass>(); 2214 AU.addRequired<LoopAccessLegacyAnalysis>(); 2215 AU.addRequired<DemandedBitsWrapperPass>(); 2216 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2217 AU.addRequired<InjectTLIMappingsLegacy>(); 2218 2219 // We currently do not preserve loopinfo/dominator analyses with outer loop 2220 // vectorization. Until this is addressed, mark these analyses as preserved 2221 // only for non-VPlan-native path. 2222 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2223 if (!EnableVPlanNativePath) { 2224 AU.addPreserved<LoopInfoWrapperPass>(); 2225 AU.addPreserved<DominatorTreeWrapperPass>(); 2226 } 2227 2228 AU.addPreserved<BasicAAWrapperPass>(); 2229 AU.addPreserved<GlobalsAAWrapperPass>(); 2230 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2231 } 2232 }; 2233 2234 } // end anonymous namespace 2235 2236 //===----------------------------------------------------------------------===// 2237 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2238 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2239 //===----------------------------------------------------------------------===// 2240 2241 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2242 // We need to place the broadcast of invariant variables outside the loop, 2243 // but only if it's proven safe to do so. Else, broadcast will be inside 2244 // vector loop body. 2245 Instruction *Instr = dyn_cast<Instruction>(V); 2246 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2247 (!Instr || 2248 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2249 // Place the code for broadcasting invariant variables in the new preheader. 2250 IRBuilder<>::InsertPointGuard Guard(Builder); 2251 if (SafeToHoist) 2252 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2253 2254 // Broadcast the scalar into all locations in the vector. 2255 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2256 2257 return Shuf; 2258 } 2259 2260 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2261 const InductionDescriptor &II, Value *Step, Value *Start, 2262 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2263 VPTransformState &State) { 2264 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2265 "Expected either an induction phi-node or a truncate of it!"); 2266 2267 // Construct the initial value of the vector IV in the vector loop preheader 2268 auto CurrIP = Builder.saveIP(); 2269 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2270 if (isa<TruncInst>(EntryVal)) { 2271 assert(Start->getType()->isIntegerTy() && 2272 "Truncation requires an integer type"); 2273 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2274 Step = Builder.CreateTrunc(Step, TruncType); 2275 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2276 } 2277 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2278 Value *SteppedStart = 2279 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2280 2281 // We create vector phi nodes for both integer and floating-point induction 2282 // variables. Here, we determine the kind of arithmetic we will perform. 2283 Instruction::BinaryOps AddOp; 2284 Instruction::BinaryOps MulOp; 2285 if (Step->getType()->isIntegerTy()) { 2286 AddOp = Instruction::Add; 2287 MulOp = Instruction::Mul; 2288 } else { 2289 AddOp = II.getInductionOpcode(); 2290 MulOp = Instruction::FMul; 2291 } 2292 2293 // Multiply the vectorization factor by the step using integer or 2294 // floating-point arithmetic as appropriate. 2295 Type *StepType = Step->getType(); 2296 if (Step->getType()->isFloatingPointTy()) 2297 StepType = IntegerType::get(StepType->getContext(), 2298 StepType->getScalarSizeInBits()); 2299 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2300 if (Step->getType()->isFloatingPointTy()) 2301 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2302 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2303 2304 // Create a vector splat to use in the induction update. 
2305 // 2306 // FIXME: If the step is non-constant, we create the vector splat with 2307 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2308 // handle a constant vector splat. 2309 Value *SplatVF = isa<Constant>(Mul) 2310 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2311 : Builder.CreateVectorSplat(VF, Mul); 2312 Builder.restoreIP(CurrIP); 2313 2314 // We may need to add the step a number of times, depending on the unroll 2315 // factor. The last of those goes into the PHI. 2316 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2317 &*LoopVectorBody->getFirstInsertionPt()); 2318 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2319 Instruction *LastInduction = VecInd; 2320 for (unsigned Part = 0; Part < UF; ++Part) { 2321 State.set(Def, LastInduction, Part); 2322 2323 if (isa<TruncInst>(EntryVal)) 2324 addMetadata(LastInduction, EntryVal); 2325 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2326 State, Part); 2327 2328 LastInduction = cast<Instruction>( 2329 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2330 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2331 } 2332 2333 // Move the last step to the end of the latch block. This ensures consistent 2334 // placement of all induction updates. 2335 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2336 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2337 auto *ICmp = cast<Instruction>(Br->getCondition()); 2338 LastInduction->moveBefore(ICmp); 2339 LastInduction->setName("vec.ind.next"); 2340 2341 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2342 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2343 } 2344 2345 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2346 return Cost->isScalarAfterVectorization(I, VF) || 2347 Cost->isProfitableToScalarize(I, VF); 2348 } 2349 2350 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2351 if (shouldScalarizeInstruction(IV)) 2352 return true; 2353 auto isScalarInst = [&](User *U) -> bool { 2354 auto *I = cast<Instruction>(U); 2355 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2356 }; 2357 return llvm::any_of(IV->users(), isScalarInst); 2358 } 2359 2360 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2361 const InductionDescriptor &ID, const Instruction *EntryVal, 2362 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2363 unsigned Part, unsigned Lane) { 2364 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2365 "Expected either an induction phi-node or a truncate of it!"); 2366 2367 // This induction variable is not the phi from the original loop but the 2368 // newly-created IV based on the proof that casted Phi is equal to the 2369 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2370 // re-uses the same InductionDescriptor that original IV uses but we don't 2371 // have to do any recording in this case - that is done when original IV is 2372 // processed. 2373 if (isa<TruncInst>(EntryVal)) 2374 return; 2375 2376 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2377 if (Casts.empty()) 2378 return; 2379 // Only the first Cast instruction in the Casts vector is of interest. 2380 // The rest of the Casts (if exist) have no uses outside the 2381 // induction update chain itself. 
2382 if (Lane < UINT_MAX) 2383 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2384 else 2385 State.set(CastDef, VectorLoopVal, Part); 2386 } 2387 2388 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2389 TruncInst *Trunc, VPValue *Def, 2390 VPValue *CastDef, 2391 VPTransformState &State) { 2392 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2393 "Primary induction variable must have an integer type"); 2394 2395 auto II = Legal->getInductionVars().find(IV); 2396 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2397 2398 auto ID = II->second; 2399 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2400 2401 // The value from the original loop to which we are mapping the new induction 2402 // variable. 2403 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2404 2405 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2406 2407 // Generate code for the induction step. Note that induction steps are 2408 // required to be loop-invariant 2409 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2410 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2411 "Induction step should be loop invariant"); 2412 if (PSE.getSE()->isSCEVable(IV->getType())) { 2413 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2414 return Exp.expandCodeFor(Step, Step->getType(), 2415 LoopVectorPreHeader->getTerminator()); 2416 } 2417 return cast<SCEVUnknown>(Step)->getValue(); 2418 }; 2419 2420 // The scalar value to broadcast. This is derived from the canonical 2421 // induction variable. If a truncation type is given, truncate the canonical 2422 // induction variable and step. Otherwise, derive these values from the 2423 // induction descriptor. 2424 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2425 Value *ScalarIV = Induction; 2426 if (IV != OldInduction) { 2427 ScalarIV = IV->getType()->isIntegerTy() 2428 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2429 : Builder.CreateCast(Instruction::SIToFP, Induction, 2430 IV->getType()); 2431 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2432 ScalarIV->setName("offset.idx"); 2433 } 2434 if (Trunc) { 2435 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2436 assert(Step->getType()->isIntegerTy() && 2437 "Truncation requires an integer step"); 2438 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2439 Step = Builder.CreateTrunc(Step, TruncType); 2440 } 2441 return ScalarIV; 2442 }; 2443 2444 // Create the vector values from the scalar IV, in the absence of creating a 2445 // vector IV. 2446 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2447 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2448 for (unsigned Part = 0; Part < UF; ++Part) { 2449 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2450 Value *EntryPart = 2451 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2452 ID.getInductionOpcode()); 2453 State.set(Def, EntryPart, Part); 2454 if (Trunc) 2455 addMetadata(EntryPart, Trunc); 2456 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2457 State, Part); 2458 } 2459 }; 2460 2461 // Fast-math-flags propagate from the original induction instruction. 
2462 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2463 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2464 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2465 2466 // Now do the actual transformations, and start with creating the step value. 2467 Value *Step = CreateStepValue(ID.getStep()); 2468 if (VF.isZero() || VF.isScalar()) { 2469 Value *ScalarIV = CreateScalarIV(Step); 2470 CreateSplatIV(ScalarIV, Step); 2471 return; 2472 } 2473 2474 // Determine if we want a scalar version of the induction variable. This is 2475 // true if the induction variable itself is not widened, or if it has at 2476 // least one user in the loop that is not widened. 2477 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2478 if (!NeedsScalarIV) { 2479 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2480 State); 2481 return; 2482 } 2483 2484 // Try to create a new independent vector induction variable. If we can't 2485 // create the phi node, we will splat the scalar induction variable in each 2486 // loop iteration. 2487 if (!shouldScalarizeInstruction(EntryVal)) { 2488 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2489 State); 2490 Value *ScalarIV = CreateScalarIV(Step); 2491 // Create scalar steps that can be used by instructions we will later 2492 // scalarize. Note that the addition of the scalar steps will not increase 2493 // the number of instructions in the loop in the common case prior to 2494 // InstCombine. We will be trading one vector extract for each scalar step. 2495 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2496 return; 2497 } 2498 2499 // All IV users are scalar instructions, so only emit a scalar IV, not a 2500 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2501 // predicate used by the masked loads/stores. 2502 Value *ScalarIV = CreateScalarIV(Step); 2503 if (!Cost->isScalarEpilogueAllowed()) 2504 CreateSplatIV(ScalarIV, Step); 2505 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2506 } 2507 2508 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2509 Instruction::BinaryOps BinOp) { 2510 // Create and check the types. 2511 auto *ValVTy = cast<VectorType>(Val->getType()); 2512 ElementCount VLen = ValVTy->getElementCount(); 2513 2514 Type *STy = Val->getType()->getScalarType(); 2515 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2516 "Induction Step must be an integer or FP"); 2517 assert(Step->getType() == STy && "Step has wrong type"); 2518 2519 SmallVector<Constant *, 8> Indices; 2520 2521 // Create a vector of consecutive numbers from zero to VF. 2522 VectorType *InitVecValVTy = ValVTy; 2523 Type *InitVecValSTy = STy; 2524 if (STy->isFloatingPointTy()) { 2525 InitVecValSTy = 2526 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2527 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2528 } 2529 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2530 2531 // Add on StartIdx 2532 Value *StartIdxSplat = Builder.CreateVectorSplat( 2533 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2534 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2535 2536 if (STy->isIntegerTy()) { 2537 Step = Builder.CreateVectorSplat(VLen, Step); 2538 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2539 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2540 // which can be found from the original scalar operations. 
2541 Step = Builder.CreateMul(InitVec, Step); 2542 return Builder.CreateAdd(Val, Step, "induction"); 2543 } 2544 2545 // Floating point induction. 2546 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2547 "Binary Opcode should be specified for FP induction"); 2548 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2549 Step = Builder.CreateVectorSplat(VLen, Step); 2550 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2551 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2552 } 2553 2554 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2555 Instruction *EntryVal, 2556 const InductionDescriptor &ID, 2557 VPValue *Def, VPValue *CastDef, 2558 VPTransformState &State) { 2559 // We shouldn't have to build scalar steps if we aren't vectorizing. 2560 assert(VF.isVector() && "VF should be greater than one"); 2561 // Get the value type and ensure it and the step have the same integer type. 2562 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2563 assert(ScalarIVTy == Step->getType() && 2564 "Val and Step should have the same type"); 2565 2566 // We build scalar steps for both integer and floating-point induction 2567 // variables. Here, we determine the kind of arithmetic we will perform. 2568 Instruction::BinaryOps AddOp; 2569 Instruction::BinaryOps MulOp; 2570 if (ScalarIVTy->isIntegerTy()) { 2571 AddOp = Instruction::Add; 2572 MulOp = Instruction::Mul; 2573 } else { 2574 AddOp = ID.getInductionOpcode(); 2575 MulOp = Instruction::FMul; 2576 } 2577 2578 // Determine the number of scalars we need to generate for each unroll 2579 // iteration. If EntryVal is uniform, we only need to generate the first 2580 // lane. Otherwise, we generate all VF values. 2581 bool IsUniform = 2582 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2583 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2584 // Compute the scalar steps and save the results in State. 2585 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2586 ScalarIVTy->getScalarSizeInBits()); 2587 Type *VecIVTy = nullptr; 2588 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2589 if (!IsUniform && VF.isScalable()) { 2590 VecIVTy = VectorType::get(ScalarIVTy, VF); 2591 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2592 SplatStep = Builder.CreateVectorSplat(VF, Step); 2593 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2594 } 2595 2596 for (unsigned Part = 0; Part < UF; ++Part) { 2597 Value *StartIdx0 = 2598 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2599 2600 if (!IsUniform && VF.isScalable()) { 2601 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2602 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2603 if (ScalarIVTy->isFloatingPointTy()) 2604 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2605 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2606 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2607 State.set(Def, Add, Part); 2608 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2609 Part); 2610 // It's useful to record the lane values too for the known minimum number 2611 // of elements so we do those below. This improves the code quality when 2612 // trying to extract the first element, for example. 
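// For example (illustrative): with VF = 4 and Part = 1, StartIdx0 above is 4
// (or 4 * vscale for a scalable VF), so each lane value recorded below is
// ScalarIV + (StartIdx0 + Lane) * Step for Lane = 0..3.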
2613 } 2614 2615 if (ScalarIVTy->isFloatingPointTy()) 2616 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2617 2618 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2619 Value *StartIdx = Builder.CreateBinOp( 2620 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2621 // The step returned by `createStepForVF` is a runtime-evaluated value 2622 // when VF is scalable. Otherwise, it should be folded into a Constant. 2623 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2624 "Expected StartIdx to be folded to a constant when VF is not " 2625 "scalable"); 2626 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2627 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2628 State.set(Def, Add, VPIteration(Part, Lane)); 2629 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2630 Part, Lane); 2631 } 2632 } 2633 } 2634 2635 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2636 const VPIteration &Instance, 2637 VPTransformState &State) { 2638 Value *ScalarInst = State.get(Def, Instance); 2639 Value *VectorValue = State.get(Def, Instance.Part); 2640 VectorValue = Builder.CreateInsertElement( 2641 VectorValue, ScalarInst, 2642 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2643 State.set(Def, VectorValue, Instance.Part); 2644 } 2645 2646 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2647 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2648 return Builder.CreateVectorReverse(Vec, "reverse"); 2649 } 2650 2651 // Return whether we allow using masked interleave-groups (for dealing with 2652 // strided loads/stores that reside in predicated blocks, or for dealing 2653 // with gaps). 2654 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2655 // If an override option has been passed in for interleaved accesses, use it. 2656 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2657 return EnableMaskedInterleavedMemAccesses; 2658 2659 return TTI.enableMaskedInterleavedAccessVectorization(); 2660 } 2661 2662 // Try to vectorize the interleave group that \p Instr belongs to. 2663 // 2664 // E.g. Translate following interleaved load group (factor = 3): 2665 // for (i = 0; i < N; i+=3) { 2666 // R = Pic[i]; // Member of index 0 2667 // G = Pic[i+1]; // Member of index 1 2668 // B = Pic[i+2]; // Member of index 2 2669 // ... // do something to R, G, B 2670 // } 2671 // To: 2672 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2673 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2674 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2675 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2676 // 2677 // Or translate following interleaved store group (factor = 3): 2678 // for (i = 0; i < N; i+=3) { 2679 // ... 
do something to R, G, B 2680 // Pic[i] = R; // Member of index 0 2681 // Pic[i+1] = G; // Member of index 1 2682 // Pic[i+2] = B; // Member of index 2 2683 // } 2684 // To: 2685 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2686 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2687 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2688 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2689 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2690 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2691 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2692 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2693 VPValue *BlockInMask) { 2694 Instruction *Instr = Group->getInsertPos(); 2695 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2696 2697 // Prepare for the vector type of the interleaved load/store. 2698 Type *ScalarTy = getMemInstValueType(Instr); 2699 unsigned InterleaveFactor = Group->getFactor(); 2700 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2701 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2702 2703 // Prepare for the new pointers. 2704 SmallVector<Value *, 2> AddrParts; 2705 unsigned Index = Group->getIndex(Instr); 2706 2707 // TODO: extend the masked interleaved-group support to reversed access. 2708 assert((!BlockInMask || !Group->isReverse()) && 2709 "Reversed masked interleave-group not supported."); 2710 2711 // If the group is reverse, adjust the index to refer to the last vector lane 2712 // instead of the first. We adjust the index from the first vector lane, 2713 // rather than directly getting the pointer for lane VF - 1, because the 2714 // pointer operand of the interleaved access is supposed to be uniform. For 2715 // uniform instructions, we're only required to generate a value for the 2716 // first vector lane in each unroll iteration. 2717 if (Group->isReverse()) 2718 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2719 2720 for (unsigned Part = 0; Part < UF; Part++) { 2721 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2722 setDebugLocFromInst(Builder, AddrPart); 2723 2724 // Notice current instruction could be any index. Need to adjust the address 2725 // to the member of index 0. 2726 // 2727 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2728 // b = A[i]; // Member of index 0 2729 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2730 // 2731 // E.g. A[i+1] = a; // Member of index 1 2732 // A[i] = b; // Member of index 0 2733 // A[i+2] = c; // Member of index 2 (Current instruction) 2734 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2735 2736 bool InBounds = false; 2737 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2738 InBounds = gep->isInBounds(); 2739 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2740 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2741 2742 // Cast to the vector pointer type. 
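    // (For instance, with illustrative sizes, an i32* becomes an <8 x i32>*
    // when VF is 4 and the interleave factor is 2; the original address
    // space is preserved below.)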
2743 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2744 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2745 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2746 } 2747 2748 setDebugLocFromInst(Builder, Instr); 2749 Value *PoisonVec = PoisonValue::get(VecTy); 2750 2751 Value *MaskForGaps = nullptr; 2752 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2753 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2754 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2755 } 2756 2757 // Vectorize the interleaved load group. 2758 if (isa<LoadInst>(Instr)) { 2759 // For each unroll part, create a wide load for the group. 2760 SmallVector<Value *, 2> NewLoads; 2761 for (unsigned Part = 0; Part < UF; Part++) { 2762 Instruction *NewLoad; 2763 if (BlockInMask || MaskForGaps) { 2764 assert(useMaskedInterleavedAccesses(*TTI) && 2765 "masked interleaved groups are not allowed."); 2766 Value *GroupMask = MaskForGaps; 2767 if (BlockInMask) { 2768 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2769 Value *ShuffledMask = Builder.CreateShuffleVector( 2770 BlockInMaskPart, 2771 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2772 "interleaved.mask"); 2773 GroupMask = MaskForGaps 2774 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2775 MaskForGaps) 2776 : ShuffledMask; 2777 } 2778 NewLoad = 2779 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2780 GroupMask, PoisonVec, "wide.masked.vec"); 2781 } 2782 else 2783 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2784 Group->getAlign(), "wide.vec"); 2785 Group->addMetadata(NewLoad); 2786 NewLoads.push_back(NewLoad); 2787 } 2788 2789 // For each member in the group, shuffle out the appropriate data from the 2790 // wide loads. 2791 unsigned J = 0; 2792 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2793 Instruction *Member = Group->getMember(I); 2794 2795 // Skip the gaps in the group. 2796 if (!Member) 2797 continue; 2798 2799 auto StrideMask = 2800 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2801 for (unsigned Part = 0; Part < UF; Part++) { 2802 Value *StridedVec = Builder.CreateShuffleVector( 2803 NewLoads[Part], StrideMask, "strided.vec"); 2804 2805 // If this member has different type, cast the result type. 2806 if (Member->getType() != ScalarTy) { 2807 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2808 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2809 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2810 } 2811 2812 if (Group->isReverse()) 2813 StridedVec = reverseVector(StridedVec); 2814 2815 State.set(VPDefs[J], StridedVec, Part); 2816 } 2817 ++J; 2818 } 2819 return; 2820 } 2821 2822 // The sub vector type for current instruction. 2823 auto *SubVT = VectorType::get(ScalarTy, VF); 2824 2825 // Vectorize the interleaved store group. 2826 for (unsigned Part = 0; Part < UF; Part++) { 2827 // Collect the stored vector from each member. 2828 SmallVector<Value *, 4> StoredVecs; 2829 for (unsigned i = 0; i < InterleaveFactor; i++) { 2830 // Interleaved store group doesn't allow a gap, so each index has a member 2831 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2832 2833 Value *StoredVec = State.get(StoredValues[i], Part); 2834 2835 if (Group->isReverse()) 2836 StoredVec = reverseVector(StoredVec); 2837 2838 // If this member has different type, cast it to a unified type. 
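      // For example (a sketch): if the group's insert position stores i32 but
      // this member stores float, the <VF x float> value is cast below to
      // <VF x i32> so that all members can be concatenated and interleaved.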
2839 2840 if (StoredVec->getType() != SubVT) 2841 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2842 2843 StoredVecs.push_back(StoredVec); 2844 } 2845 2846 // Concatenate all vectors into a wide vector. 2847 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2848 2849 // Interleave the elements in the wide vector. 2850 Value *IVec = Builder.CreateShuffleVector( 2851 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2852 "interleaved.vec"); 2853 2854 Instruction *NewStoreInstr; 2855 if (BlockInMask) { 2856 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2857 Value *ShuffledMask = Builder.CreateShuffleVector( 2858 BlockInMaskPart, 2859 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2860 "interleaved.mask"); 2861 NewStoreInstr = Builder.CreateMaskedStore( 2862 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2863 } 2864 else 2865 NewStoreInstr = 2866 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2867 2868 Group->addMetadata(NewStoreInstr); 2869 } 2870 } 2871 2872 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2873 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2874 VPValue *StoredValue, VPValue *BlockInMask) { 2875 // Attempt to issue a wide load. 2876 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2877 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2878 2879 assert((LI || SI) && "Invalid Load/Store instruction"); 2880 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2881 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2882 2883 LoopVectorizationCostModel::InstWidening Decision = 2884 Cost->getWideningDecision(Instr, VF); 2885 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2886 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2887 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2888 "CM decision is not to widen the memory instruction"); 2889 2890 Type *ScalarDataTy = getMemInstValueType(Instr); 2891 2892 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2893 const Align Alignment = getLoadStoreAlignment(Instr); 2894 2895 // Determine if the pointer operand of the access is either consecutive or 2896 // reverse consecutive. 2897 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2898 bool ConsecutiveStride = 2899 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2900 bool CreateGatherScatter = 2901 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2902 2903 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2904 // gather/scatter. Otherwise Decision should have been to Scalarize. 2905 assert((ConsecutiveStride || CreateGatherScatter) && 2906 "The instruction should be scalarized"); 2907 (void)ConsecutiveStride; 2908 2909 VectorParts BlockInMaskParts(UF); 2910 bool isMaskRequired = BlockInMask; 2911 if (isMaskRequired) 2912 for (unsigned Part = 0; Part < UF; ++Part) 2913 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2914 2915 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2916 // Calculate the pointer for the specific unroll-part. 2917 GetElementPtrInst *PartPtr = nullptr; 2918 2919 bool InBounds = false; 2920 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2921 InBounds = gep->isInBounds(); 2922 if (Reverse) { 2923 // If the address is consecutive but reversed, then the 2924 // wide store needs to start at the last vector element. 
2925 // RunTimeVF = VScale * VF.getKnownMinValue() 2926 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2927 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2928 // NumElt = -Part * RunTimeVF 2929 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2930 // LastLane = 1 - RunTimeVF 2931 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2932 PartPtr = 2933 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2934 PartPtr->setIsInBounds(InBounds); 2935 PartPtr = cast<GetElementPtrInst>( 2936 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2937 PartPtr->setIsInBounds(InBounds); 2938 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2939 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2940 } else { 2941 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2942 PartPtr = cast<GetElementPtrInst>( 2943 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2944 PartPtr->setIsInBounds(InBounds); 2945 } 2946 2947 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2948 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2949 }; 2950 2951 // Handle Stores: 2952 if (SI) { 2953 setDebugLocFromInst(Builder, SI); 2954 2955 for (unsigned Part = 0; Part < UF; ++Part) { 2956 Instruction *NewSI = nullptr; 2957 Value *StoredVal = State.get(StoredValue, Part); 2958 if (CreateGatherScatter) { 2959 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2960 Value *VectorGep = State.get(Addr, Part); 2961 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2962 MaskPart); 2963 } else { 2964 if (Reverse) { 2965 // If we store to reverse consecutive memory locations, then we need 2966 // to reverse the order of elements in the stored value. 2967 StoredVal = reverseVector(StoredVal); 2968 // We don't want to update the value in the map as it might be used in 2969 // another expression. So don't call resetVectorValue(StoredVal). 2970 } 2971 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2972 if (isMaskRequired) 2973 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2974 BlockInMaskParts[Part]); 2975 else 2976 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2977 } 2978 addMetadata(NewSI, SI); 2979 } 2980 return; 2981 } 2982 2983 // Handle loads. 2984 assert(LI && "Must have a load instruction"); 2985 setDebugLocFromInst(Builder, LI); 2986 for (unsigned Part = 0; Part < UF; ++Part) { 2987 Value *NewLI; 2988 if (CreateGatherScatter) { 2989 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2990 Value *VectorGep = State.get(Addr, Part); 2991 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2992 nullptr, "wide.masked.gather"); 2993 addMetadata(NewLI, LI); 2994 } else { 2995 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2996 if (isMaskRequired) 2997 NewLI = Builder.CreateMaskedLoad( 2998 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2999 "wide.masked.load"); 3000 else 3001 NewLI = 3002 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 3003 3004 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
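      // For a reverse access, the pointer built by CreateVecPtr above already
      // addresses the lowest element of this part, so all that remains below
      // is to flip the lane order, e.g. with a shuffle mask of <3, 2, 1, 0>
      // for a fixed VF of 4 (illustrative width).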
3005 addMetadata(NewLI, LI); 3006 if (Reverse) 3007 NewLI = reverseVector(NewLI); 3008 } 3009 3010 State.set(Def, NewLI, Part); 3011 } 3012 } 3013 3014 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3015 VPUser &User, 3016 const VPIteration &Instance, 3017 bool IfPredicateInstr, 3018 VPTransformState &State) { 3019 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3020 3021 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3022 // the first lane and part. 3023 if (isa<NoAliasScopeDeclInst>(Instr)) 3024 if (!Instance.isFirstIteration()) 3025 return; 3026 3027 setDebugLocFromInst(Builder, Instr); 3028 3029 // Does this instruction return a value ? 3030 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3031 3032 Instruction *Cloned = Instr->clone(); 3033 if (!IsVoidRetTy) 3034 Cloned->setName(Instr->getName() + ".cloned"); 3035 3036 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3037 Builder.GetInsertPoint()); 3038 // Replace the operands of the cloned instructions with their scalar 3039 // equivalents in the new loop. 3040 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3041 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3042 auto InputInstance = Instance; 3043 if (!Operand || !OrigLoop->contains(Operand) || 3044 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3045 InputInstance.Lane = VPLane::getFirstLane(); 3046 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3047 Cloned->setOperand(op, NewOp); 3048 } 3049 addNewMetadata(Cloned, Instr); 3050 3051 // Place the cloned scalar in the new loop. 3052 Builder.Insert(Cloned); 3053 3054 State.set(Def, Cloned, Instance); 3055 3056 // If we just cloned a new assumption, add it the assumption cache. 3057 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3058 AC->registerAssumption(II); 3059 3060 // End if-block. 3061 if (IfPredicateInstr) 3062 PredicatedInstructions.push_back(Cloned); 3063 } 3064 3065 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3066 Value *End, Value *Step, 3067 Instruction *DL) { 3068 BasicBlock *Header = L->getHeader(); 3069 BasicBlock *Latch = L->getLoopLatch(); 3070 // As we're just creating this loop, it's possible no latch exists 3071 // yet. If so, use the header as this will be a single block loop. 3072 if (!Latch) 3073 Latch = Header; 3074 3075 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3076 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3077 setDebugLocFromInst(Builder, OldInst); 3078 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3079 3080 Builder.SetInsertPoint(Latch->getTerminator()); 3081 setDebugLocFromInst(Builder, OldInst); 3082 3083 // Create i+1 and fill the PHINode. 3084 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3085 Induction->addIncoming(Start, L->getLoopPreheader()); 3086 Induction->addIncoming(Next, Latch); 3087 // Create the compare. 3088 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3089 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3090 3091 // Now we have two terminators. Remove the old one from the block. 
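  // Once the old terminator is erased below, the latch ends up looking
  // roughly like this (a sketch; the block names and the constant step of
  // 8 = VF * UF are illustrative):
  //   vector.body:
  //     %index = phi i64 [ %start, %vector.ph ], [ %index.next, %vector.body ]
  //     ...
  //     %index.next = add i64 %index, 8
  //     %cmp = icmp eq i64 %index.next, %end
  //     br i1 %cmp, label %exit, label %vector.body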
3092 Latch->getTerminator()->eraseFromParent(); 3093 3094 return Induction; 3095 } 3096 3097 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3098 if (TripCount) 3099 return TripCount; 3100 3101 assert(L && "Create Trip Count for null loop."); 3102 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3103 // Find the loop boundaries. 3104 ScalarEvolution *SE = PSE.getSE(); 3105 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3106 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3107 "Invalid loop count"); 3108 3109 Type *IdxTy = Legal->getWidestInductionType(); 3110 assert(IdxTy && "No type for induction"); 3111 3112 // The exit count might have the type of i64 while the phi is i32. This can 3113 // happen if we have an induction variable that is sign extended before the 3114 // compare. The only way that we get a backedge taken count is that the 3115 // induction variable was signed and as such will not overflow. In such a case 3116 // truncation is legal. 3117 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3118 IdxTy->getPrimitiveSizeInBits()) 3119 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3120 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3121 3122 // Get the total trip count from the count by adding 1. 3123 const SCEV *ExitCount = SE->getAddExpr( 3124 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3125 3126 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3127 3128 // Expand the trip count and place the new instructions in the preheader. 3129 // Notice that the pre-header does not change, only the loop body. 3130 SCEVExpander Exp(*SE, DL, "induction"); 3131 3132 // Count holds the overall loop count (N). 3133 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3134 L->getLoopPreheader()->getTerminator()); 3135 3136 if (TripCount->getType()->isPointerTy()) 3137 TripCount = 3138 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3139 L->getLoopPreheader()->getTerminator()); 3140 3141 return TripCount; 3142 } 3143 3144 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3145 if (VectorTripCount) 3146 return VectorTripCount; 3147 3148 Value *TC = getOrCreateTripCount(L); 3149 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3150 3151 Type *Ty = TC->getType(); 3152 // This is where we can make the step a runtime constant. 3153 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3154 3155 // If the tail is to be folded by masking, round the number of iterations N 3156 // up to a multiple of Step instead of rounding down. This is done by first 3157 // adding Step-1 and then rounding down. Note that it's ok if this addition 3158 // overflows: the vector induction variable will eventually wrap to zero given 3159 // that it starts at zero and its Step is a power of two; the loop will then 3160 // exit, with the last early-exit vector comparison also producing all-true. 3161 if (Cost->foldTailByMasking()) { 3162 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3163 "VF*UF must be a power of 2 when folding tail by masking"); 3164 assert(!VF.isScalable() && 3165 "Tail folding not yet supported for scalable vectors"); 3166 TC = Builder.CreateAdd( 3167 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3168 } 3169 3170 // Now we need to generate the expression for the part of the loop that the 3171 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3172 // iterations are not required for correctness, or N - Step, otherwise. Step 3173 // is equal to the vectorization factor (number of SIMD elements) times the 3174 // unroll factor (number of SIMD instructions). 3175 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3176 3177 // There are two cases where we need to ensure (at least) the last iteration 3178 // runs in the scalar remainder loop. Thus, if the step evenly divides 3179 // the trip count, we set the remainder to be equal to the step. If the step 3180 // does not evenly divide the trip count, no adjustment is necessary since 3181 // there will already be scalar iterations. Note that the minimum iterations 3182 // check ensures that N >= Step. The cases are: 3183 // 1) If there is a non-reversed interleaved group that may speculatively 3184 // access memory out-of-bounds. 3185 // 2) If any instruction may follow a conditionally taken exit. That is, if 3186 // the loop contains multiple exiting blocks, or a single exiting block 3187 // which is not the latch. 3188 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3189 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3190 R = Builder.CreateSelect(IsZero, Step, R); 3191 } 3192 3193 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3194 3195 return VectorTripCount; 3196 } 3197 3198 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3199 const DataLayout &DL) { 3200 // Verify that V is a vector type with same number of elements as DstVTy. 3201 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3202 unsigned VF = DstFVTy->getNumElements(); 3203 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3204 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3205 Type *SrcElemTy = SrcVecTy->getElementType(); 3206 Type *DstElemTy = DstFVTy->getElementType(); 3207 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3208 "Vector elements must have same size"); 3209 3210 // Do a direct cast if element types are castable. 3211 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3212 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3213 } 3214 // V cannot be directly casted to desired vector type. 3215 // May happen when V is a floating point vector but DstVTy is a vector of 3216 // pointers or vice-versa. Handle this using a two-step bitcast using an 3217 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3218 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3219 "Only one type should be a pointer type"); 3220 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3221 "Only one type should be a floating point type"); 3222 Type *IntTy = 3223 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3224 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3225 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3226 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3227 } 3228 3229 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3230 BasicBlock *Bypass) { 3231 Value *Count = getOrCreateTripCount(L); 3232 // Reuse existing vector loop preheader for TC checks. 3233 // Note that new preheader block is generated for vector loop. 
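  // As an illustration (factors are made up): with VF = 4 and UF = 2 the
  // branch created below sends execution to the scalar loop whenever the
  // trip count is less than 8 (or less than or equal to 8 when a scalar
  // epilogue is required), since the vector body could not complete even a
  // single iteration; when the tail is folded by masking the check is simply
  // 'false'.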
3234 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3235 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3236 3237 // Generate code to check if the loop's trip count is less than VF * UF, or 3238 // equal to it in case a scalar epilogue is required; this implies that the 3239 // vector trip count is zero. This check also covers the case where adding one 3240 // to the backedge-taken count overflowed leading to an incorrect trip count 3241 // of zero. In this case we will also jump to the scalar loop. 3242 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3243 : ICmpInst::ICMP_ULT; 3244 3245 // If tail is to be folded, vector loop takes care of all iterations. 3246 Value *CheckMinIters = Builder.getFalse(); 3247 if (!Cost->foldTailByMasking()) { 3248 Value *Step = 3249 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3250 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3251 } 3252 // Create new preheader for vector loop. 3253 LoopVectorPreHeader = 3254 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3255 "vector.ph"); 3256 3257 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3258 DT->getNode(Bypass)->getIDom()) && 3259 "TC check is expected to dominate Bypass"); 3260 3261 // Update dominator for Bypass & LoopExit. 3262 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3263 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3264 3265 ReplaceInstWithInst( 3266 TCCheckBlock->getTerminator(), 3267 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3268 LoopBypassBlocks.push_back(TCCheckBlock); 3269 } 3270 3271 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3272 3273 BasicBlock *const SCEVCheckBlock = 3274 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3275 if (!SCEVCheckBlock) 3276 return nullptr; 3277 3278 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3279 (OptForSizeBasedOnProfile && 3280 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3281 "Cannot SCEV check stride or overflow when optimizing for size"); 3282 3283 3284 // Update dominator only if this is first RT check. 3285 if (LoopBypassBlocks.empty()) { 3286 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3287 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3288 } 3289 3290 LoopBypassBlocks.push_back(SCEVCheckBlock); 3291 AddedSafetyChecks = true; 3292 return SCEVCheckBlock; 3293 } 3294 3295 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3296 BasicBlock *Bypass) { 3297 // VPlan-native path does not do any analysis for runtime checks currently. 3298 if (EnableVPlanNativePath) 3299 return nullptr; 3300 3301 BasicBlock *const MemCheckBlock = 3302 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3303 3304 // Check if we generated code that checks in runtime if arrays overlap. We put 3305 // the checks into a separate block to make the more common case of few 3306 // elements faster. 
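  // A typical emitted check (a sketch rather than the exact codegen) for a
  // loop reading a[0..N) and writing b[0..N) compares the two accessed
  // ranges, roughly (a + N <= b) || (b + N <= a), and branches to the scalar
  // loop whenever the ranges may overlap.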
3307 if (!MemCheckBlock) 3308 return nullptr; 3309 3310 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3311 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3312 "Cannot emit memory checks when optimizing for size, unless forced " 3313 "to vectorize."); 3314 ORE->emit([&]() { 3315 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3316 L->getStartLoc(), L->getHeader()) 3317 << "Code-size may be reduced by not forcing " 3318 "vectorization, or by source-code modifications " 3319 "eliminating the need for runtime checks " 3320 "(e.g., adding 'restrict')."; 3321 }); 3322 } 3323 3324 LoopBypassBlocks.push_back(MemCheckBlock); 3325 3326 AddedSafetyChecks = true; 3327 3328 // We currently don't use LoopVersioning for the actual loop cloning but we 3329 // still use it to add the noalias metadata. 3330 LVer = std::make_unique<LoopVersioning>( 3331 *Legal->getLAI(), 3332 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3333 DT, PSE.getSE()); 3334 LVer->prepareNoAliasMetadata(); 3335 return MemCheckBlock; 3336 } 3337 3338 Value *InnerLoopVectorizer::emitTransformedIndex( 3339 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3340 const InductionDescriptor &ID) const { 3341 3342 SCEVExpander Exp(*SE, DL, "induction"); 3343 auto Step = ID.getStep(); 3344 auto StartValue = ID.getStartValue(); 3345 assert(Index->getType()->getScalarType() == Step->getType() && 3346 "Index scalar type does not match StepValue type"); 3347 3348 // Note: the IR at this point is broken. We cannot use SE to create any new 3349 // SCEV and then expand it, hoping that SCEV's simplification will give us 3350 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3351 // lead to various SCEV crashes. So all we can do is to use builder and rely 3352 // on InstCombine for future simplifications. Here we handle some trivial 3353 // cases only. 3354 auto CreateAdd = [&B](Value *X, Value *Y) { 3355 assert(X->getType() == Y->getType() && "Types don't match!"); 3356 if (auto *CX = dyn_cast<ConstantInt>(X)) 3357 if (CX->isZero()) 3358 return Y; 3359 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3360 if (CY->isZero()) 3361 return X; 3362 return B.CreateAdd(X, Y); 3363 }; 3364 3365 // We allow X to be a vector type, in which case Y will potentially be 3366 // splatted into a vector with the same element count. 3367 auto CreateMul = [&B](Value *X, Value *Y) { 3368 assert(X->getType()->getScalarType() == Y->getType() && 3369 "Types don't match!"); 3370 if (auto *CX = dyn_cast<ConstantInt>(X)) 3371 if (CX->isOne()) 3372 return Y; 3373 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3374 if (CY->isOne()) 3375 return X; 3376 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3377 if (XVTy && !isa<VectorType>(Y->getType())) 3378 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3379 return B.CreateMul(X, Y); 3380 }; 3381 3382 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3383 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3384 // the DomTree is not kept up-to-date for additional blocks generated in the 3385 // vector loop. By using the header as insertion point, we guarantee that the 3386 // expanded instructions dominate all their uses. 
3387 auto GetInsertPoint = [this, &B]() { 3388 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3389 if (InsertBB != LoopVectorBody && 3390 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3391 return LoopVectorBody->getTerminator(); 3392 return &*B.GetInsertPoint(); 3393 }; 3394 3395 switch (ID.getKind()) { 3396 case InductionDescriptor::IK_IntInduction: { 3397 assert(!isa<VectorType>(Index->getType()) && 3398 "Vector indices not supported for integer inductions yet"); 3399 assert(Index->getType() == StartValue->getType() && 3400 "Index type does not match StartValue type"); 3401 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3402 return B.CreateSub(StartValue, Index); 3403 auto *Offset = CreateMul( 3404 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3405 return CreateAdd(StartValue, Offset); 3406 } 3407 case InductionDescriptor::IK_PtrInduction: { 3408 assert(isa<SCEVConstant>(Step) && 3409 "Expected constant step for pointer induction"); 3410 return B.CreateGEP( 3411 StartValue->getType()->getPointerElementType(), StartValue, 3412 CreateMul(Index, 3413 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3414 GetInsertPoint()))); 3415 } 3416 case InductionDescriptor::IK_FpInduction: { 3417 assert(!isa<VectorType>(Index->getType()) && 3418 "Vector indices not supported for FP inductions yet"); 3419 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3420 auto InductionBinOp = ID.getInductionBinOp(); 3421 assert(InductionBinOp && 3422 (InductionBinOp->getOpcode() == Instruction::FAdd || 3423 InductionBinOp->getOpcode() == Instruction::FSub) && 3424 "Original bin op should be defined for FP induction"); 3425 3426 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3427 Value *MulExp = B.CreateFMul(StepValue, Index); 3428 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3429 "induction"); 3430 } 3431 case InductionDescriptor::IK_NoInduction: 3432 return nullptr; 3433 } 3434 llvm_unreachable("invalid enum"); 3435 } 3436 3437 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3438 LoopScalarBody = OrigLoop->getHeader(); 3439 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3440 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3441 assert(LoopExitBlock && "Must have an exit block"); 3442 assert(LoopVectorPreHeader && "Invalid loop structure"); 3443 3444 LoopMiddleBlock = 3445 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3446 LI, nullptr, Twine(Prefix) + "middle.block"); 3447 LoopScalarPreHeader = 3448 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3449 nullptr, Twine(Prefix) + "scalar.ph"); 3450 3451 // Set up branch from middle block to the exit and scalar preheader blocks. 3452 // completeLoopSkeleton will update the condition to use an iteration check, 3453 // if required to decide whether to execute the remainder. 3454 BranchInst *BrInst = 3455 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3456 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3457 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3458 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3459 3460 // We intentionally don't let SplitBlock to update LoopInfo since 3461 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3462 // LoopVectorBody is explicitly added to the correct place few lines later. 
3463 LoopVectorBody = 3464 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3465 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3466 3467 // Update dominator for loop exit. 3468 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3469 3470 // Create and register the new vector loop. 3471 Loop *Lp = LI->AllocateLoop(); 3472 Loop *ParentLoop = OrigLoop->getParentLoop(); 3473 3474 // Insert the new loop into the loop nest and register the new basic blocks 3475 // before calling any utilities such as SCEV that require valid LoopInfo. 3476 if (ParentLoop) { 3477 ParentLoop->addChildLoop(Lp); 3478 } else { 3479 LI->addTopLevelLoop(Lp); 3480 } 3481 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3482 return Lp; 3483 } 3484 3485 void InnerLoopVectorizer::createInductionResumeValues( 3486 Loop *L, Value *VectorTripCount, 3487 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3488 assert(VectorTripCount && L && "Expected valid arguments"); 3489 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3490 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3491 "Inconsistent information about additional bypass."); 3492 // We are going to resume the execution of the scalar loop. 3493 // Go over all of the induction variables that we found and fix the 3494 // PHIs that are left in the scalar version of the loop. 3495 // The starting values of PHI nodes depend on the counter of the last 3496 // iteration in the vectorized loop. 3497 // If we come from a bypass edge then we need to start from the original 3498 // start value. 3499 for (auto &InductionEntry : Legal->getInductionVars()) { 3500 PHINode *OrigPhi = InductionEntry.first; 3501 InductionDescriptor II = InductionEntry.second; 3502 3503 // Create phi nodes to merge from the backedge-taken check block. 3504 PHINode *BCResumeVal = 3505 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3506 LoopScalarPreHeader->getTerminator()); 3507 // Copy original phi DL over to the new one. 3508 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3509 Value *&EndValue = IVEndValues[OrigPhi]; 3510 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3511 if (OrigPhi == OldInduction) { 3512 // We know what the end value is. 3513 EndValue = VectorTripCount; 3514 } else { 3515 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3516 3517 // Fast-math-flags propagate from the original induction instruction. 3518 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3519 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3520 3521 Type *StepType = II.getStep()->getType(); 3522 Instruction::CastOps CastOp = 3523 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3524 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3525 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3526 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3527 EndValue->setName("ind.end"); 3528 3529 // Compute the end value for the additional bypass (if applicable). 
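      // (As an illustration with made-up values: an integer induction with
      // start value 5 and step 3 receives the resume value 5 + 3 * n.vec when
      // the scalar loop is entered from the middle block, while entries from
      // the bypass blocks keep the original start value 5.)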
3530 if (AdditionalBypass.first) { 3531 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3532 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3533 StepType, true); 3534 CRD = 3535 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3536 EndValueFromAdditionalBypass = 3537 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3538 EndValueFromAdditionalBypass->setName("ind.end"); 3539 } 3540 } 3541 // The new PHI merges the original incoming value, in case of a bypass, 3542 // or the value at the end of the vectorized loop. 3543 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3544 3545 // Fix the scalar body counter (PHI node). 3546 // The old induction's phi node in the scalar body needs the truncated 3547 // value. 3548 for (BasicBlock *BB : LoopBypassBlocks) 3549 BCResumeVal->addIncoming(II.getStartValue(), BB); 3550 3551 if (AdditionalBypass.first) 3552 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3553 EndValueFromAdditionalBypass); 3554 3555 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3556 } 3557 } 3558 3559 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3560 MDNode *OrigLoopID) { 3561 assert(L && "Expected valid loop."); 3562 3563 // The trip counts should be cached by now. 3564 Value *Count = getOrCreateTripCount(L); 3565 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3566 3567 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3568 3569 // Add a check in the middle block to see if we have completed 3570 // all of the iterations in the first vector loop. 3571 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3572 // If tail is to be folded, we know we don't need to run the remainder. 3573 if (!Cost->foldTailByMasking()) { 3574 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3575 Count, VectorTripCount, "cmp.n", 3576 LoopMiddleBlock->getTerminator()); 3577 3578 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3579 // of the corresponding compare because they may have ended up with 3580 // different line numbers and we want to avoid awkward line stepping while 3581 // debugging. Eg. if the compare has got a line number inside the loop. 3582 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3583 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3584 } 3585 3586 // Get ready to start creating new instructions into the vectorized body. 3587 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3588 "Inconsistent vector loop preheader"); 3589 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3590 3591 Optional<MDNode *> VectorizedLoopID = 3592 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3593 LLVMLoopVectorizeFollowupVectorized}); 3594 if (VectorizedLoopID.hasValue()) { 3595 L->setLoopID(VectorizedLoopID.getValue()); 3596 3597 // Do not setAlreadyVectorized if loop attributes have been defined 3598 // explicitly. 3599 return LoopVectorPreHeader; 3600 } 3601 3602 // Keep all loop hints from the original loop on the vector loop (we'll 3603 // replace the vectorizer-specific hints below). 
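  // (For instance, setAlreadyVectorized() below attaches the
  // llvm.loop.isvectorized marker so that later runs of the vectorizer leave
  // this loop alone.)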
3604 if (MDNode *LID = OrigLoop->getLoopID()) 3605 L->setLoopID(LID); 3606 3607 LoopVectorizeHints Hints(L, true, *ORE); 3608 Hints.setAlreadyVectorized(); 3609 3610 #ifdef EXPENSIVE_CHECKS 3611 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3612 LI->verify(*DT); 3613 #endif 3614 3615 return LoopVectorPreHeader; 3616 } 3617 3618 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3619 /* 3620 In this function we generate a new loop. The new loop will contain 3621 the vectorized instructions while the old loop will continue to run the 3622 scalar remainder. 3623 3624 [ ] <-- loop iteration number check. 3625 / | 3626 / v 3627 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3628 | / | 3629 | / v 3630 || [ ] <-- vector pre header. 3631 |/ | 3632 | v 3633 | [ ] \ 3634 | [ ]_| <-- vector loop. 3635 | | 3636 | v 3637 | -[ ] <--- middle-block. 3638 | / | 3639 | / v 3640 -|- >[ ] <--- new preheader. 3641 | | 3642 | v 3643 | [ ] \ 3644 | [ ]_| <-- old scalar loop to handle remainder. 3645 \ | 3646 \ v 3647 >[ ] <-- exit block. 3648 ... 3649 */ 3650 3651 // Get the metadata of the original loop before it gets modified. 3652 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3653 3654 // Workaround! Compute the trip count of the original loop and cache it 3655 // before we start modifying the CFG. This code has a systemic problem 3656 // wherein it tries to run analysis over partially constructed IR; this is 3657 // wrong, and not simply for SCEV. The trip count of the original loop 3658 // simply happens to be prone to hitting this in practice. In theory, we 3659 // can hit the same issue for any SCEV, or ValueTracking query done during 3660 // mutation. See PR49900. 3661 getOrCreateTripCount(OrigLoop); 3662 3663 // Create an empty vector loop, and prepare basic blocks for the runtime 3664 // checks. 3665 Loop *Lp = createVectorLoopSkeleton(""); 3666 3667 // Now, compare the new count to zero. If it is zero skip the vector loop and 3668 // jump to the scalar loop. This check also covers the case where the 3669 // backedge-taken count is uint##_max: adding one to it will overflow leading 3670 // to an incorrect trip count of zero. In this (rare) case we will also jump 3671 // to the scalar loop. 3672 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3673 3674 // Generate the code to check any assumptions that we've made for SCEV 3675 // expressions. 3676 emitSCEVChecks(Lp, LoopScalarPreHeader); 3677 3678 // Generate the code that checks in runtime if arrays overlap. We put the 3679 // checks into a separate block to make the more common case of few elements 3680 // faster. 3681 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3682 3683 // Some loops have a single integer induction variable, while other loops 3684 // don't. One example is c++ iterators that often have multiple pointer 3685 // induction variables. In the code below we also support a case where we 3686 // don't have a single induction variable. 3687 // 3688 // We try to obtain an induction variable from the original loop as hard 3689 // as possible. However if we don't find one that: 3690 // - is an integer 3691 // - counts from zero, stepping by one 3692 // - is the size of the widest induction variable type 3693 // then we create a new one. 
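  // For example (a sketch): if the original loop counts an i16 variable
  // upwards from 7 while the widest induction type is i64, the code below
  // still builds a fresh i64 counter that starts at 0 and steps by VF * UF,
  // independently of the original induction.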
3694 OldInduction = Legal->getPrimaryInduction(); 3695 Type *IdxTy = Legal->getWidestInductionType(); 3696 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3697 // The loop step is equal to the vectorization factor (num of SIMD elements) 3698 // times the unroll factor (num of SIMD instructions). 3699 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3700 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3701 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3702 Induction = 3703 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3704 getDebugLocFromInstOrOperands(OldInduction)); 3705 3706 // Emit phis for the new starting index of the scalar loop. 3707 createInductionResumeValues(Lp, CountRoundDown); 3708 3709 return completeLoopSkeleton(Lp, OrigLoopID); 3710 } 3711 3712 // Fix up external users of the induction variable. At this point, we are 3713 // in LCSSA form, with all external PHIs that use the IV having one input value, 3714 // coming from the remainder loop. We need those PHIs to also have a correct 3715 // value for the IV when arriving directly from the middle block. 3716 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3717 const InductionDescriptor &II, 3718 Value *CountRoundDown, Value *EndValue, 3719 BasicBlock *MiddleBlock) { 3720 // There are two kinds of external IV usages - those that use the value 3721 // computed in the last iteration (the PHI) and those that use the penultimate 3722 // value (the value that feeds into the phi from the loop latch). 3723 // We allow both, but they, obviously, have different values. 3724 3725 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3726 3727 DenseMap<Value *, Value *> MissingVals; 3728 3729 // An external user of the last iteration's value should see the value that 3730 // the remainder loop uses to initialize its own IV. 3731 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3732 for (User *U : PostInc->users()) { 3733 Instruction *UI = cast<Instruction>(U); 3734 if (!OrigLoop->contains(UI)) { 3735 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3736 MissingVals[UI] = EndValue; 3737 } 3738 } 3739 3740 // An external user of the penultimate value need to see EndValue - Step. 3741 // The simplest way to get this is to recompute it from the constituent SCEVs, 3742 // that is Start + (Step * (CRD - 1)). 3743 for (User *U : OrigPhi->users()) { 3744 auto *UI = cast<Instruction>(U); 3745 if (!OrigLoop->contains(UI)) { 3746 const DataLayout &DL = 3747 OrigLoop->getHeader()->getModule()->getDataLayout(); 3748 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3749 3750 IRBuilder<> B(MiddleBlock->getTerminator()); 3751 3752 // Fast-math-flags propagate from the original induction instruction. 3753 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3754 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3755 3756 Value *CountMinusOne = B.CreateSub( 3757 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3758 Value *CMO = 3759 !II.getStep()->getType()->isIntegerTy() 3760 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3761 II.getStep()->getType()) 3762 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3763 CMO->setName("cast.cmo"); 3764 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3765 Escape->setName("ind.escape"); 3766 MissingVals[UI] = Escape; 3767 } 3768 } 3769 3770 for (auto &I : MissingVals) { 3771 PHINode *PHI = cast<PHINode>(I.first); 3772 // One corner case we have to handle is two IVs "chasing" each-other, 3773 // that is %IV2 = phi [...], [ %IV1, %latch ] 3774 // In this case, if IV1 has an external use, we need to avoid adding both 3775 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3776 // don't already have an incoming value for the middle block. 3777 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3778 PHI->addIncoming(I.second, MiddleBlock); 3779 } 3780 } 3781 3782 namespace { 3783 3784 struct CSEDenseMapInfo { 3785 static bool canHandle(const Instruction *I) { 3786 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3787 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3788 } 3789 3790 static inline Instruction *getEmptyKey() { 3791 return DenseMapInfo<Instruction *>::getEmptyKey(); 3792 } 3793 3794 static inline Instruction *getTombstoneKey() { 3795 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3796 } 3797 3798 static unsigned getHashValue(const Instruction *I) { 3799 assert(canHandle(I) && "Unknown instruction!"); 3800 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3801 I->value_op_end())); 3802 } 3803 3804 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3805 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3806 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3807 return LHS == RHS; 3808 return LHS->isIdenticalTo(RHS); 3809 } 3810 }; 3811 3812 } // end anonymous namespace 3813 3814 ///Perform cse of induction variable instructions. 3815 static void cse(BasicBlock *BB) { 3816 // Perform simple cse. 3817 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3818 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3819 Instruction *In = &*I++; 3820 3821 if (!CSEDenseMapInfo::canHandle(In)) 3822 continue; 3823 3824 // Check if we can replace this instruction with any of the 3825 // visited instructions. 3826 if (Instruction *V = CSEMap.lookup(In)) { 3827 In->replaceAllUsesWith(V); 3828 In->eraseFromParent(); 3829 continue; 3830 } 3831 3832 CSEMap[In] = In; 3833 } 3834 } 3835 3836 InstructionCost 3837 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3838 bool &NeedToScalarize) const { 3839 Function *F = CI->getCalledFunction(); 3840 Type *ScalarRetTy = CI->getType(); 3841 SmallVector<Type *, 4> Tys, ScalarTys; 3842 for (auto &ArgOp : CI->arg_operands()) 3843 ScalarTys.push_back(ArgOp->getType()); 3844 3845 // Estimate cost of scalarized vector call. The source operands are assumed 3846 // to be vectors, so we need to extract individual elements from there, 3847 // execute VF scalar calls, and then gather the result into the vector return 3848 // value. 3849 InstructionCost ScalarCallCost = 3850 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3851 if (VF.isScalar()) 3852 return ScalarCallCost; 3853 3854 // Compute corresponding vector type for return value and arguments. 
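  // As a worked example with made-up numbers: for VF = 4, a scalar call cost
  // of 10 and a scalarization overhead of 12, the scalarized cost computed
  // below is 4 * 10 + 12 = 52; if the target also provides a vector variant
  // costing 20, that cheaper cost is returned and NeedToScalarize is cleared.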
3855 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3856 for (Type *ScalarTy : ScalarTys) 3857 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3858 3859 // Compute costs of unpacking argument values for the scalar calls and 3860 // packing the return values to a vector. 3861 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3862 3863 InstructionCost Cost = 3864 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3865 3866 // If we can't emit a vector call for this function, then the currently found 3867 // cost is the cost we need to return. 3868 NeedToScalarize = true; 3869 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3870 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3871 3872 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3873 return Cost; 3874 3875 // If the corresponding vector cost is cheaper, return its cost. 3876 InstructionCost VectorCallCost = 3877 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3878 if (VectorCallCost < Cost) { 3879 NeedToScalarize = false; 3880 Cost = VectorCallCost; 3881 } 3882 return Cost; 3883 } 3884 3885 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3886 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3887 return Elt; 3888 return VectorType::get(Elt, VF); 3889 } 3890 3891 InstructionCost 3892 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3893 ElementCount VF) const { 3894 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3895 assert(ID && "Expected intrinsic call!"); 3896 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3897 FastMathFlags FMF; 3898 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3899 FMF = FPMO->getFastMathFlags(); 3900 3901 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3902 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3903 SmallVector<Type *> ParamTys; 3904 std::transform(FTy->param_begin(), FTy->param_end(), 3905 std::back_inserter(ParamTys), 3906 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3907 3908 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3909 dyn_cast<IntrinsicInst>(CI)); 3910 return TTI.getIntrinsicInstrCost(CostAttrs, 3911 TargetTransformInfo::TCK_RecipThroughput); 3912 } 3913 3914 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3915 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3916 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3917 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3918 } 3919 3920 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3921 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3922 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3923 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3924 } 3925 3926 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3927 // For every instruction `I` in MinBWs, truncate the operands, create a 3928 // truncated version of `I` and reextend its result. InstCombine runs 3929 // later and will remove any ext/trunc pairs. 3930 SmallPtrSet<Value *, 4> Erased; 3931 for (const auto &KV : Cost->getMinimalBitwidths()) { 3932 // If the value wasn't vectorized, we must maintain the original scalar 3933 // type. The absence of the value from State indicates that it 3934 // wasn't vectorized. 
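    // As a sketch of the rewrite performed below: if the cost model proved
    // that an i32 operation only ever needs 8 bits, a <4 x i32> add is
    // replaced by a trunc to <4 x i8>, an 8-bit add and a zext back to
    // <4 x i32> (widths are illustrative); InstCombine later removes the
    // redundant ext/trunc pairs.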
3935 VPValue *Def = State.Plan->getVPValue(KV.first); 3936 if (!State.hasAnyVectorValue(Def)) 3937 continue; 3938 for (unsigned Part = 0; Part < UF; ++Part) { 3939 Value *I = State.get(Def, Part); 3940 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3941 continue; 3942 Type *OriginalTy = I->getType(); 3943 Type *ScalarTruncatedTy = 3944 IntegerType::get(OriginalTy->getContext(), KV.second); 3945 auto *TruncatedTy = FixedVectorType::get( 3946 ScalarTruncatedTy, 3947 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3948 if (TruncatedTy == OriginalTy) 3949 continue; 3950 3951 IRBuilder<> B(cast<Instruction>(I)); 3952 auto ShrinkOperand = [&](Value *V) -> Value * { 3953 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3954 if (ZI->getSrcTy() == TruncatedTy) 3955 return ZI->getOperand(0); 3956 return B.CreateZExtOrTrunc(V, TruncatedTy); 3957 }; 3958 3959 // The actual instruction modification depends on the instruction type, 3960 // unfortunately. 3961 Value *NewI = nullptr; 3962 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3963 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3964 ShrinkOperand(BO->getOperand(1))); 3965 3966 // Any wrapping introduced by shrinking this operation shouldn't be 3967 // considered undefined behavior. So, we can't unconditionally copy 3968 // arithmetic wrapping flags to NewI. 3969 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3970 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3971 NewI = 3972 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3973 ShrinkOperand(CI->getOperand(1))); 3974 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3975 NewI = B.CreateSelect(SI->getCondition(), 3976 ShrinkOperand(SI->getTrueValue()), 3977 ShrinkOperand(SI->getFalseValue())); 3978 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3979 switch (CI->getOpcode()) { 3980 default: 3981 llvm_unreachable("Unhandled cast!"); 3982 case Instruction::Trunc: 3983 NewI = ShrinkOperand(CI->getOperand(0)); 3984 break; 3985 case Instruction::SExt: 3986 NewI = B.CreateSExtOrTrunc( 3987 CI->getOperand(0), 3988 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3989 break; 3990 case Instruction::ZExt: 3991 NewI = B.CreateZExtOrTrunc( 3992 CI->getOperand(0), 3993 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3994 break; 3995 } 3996 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3997 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3998 ->getNumElements(); 3999 auto *O0 = B.CreateZExtOrTrunc( 4000 SI->getOperand(0), 4001 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 4002 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 4003 ->getNumElements(); 4004 auto *O1 = B.CreateZExtOrTrunc( 4005 SI->getOperand(1), 4006 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 4007 4008 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4009 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4010 // Don't do anything with the operands, just extend the result. 
4011 continue; 4012 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4013 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 4014 ->getNumElements(); 4015 auto *O0 = B.CreateZExtOrTrunc( 4016 IE->getOperand(0), 4017 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4018 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4019 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4020 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4021 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 4022 ->getNumElements(); 4023 auto *O0 = B.CreateZExtOrTrunc( 4024 EE->getOperand(0), 4025 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4026 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4027 } else { 4028 // If we don't know what to do, be conservative and don't do anything. 4029 continue; 4030 } 4031 4032 // Lastly, extend the result. 4033 NewI->takeName(cast<Instruction>(I)); 4034 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4035 I->replaceAllUsesWith(Res); 4036 cast<Instruction>(I)->eraseFromParent(); 4037 Erased.insert(I); 4038 State.reset(Def, Res, Part); 4039 } 4040 } 4041 4042 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4043 for (const auto &KV : Cost->getMinimalBitwidths()) { 4044 // If the value wasn't vectorized, we must maintain the original scalar 4045 // type. The absence of the value from State indicates that it 4046 // wasn't vectorized. 4047 VPValue *Def = State.Plan->getVPValue(KV.first); 4048 if (!State.hasAnyVectorValue(Def)) 4049 continue; 4050 for (unsigned Part = 0; Part < UF; ++Part) { 4051 Value *I = State.get(Def, Part); 4052 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4053 if (Inst && Inst->use_empty()) { 4054 Value *NewI = Inst->getOperand(0); 4055 Inst->eraseFromParent(); 4056 State.reset(Def, NewI, Part); 4057 } 4058 } 4059 } 4060 } 4061 4062 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4063 // Insert truncates and extends for any truncated instructions as hints to 4064 // InstCombine. 4065 if (VF.isVector()) 4066 truncateToMinimalBitwidths(State); 4067 4068 // Fix widened non-induction PHIs by setting up the PHI operands. 4069 if (OrigPHIsToFix.size()) { 4070 assert(EnableVPlanNativePath && 4071 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4072 fixNonInductionPHIs(State); 4073 } 4074 4075 // At this point every instruction in the original loop is widened to a 4076 // vector form. Now we need to fix the recurrences in the loop. These PHI 4077 // nodes are currently empty because we did not want to introduce cycles. 4078 // This is the second stage of vectorizing recurrences. 4079 fixCrossIterationPHIs(State); 4080 4081 // Forget the original basic block. 4082 PSE.getSE()->forgetLoop(OrigLoop); 4083 4084 // Fix-up external users of the induction variables. 4085 for (auto &Entry : Legal->getInductionVars()) 4086 fixupIVUsers(Entry.first, Entry.second, 4087 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4088 IVEndValues[Entry.first], LoopMiddleBlock); 4089 4090 fixLCSSAPHIs(State); 4091 for (Instruction *PI : PredicatedInstructions) 4092 sinkScalarOperands(&*PI); 4093 4094 // Remove redundant induction instructions. 4095 cse(LoopVectorBody); 4096 4097 // Set/update profile weights for the vector and remainder loops as original 4098 // loop iterations are now distributed among them. Note that original loop 4099 // represented by LoopScalarBody becomes remainder loop after vectorization. 
4100 //
4101 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4102 // end up getting a slightly less accurate result, but that should be OK since
4103 // profile is not inherently precise anyway. Note also possible bypass of
4104 // vector code caused by legality checks is ignored, assigning all the weight
4105 // to the vector loop, optimistically.
4106 //
4107 // For scalable vectorization we can't know at compile time how many iterations
4108 // of the loop are handled in one vector iteration, so instead assume a pessimistic
4109 // vscale of '1'.
4110 setProfileInfoAfterUnrolling(
4111 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4112 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4113 }
4114
4115 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4116 // In order to support recurrences we need to be able to vectorize Phi nodes.
4117 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4118 // stage #2: We now need to fix the recurrences by adding incoming edges to
4119 // the currently empty PHI nodes. At this point every instruction in the
4120 // original loop is widened to a vector form so we can use them to construct
4121 // the incoming edges.
4122 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4123 for (VPRecipeBase &R : Header->phis()) {
4124 auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
4125 if (!PhiR)
4126 continue;
4127 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4128 if (PhiR->getRecurrenceDescriptor()) {
4129 fixReduction(PhiR, State);
4130 } else if (Legal->isFirstOrderRecurrence(OrigPhi))
4131 fixFirstOrderRecurrence(OrigPhi, State);
4132 }
4133 }
4134
4135 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4136 VPTransformState &State) {
4137 // This is the second phase of vectorizing first-order recurrences. An
4138 // overview of the transformation is described below. Suppose we have the
4139 // following loop.
4140 //
4141 // for (int i = 0; i < n; ++i)
4142 // b[i] = a[i] - a[i - 1];
4143 //
4144 // There is a first-order recurrence on "a". For this loop, the shorthand
4145 // scalar IR looks like:
4146 //
4147 // scalar.ph:
4148 // s_init = a[-1]
4149 // br scalar.body
4150 //
4151 // scalar.body:
4152 // i = phi [0, scalar.ph], [i+1, scalar.body]
4153 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4154 // s2 = a[i]
4155 // b[i] = s2 - s1
4156 // br cond, scalar.body, ...
4157 //
4158 // In this example, s1 is a recurrence because its value depends on the
4159 // previous iteration. In the first phase of vectorization, we created a
4160 // temporary value for s1. We now complete the vectorization and produce the
4161 // shorthand vector IR shown below (for VF = 4, UF = 1).
4162 //
4163 // vector.ph:
4164 // v_init = vector(..., ..., ..., a[-1])
4165 // br vector.body
4166 //
4167 // vector.body:
4168 // i = phi [0, vector.ph], [i+4, vector.body]
4169 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4170 // v2 = a[i, i+1, i+2, i+3];
4171 // v3 = vector(v1(3), v2(0, 1, 2))
4172 // b[i, i+1, i+2, i+3] = v2 - v3
4173 // br cond, vector.body, middle.block
4174 //
4175 // middle.block:
4176 // x = v2(3)
4177 // br scalar.ph
4178 //
4179 // scalar.ph:
4180 // s_init = phi [x, middle.block], [a[-1], otherwise]
4181 // br scalar.body
4182 //
4183 // After the vector loop completes execution, we extract the next value of
4184 // the recurrence (x) to use as the initial value in the scalar loop.
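//
// As an additional sketch (not taken from the original comment above), the
// value v3 is what the code below materializes with a vector splice per
// unrolled part:
//
//   v3 = splice(v1, v2, -1)   ; last lane of v1 followed by lanes 0..2 of v2
//
// With UF > 1 the same splice is repeated for each part, each time using the
// previous part as the incoming vector. If the recurrence phi is also used
// outside the loop, the middle block additionally extracts the second-to-last
// lane (the value of the phi itself rather than its update) to feed the LCSSA
// phi in the exit block.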
4185 4186 // Get the original loop preheader and single loop latch. 4187 auto *Preheader = OrigLoop->getLoopPreheader(); 4188 auto *Latch = OrigLoop->getLoopLatch(); 4189 4190 // Get the initial and previous values of the scalar recurrence. 4191 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4192 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4193 4194 auto *IdxTy = Builder.getInt32Ty(); 4195 auto *One = ConstantInt::get(IdxTy, 1); 4196 4197 // Create a vector from the initial value. 4198 auto *VectorInit = ScalarInit; 4199 if (VF.isVector()) { 4200 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4201 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4202 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4203 VectorInit = Builder.CreateInsertElement( 4204 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), 4205 VectorInit, LastIdx, "vector.recur.init"); 4206 } 4207 4208 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4209 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4210 // We constructed a temporary phi node in the first phase of vectorization. 4211 // This phi node will eventually be deleted. 4212 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4213 4214 // Create a phi node for the new recurrence. The current value will either be 4215 // the initial value inserted into a vector or loop-varying vector value. 4216 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4217 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4218 4219 // Get the vectorized previous value of the last part UF - 1. It appears last 4220 // among all unrolled iterations, due to the order of their construction. 4221 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4222 4223 // Find and set the insertion point after the previous value if it is an 4224 // instruction. 4225 BasicBlock::iterator InsertPt; 4226 // Note that the previous value may have been constant-folded so it is not 4227 // guaranteed to be an instruction in the vector loop. 4228 // FIXME: Loop invariant values do not form recurrences. We should deal with 4229 // them earlier. 4230 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4231 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4232 else { 4233 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4234 if (isa<PHINode>(PreviousLastPart)) 4235 // If the previous value is a phi node, we should insert after all the phi 4236 // nodes in the block containing the PHI to avoid breaking basic block 4237 // verification. Note that the basic block may be different to 4238 // LoopVectorBody, in case we predicate the loop. 4239 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4240 else 4241 InsertPt = ++PreviousInst->getIterator(); 4242 } 4243 Builder.SetInsertPoint(&*InsertPt); 4244 4245 // The vector from which to take the initial value for the current iteration 4246 // (actual or unrolled). Initially, this is the vector phi node. 4247 Value *Incoming = VecPhi; 4248 4249 // Shuffle the current and previous vector and update the vector parts. 4250 for (unsigned Part = 0; Part < UF; ++Part) { 4251 Value *PreviousPart = State.get(PreviousDef, Part); 4252 Value *PhiPart = State.get(PhiDef, Part); 4253 auto *Shuffle = VF.isVector() 4254 ? 
Builder.CreateVectorSplice(Incoming, PreviousPart, -1) 4255 : Incoming; 4256 PhiPart->replaceAllUsesWith(Shuffle); 4257 cast<Instruction>(PhiPart)->eraseFromParent(); 4258 State.reset(PhiDef, Shuffle, Part); 4259 Incoming = PreviousPart; 4260 } 4261 4262 // Fix the latch value of the new recurrence in the vector loop. 4263 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4264 4265 // Extract the last vector element in the middle block. This will be the 4266 // initial value for the recurrence when jumping to the scalar loop. 4267 auto *ExtractForScalar = Incoming; 4268 if (VF.isVector()) { 4269 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4270 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4271 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4272 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4273 "vector.recur.extract"); 4274 } 4275 // Extract the second last element in the middle block if the 4276 // Phi is used outside the loop. We need to extract the phi itself 4277 // and not the last element (the phi update in the current iteration). This 4278 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4279 // when the scalar loop is not run at all. 4280 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4281 if (VF.isVector()) { 4282 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4283 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4284 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4285 Incoming, Idx, "vector.recur.extract.for.phi"); 4286 } else if (UF > 1) 4287 // When loop is unrolled without vectorizing, initialize 4288 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4289 // of `Incoming`. This is analogous to the vectorized case above: extracting 4290 // the second last element when VF > 1. 4291 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4292 4293 // Fix the initial value of the original recurrence in the scalar loop. 4294 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4295 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4296 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4297 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4298 Start->addIncoming(Incoming, BB); 4299 } 4300 4301 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4302 Phi->setName("scalar.recur"); 4303 4304 // Finally, fix users of the recurrence outside the loop. The users will need 4305 // either the last value of the scalar recurrence or the last value of the 4306 // vector recurrence we extracted in the middle block. Since the loop is in 4307 // LCSSA form, we just need to find all the phi nodes for the original scalar 4308 // recurrence in the exit block, and then add an edge for the middle block. 4309 // Note that LCSSA does not imply single entry when the original scalar loop 4310 // had multiple exiting edges (as we always run the last iteration in the 4311 // scalar epilogue); in that case, the exiting path through middle will be 4312 // dynamically dead and the value picked for the phi doesn't matter. 
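// For example (an illustrative sketch continuing the shorthand IR above), if
// s1 is used after the loop, the exit block contains an LCSSA phi such as
//
//   exit:
//     s1.lcssa = phi [ s1, scalar.body ]
//
// and the loop below gives it a second incoming value from LoopMiddleBlock,
// namely the lane extracted above as vector.recur.extract.for.phi.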
4313 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4314 if (any_of(LCSSAPhi.incoming_values(),
4315 [Phi](Value *V) { return V == Phi; }))
4316 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4317 }
4318
4319 static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4320 return EnableStrictReductions && RdxDesc.isOrdered();
4321 }
4322
4323 void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
4324 VPTransformState &State) {
4325 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4326 // Get its reduction variable descriptor.
4327 assert(Legal->isReductionVariable(OrigPhi) &&
4328 "Unable to find the reduction variable");
4329 RecurrenceDescriptor RdxDesc = *PhiR->getRecurrenceDescriptor();
4330
4331 RecurKind RK = RdxDesc.getRecurrenceKind();
4332 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4333 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4334 setDebugLocFromInst(Builder, ReductionStartValue);
4335 bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi);
4336
4337 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4338 // This is the vector-clone of the value that leaves the loop.
4339 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4340
4341 // Wrap flags are in general invalid after vectorization, clear them.
4342 clearReductionWrapFlags(RdxDesc, State);
4343
4344 // Fix the vector-loop phi.
4345
4346 // Reductions do not have to start at zero. They can start with
4347 // any loop invariant values.
4348 BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4349
4350 bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
4351 useOrderedReductions(RdxDesc);
4352
4353 for (unsigned Part = 0; Part < UF; ++Part) {
4354 if (IsOrdered && Part > 0)
4355 break;
4356 Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
4357 Value *Val = State.get(PhiR->getBackedgeValue(), Part);
4358 if (IsOrdered)
4359 Val = State.get(PhiR->getBackedgeValue(), UF - 1);
4360
4361 cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
4362 }
4363
4364 // Before each round, move the insertion point right between
4365 // the PHIs and the values we are going to write.
4366 // This allows us to write both PHINodes and the extractelement
4367 // instructions.
4368 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4369
4370 setDebugLocFromInst(Builder, LoopExitInst);
4371
4372 Type *PhiTy = OrigPhi->getType();
4373 // If the tail is folded by masking, the vector value to leave the loop should
4374 // be a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4375 // instead of the former. For an inloop reduction, the reduction will already
4376 // be predicated, and does not need to be handled here.
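// A minimal sketch of the pattern this relies on (names are illustrative):
// with tail folding the vectorized loop body contains something like
//
//   %rdx.next = add <4 x i32> %rdx.phi, %val
//   %rdx.sel  = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
//
// and the code below locates that single select among the users of the
// vectorized exit instruction and uses it as the value leaving the loop.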
4377 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4378 for (unsigned Part = 0; Part < UF; ++Part) { 4379 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4380 Value *Sel = nullptr; 4381 for (User *U : VecLoopExitInst->users()) { 4382 if (isa<SelectInst>(U)) { 4383 assert(!Sel && "Reduction exit feeding two selects"); 4384 Sel = U; 4385 } else 4386 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4387 } 4388 assert(Sel && "Reduction exit feeds no select"); 4389 State.reset(LoopExitInstDef, Sel, Part); 4390 4391 // If the target can create a predicated operator for the reduction at no 4392 // extra cost in the loop (for example a predicated vadd), it can be 4393 // cheaper for the select to remain in the loop than be sunk out of it, 4394 // and so use the select value for the phi instead of the old 4395 // LoopExitValue. 4396 if (PreferPredicatedReductionSelect || 4397 TTI->preferPredicatedReductionSelect( 4398 RdxDesc.getOpcode(), PhiTy, 4399 TargetTransformInfo::ReductionFlags())) { 4400 auto *VecRdxPhi = 4401 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4402 VecRdxPhi->setIncomingValueForBlock( 4403 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4404 } 4405 } 4406 } 4407 4408 // If the vector reduction can be performed in a smaller type, we truncate 4409 // then extend the loop exit value to enable InstCombine to evaluate the 4410 // entire expression in the smaller type. 4411 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4412 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4413 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4414 Builder.SetInsertPoint( 4415 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4416 VectorParts RdxParts(UF); 4417 for (unsigned Part = 0; Part < UF; ++Part) { 4418 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4419 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4420 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4421 : Builder.CreateZExt(Trunc, VecTy); 4422 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4423 UI != RdxParts[Part]->user_end();) 4424 if (*UI != Trunc) { 4425 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4426 RdxParts[Part] = Extnd; 4427 } else { 4428 ++UI; 4429 } 4430 } 4431 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4432 for (unsigned Part = 0; Part < UF; ++Part) { 4433 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4434 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4435 } 4436 } 4437 4438 // Reduce all of the unrolled parts into a single vector. 4439 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4440 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4441 4442 // The middle block terminator has already been assigned a DebugLoc here (the 4443 // OrigLoop's single latch terminator). We want the whole middle block to 4444 // appear to execute on this line because: (a) it is all compiler generated, 4445 // (b) these instructions are always executed after evaluating the latch 4446 // conditional branch, and (c) other passes may add new predecessors which 4447 // terminate on this line. This is the easiest way to ensure we don't 4448 // accidentally cause an extra step back into the loop while debugging. 
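// As a sketch of the part-combining step below (assuming UF = 2 and an
// integer add reduction; names are illustrative), the unrolled parts are
// chained into a single vector value:
//
//   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
//
// Min/max recurrences use a min/max operation instead of a binary operator,
// and ordered (strict FP) reductions simply take the last part, so no
// combining is needed for them.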
4449 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4450 if (IsOrdered)
4451 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4452 else {
4453 // Floating-point operations should have some FMF to enable the reduction.
4454 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4455 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4456 for (unsigned Part = 1; Part < UF; ++Part) {
4457 Value *RdxPart = State.get(LoopExitInstDef, Part);
4458 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4459 ReducedPartRdx = Builder.CreateBinOp(
4460 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4461 } else {
4462 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4463 }
4464 }
4465 }
4466
4467 // Create the reduction after the loop. Note that inloop reductions create the
4468 // target reduction in the loop using a Reduction recipe.
4469 if (VF.isVector() && !IsInLoopReductionPhi) {
4470 ReducedPartRdx =
4471 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4472 // If the reduction can be performed in a smaller type, we need to extend
4473 // the reduction to the wider type before we branch to the original loop.
4474 if (PhiTy != RdxDesc.getRecurrenceType())
4475 ReducedPartRdx = RdxDesc.isSigned()
4476 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4477 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4478 }
4479
4480 // Create a phi node that merges control-flow from the backedge-taken check
4481 // block and the middle block.
4482 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4483 LoopScalarPreHeader->getTerminator());
4484 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4485 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4486 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4487
4488 // Now, we need to fix the users of the reduction variable
4489 // inside and outside of the scalar remainder loop.
4490
4491 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4492 // in the exit blocks. See the comment on the analogous loop in
4493 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4494 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4495 if (any_of(LCSSAPhi.incoming_values(),
4496 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4497 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4498
4499 // Fix the scalar loop reduction variable with the incoming reduction sum
4500 // from the vector body and from the backedge value.
4501 int IncomingEdgeBlockIdx =
4502 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4503 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4504 // Pick the other block.
4505 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 4506 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4507 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4508 } 4509 4510 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4511 VPTransformState &State) { 4512 RecurKind RK = RdxDesc.getRecurrenceKind(); 4513 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4514 return; 4515 4516 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4517 assert(LoopExitInstr && "null loop exit instruction"); 4518 SmallVector<Instruction *, 8> Worklist; 4519 SmallPtrSet<Instruction *, 8> Visited; 4520 Worklist.push_back(LoopExitInstr); 4521 Visited.insert(LoopExitInstr); 4522 4523 while (!Worklist.empty()) { 4524 Instruction *Cur = Worklist.pop_back_val(); 4525 if (isa<OverflowingBinaryOperator>(Cur)) 4526 for (unsigned Part = 0; Part < UF; ++Part) { 4527 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4528 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4529 } 4530 4531 for (User *U : Cur->users()) { 4532 Instruction *UI = cast<Instruction>(U); 4533 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4534 Visited.insert(UI).second) 4535 Worklist.push_back(UI); 4536 } 4537 } 4538 } 4539 4540 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4541 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4542 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4543 // Some phis were already hand updated by the reduction and recurrence 4544 // code above, leave them alone. 4545 continue; 4546 4547 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4548 // Non-instruction incoming values will have only one value. 4549 4550 VPLane Lane = VPLane::getFirstLane(); 4551 if (isa<Instruction>(IncomingValue) && 4552 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4553 VF)) 4554 Lane = VPLane::getLastLaneForVF(VF); 4555 4556 // Can be a loop invariant incoming value or the last scalar value to be 4557 // extracted from the vectorized loop. 4558 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4559 Value *lastIncomingValue = 4560 OrigLoop->isLoopInvariant(IncomingValue) 4561 ? IncomingValue 4562 : State.get(State.Plan->getVPValue(IncomingValue), 4563 VPIteration(UF - 1, Lane)); 4564 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4565 } 4566 } 4567 4568 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4569 // The basic block and loop containing the predicated instruction. 4570 auto *PredBB = PredInst->getParent(); 4571 auto *VectorLoop = LI->getLoopFor(PredBB); 4572 4573 // Initialize a worklist with the operands of the predicated instruction. 4574 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4575 4576 // Holds instructions that we need to analyze again. An instruction may be 4577 // reanalyzed if we don't yet know if we can sink it or not. 4578 SmallVector<Instruction *, 8> InstsToReanalyze; 4579 4580 // Returns true if a given use occurs in the predicated block. Phi nodes use 4581 // their operands in their corresponding predecessor blocks. 4582 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4583 auto *I = cast<Instruction>(U.getUser()); 4584 BasicBlock *BB = I->getParent(); 4585 if (auto *Phi = dyn_cast<PHINode>(I)) 4586 BB = Phi->getIncomingBlock( 4587 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4588 return BB == PredBB; 4589 }; 4590 4591 // Iteratively sink the scalarized operands of the predicated instruction 4592 // into the block we created for it. 
When an instruction is sunk, it's 4593 // operands are then added to the worklist. The algorithm ends after one pass 4594 // through the worklist doesn't sink a single instruction. 4595 bool Changed; 4596 do { 4597 // Add the instructions that need to be reanalyzed to the worklist, and 4598 // reset the changed indicator. 4599 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4600 InstsToReanalyze.clear(); 4601 Changed = false; 4602 4603 while (!Worklist.empty()) { 4604 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4605 4606 // We can't sink an instruction if it is a phi node, is not in the loop, 4607 // or may have side effects. 4608 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 4609 I->mayHaveSideEffects()) 4610 continue; 4611 4612 // If the instruction is already in PredBB, check if we can sink its 4613 // operands. In that case, VPlan's sinkScalarOperands() succeeded in 4614 // sinking the scalar instruction I, hence it appears in PredBB; but it 4615 // may have failed to sink I's operands (recursively), which we try 4616 // (again) here. 4617 if (I->getParent() == PredBB) { 4618 Worklist.insert(I->op_begin(), I->op_end()); 4619 continue; 4620 } 4621 4622 // It's legal to sink the instruction if all its uses occur in the 4623 // predicated block. Otherwise, there's nothing to do yet, and we may 4624 // need to reanalyze the instruction. 4625 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4626 InstsToReanalyze.push_back(I); 4627 continue; 4628 } 4629 4630 // Move the instruction to the beginning of the predicated block, and add 4631 // it's operands to the worklist. 4632 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4633 Worklist.insert(I->op_begin(), I->op_end()); 4634 4635 // The sinking may have enabled other instructions to be sunk, so we will 4636 // need to iterate. 4637 Changed = true; 4638 } 4639 } while (Changed); 4640 } 4641 4642 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4643 for (PHINode *OrigPhi : OrigPHIsToFix) { 4644 VPWidenPHIRecipe *VPPhi = 4645 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4646 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4647 // Make sure the builder has a valid insert point. 4648 Builder.SetInsertPoint(NewPhi); 4649 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4650 VPValue *Inc = VPPhi->getIncomingValue(i); 4651 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4652 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4653 } 4654 } 4655 } 4656 4657 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4658 VPUser &Operands, unsigned UF, 4659 ElementCount VF, bool IsPtrLoopInvariant, 4660 SmallBitVector &IsIndexLoopInvariant, 4661 VPTransformState &State) { 4662 // Construct a vector GEP by widening the operands of the scalar GEP as 4663 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4664 // results in a vector of pointers when at least one operand of the GEP 4665 // is vector-typed. Thus, to keep the representation compact, we only use 4666 // vector-typed operands for loop-varying values. 4667 4668 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4669 // If we are vectorizing, but the GEP has only loop-invariant operands, 4670 // the GEP we build (by only using vector-typed operands for 4671 // loop-varying values) would be a scalar pointer. 
Thus, to ensure we 4672 // produce a vector of pointers, we need to either arbitrarily pick an 4673 // operand to broadcast, or broadcast a clone of the original GEP. 4674 // Here, we broadcast a clone of the original. 4675 // 4676 // TODO: If at some point we decide to scalarize instructions having 4677 // loop-invariant operands, this special case will no longer be 4678 // required. We would add the scalarization decision to 4679 // collectLoopScalars() and teach getVectorValue() to broadcast 4680 // the lane-zero scalar value. 4681 auto *Clone = Builder.Insert(GEP->clone()); 4682 for (unsigned Part = 0; Part < UF; ++Part) { 4683 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4684 State.set(VPDef, EntryPart, Part); 4685 addMetadata(EntryPart, GEP); 4686 } 4687 } else { 4688 // If the GEP has at least one loop-varying operand, we are sure to 4689 // produce a vector of pointers. But if we are only unrolling, we want 4690 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4691 // produce with the code below will be scalar (if VF == 1) or vector 4692 // (otherwise). Note that for the unroll-only case, we still maintain 4693 // values in the vector mapping with initVector, as we do for other 4694 // instructions. 4695 for (unsigned Part = 0; Part < UF; ++Part) { 4696 // The pointer operand of the new GEP. If it's loop-invariant, we 4697 // won't broadcast it. 4698 auto *Ptr = IsPtrLoopInvariant 4699 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4700 : State.get(Operands.getOperand(0), Part); 4701 4702 // Collect all the indices for the new GEP. If any index is 4703 // loop-invariant, we won't broadcast it. 4704 SmallVector<Value *, 4> Indices; 4705 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4706 VPValue *Operand = Operands.getOperand(I); 4707 if (IsIndexLoopInvariant[I - 1]) 4708 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4709 else 4710 Indices.push_back(State.get(Operand, Part)); 4711 } 4712 4713 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4714 // but it should be a vector, otherwise. 4715 auto *NewGEP = 4716 GEP->isInBounds() 4717 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4718 Indices) 4719 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4720 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4721 "NewGEP is not a pointer vector"); 4722 State.set(VPDef, NewGEP, Part); 4723 addMetadata(NewGEP, GEP); 4724 } 4725 } 4726 } 4727 4728 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4729 RecurrenceDescriptor *RdxDesc, 4730 VPWidenPHIRecipe *PhiR, 4731 VPTransformState &State) { 4732 PHINode *P = cast<PHINode>(PN); 4733 if (EnableVPlanNativePath) { 4734 // Currently we enter here in the VPlan-native path for non-induction 4735 // PHIs where all control flow is uniform. We simply widen these PHIs. 4736 // Create a vector phi with no operands - the vector phi operands will be 4737 // set at the end of vector code generation. 4738 Type *VecTy = (State.VF.isScalar()) 4739 ? PN->getType() 4740 : VectorType::get(PN->getType(), State.VF); 4741 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4742 State.set(PhiR, VecPhi, 0); 4743 OrigPHIsToFix.push_back(P); 4744 4745 return; 4746 } 4747 4748 assert(PN->getParent() == OrigLoop->getHeader() && 4749 "Non-header phis should have been handled elsewhere"); 4750 4751 VPValue *StartVPV = PhiR->getStartValue(); 4752 Value *StartV = StartVPV ? 
StartVPV->getLiveInIRValue() : nullptr;
4753 // In order to support recurrences we need to be able to vectorize Phi nodes.
4754 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4755 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4756 // this value when we vectorize all of the instructions that use the PHI.
4757 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4758 Value *Iden = nullptr;
4759 bool ScalarPHI =
4760 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4761 Type *VecTy =
4762 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4763
4764 if (RdxDesc) {
4765 assert(Legal->isReductionVariable(P) && StartV &&
4766 "RdxDesc should only be set for reduction variables; in that case "
4767 "a StartV is also required");
4768 RecurKind RK = RdxDesc->getRecurrenceKind();
4769 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4770 // MinMax reductions have the start value as their identity.
4771 if (ScalarPHI) {
4772 Iden = StartV;
4773 } else {
4774 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4775 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4776 StartV = Iden =
4777 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4778 }
4779 } else {
4780 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4781 RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
4782 Iden = IdenC;
4783
4784 if (!ScalarPHI) {
4785 Iden = ConstantVector::getSplat(State.VF, IdenC);
4786 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4787 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4788 Constant *Zero = Builder.getInt32(0);
4789 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4790 }
4791 }
4792 }
4793
4794 bool IsOrdered = State.VF.isVector() &&
4795 Cost->isInLoopReduction(cast<PHINode>(PN)) &&
4796 useOrderedReductions(*RdxDesc);
4797
4798 for (unsigned Part = 0; Part < State.UF; ++Part) {
4799 // This is phase one of vectorizing PHIs.
4800 if (Part > 0 && IsOrdered)
4801 return;
4802 Value *EntryPart = PHINode::Create(
4803 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4804 State.set(PhiR, EntryPart, Part);
4805 if (StartV) {
4806 // Make sure to add the reduction start value only to the
4807 // first unroll part.
4808 Value *StartVal = (Part == 0) ? StartV : Iden;
4809 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4810 }
4811 }
4812 return;
4813 }
4814
4815 assert(!Legal->isReductionVariable(P) &&
4816 "reductions should be handled above");
4817
4818 setDebugLocFromInst(Builder, P);
4819
4820 // This PHINode must be an induction variable.
4821 // Make sure that we know about it.
4822 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4823
4824 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4825 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4826
4827 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4828 // which can be found from the original scalar operations.
4829 switch (II.getKind()) {
4830 case InductionDescriptor::IK_NoInduction:
4831 llvm_unreachable("Unknown induction");
4832 case InductionDescriptor::IK_IntInduction:
4833 case InductionDescriptor::IK_FpInduction:
4834 llvm_unreachable("Integer/fp induction is handled elsewhere.");
4835 case InductionDescriptor::IK_PtrInduction: {
4836 // Handle the pointer induction variable case.
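// An illustrative example of such an induction (a sketch, not from the
// original sources) is a loop that walks a buffer through a pointer:
//
//   for (int i = 0; i < n; ++i)
//     sum += *p++;
//
// Here p is an IK_PtrInduction whose step is one element per iteration.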
4837 assert(P->getType()->isPointerTy() && "Unexpected type."); 4838 4839 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4840 // This is the normalized GEP that starts counting at zero. 4841 Value *PtrInd = 4842 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4843 // Determine the number of scalars we need to generate for each unroll 4844 // iteration. If the instruction is uniform, we only need to generate the 4845 // first lane. Otherwise, we generate all VF values. 4846 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4847 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4848 4849 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4850 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4851 if (NeedsVectorIndex) { 4852 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4853 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4854 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4855 } 4856 4857 for (unsigned Part = 0; Part < UF; ++Part) { 4858 Value *PartStart = createStepForVF( 4859 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4860 4861 if (NeedsVectorIndex) { 4862 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4863 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4864 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4865 Value *SclrGep = 4866 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4867 SclrGep->setName("next.gep"); 4868 State.set(PhiR, SclrGep, Part); 4869 // We've cached the whole vector, which means we can support the 4870 // extraction of any lane. 4871 continue; 4872 } 4873 4874 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4875 Value *Idx = Builder.CreateAdd( 4876 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4877 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4878 Value *SclrGep = 4879 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4880 SclrGep->setName("next.gep"); 4881 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4882 } 4883 } 4884 return; 4885 } 4886 assert(isa<SCEVConstant>(II.getStep()) && 4887 "Induction step not a SCEV constant!"); 4888 Type *PhiType = II.getStep()->getType(); 4889 4890 // Build a pointer phi 4891 Value *ScalarStartValue = II.getStartValue(); 4892 Type *ScStValueType = ScalarStartValue->getType(); 4893 PHINode *NewPointerPhi = 4894 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4895 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4896 4897 // A pointer induction, performed by using a gep 4898 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4899 Instruction *InductionLoc = LoopLatch->getTerminator(); 4900 const SCEV *ScalarStep = II.getStep(); 4901 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4902 Value *ScalarStepValue = 4903 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4904 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4905 Value *NumUnrolledElems = 4906 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4907 Value *InductionGEP = GetElementPtrInst::Create( 4908 ScStValueType->getPointerElementType(), NewPointerPhi, 4909 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4910 InductionLoc); 4911 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4912 4913 // Create UF many actual address geps that use the pointer 4914 // phi as base and a vectorized version of the step value 4915 // (<step*0, ..., step*N>) as offset. 
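// A sketch of the result for a unit step with VF = 4, UF = 2 and i32 elements
// (names and types are illustrative):
//
//   %vector.gep   = getelementptr i32, i32* %pointer.phi, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
//   %vector.gep.1 = getelementptr i32, i32* %pointer.phi, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
//
// For non-unit steps the step value is multiplied into the offset vectors.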
4916 for (unsigned Part = 0; Part < State.UF; ++Part) { 4917 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4918 Value *StartOffsetScalar = 4919 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4920 Value *StartOffset = 4921 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4922 // Create a vector of consecutive numbers from zero to VF. 4923 StartOffset = 4924 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4925 4926 Value *GEP = Builder.CreateGEP( 4927 ScStValueType->getPointerElementType(), NewPointerPhi, 4928 Builder.CreateMul( 4929 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4930 "vector.gep")); 4931 State.set(PhiR, GEP, Part); 4932 } 4933 } 4934 } 4935 } 4936 4937 /// A helper function for checking whether an integer division-related 4938 /// instruction may divide by zero (in which case it must be predicated if 4939 /// executed conditionally in the scalar code). 4940 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4941 /// Non-zero divisors that are non compile-time constants will not be 4942 /// converted into multiplication, so we will still end up scalarizing 4943 /// the division, but can do so w/o predication. 4944 static bool mayDivideByZero(Instruction &I) { 4945 assert((I.getOpcode() == Instruction::UDiv || 4946 I.getOpcode() == Instruction::SDiv || 4947 I.getOpcode() == Instruction::URem || 4948 I.getOpcode() == Instruction::SRem) && 4949 "Unexpected instruction"); 4950 Value *Divisor = I.getOperand(1); 4951 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4952 return !CInt || CInt->isZero(); 4953 } 4954 4955 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4956 VPUser &User, 4957 VPTransformState &State) { 4958 switch (I.getOpcode()) { 4959 case Instruction::Call: 4960 case Instruction::Br: 4961 case Instruction::PHI: 4962 case Instruction::GetElementPtr: 4963 case Instruction::Select: 4964 llvm_unreachable("This instruction is handled by a different recipe."); 4965 case Instruction::UDiv: 4966 case Instruction::SDiv: 4967 case Instruction::SRem: 4968 case Instruction::URem: 4969 case Instruction::Add: 4970 case Instruction::FAdd: 4971 case Instruction::Sub: 4972 case Instruction::FSub: 4973 case Instruction::FNeg: 4974 case Instruction::Mul: 4975 case Instruction::FMul: 4976 case Instruction::FDiv: 4977 case Instruction::FRem: 4978 case Instruction::Shl: 4979 case Instruction::LShr: 4980 case Instruction::AShr: 4981 case Instruction::And: 4982 case Instruction::Or: 4983 case Instruction::Xor: { 4984 // Just widen unops and binops. 4985 setDebugLocFromInst(Builder, &I); 4986 4987 for (unsigned Part = 0; Part < UF; ++Part) { 4988 SmallVector<Value *, 2> Ops; 4989 for (VPValue *VPOp : User.operands()) 4990 Ops.push_back(State.get(VPOp, Part)); 4991 4992 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4993 4994 if (auto *VecOp = dyn_cast<Instruction>(V)) 4995 VecOp->copyIRFlags(&I); 4996 4997 // Use this vector value for all users of the original instruction. 4998 State.set(Def, V, Part); 4999 addMetadata(V, &I); 5000 } 5001 5002 break; 5003 } 5004 case Instruction::ICmp: 5005 case Instruction::FCmp: { 5006 // Widen compares. Generate vector compares. 
5007 bool FCmp = (I.getOpcode() == Instruction::FCmp); 5008 auto *Cmp = cast<CmpInst>(&I); 5009 setDebugLocFromInst(Builder, Cmp); 5010 for (unsigned Part = 0; Part < UF; ++Part) { 5011 Value *A = State.get(User.getOperand(0), Part); 5012 Value *B = State.get(User.getOperand(1), Part); 5013 Value *C = nullptr; 5014 if (FCmp) { 5015 // Propagate fast math flags. 5016 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 5017 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 5018 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 5019 } else { 5020 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 5021 } 5022 State.set(Def, C, Part); 5023 addMetadata(C, &I); 5024 } 5025 5026 break; 5027 } 5028 5029 case Instruction::ZExt: 5030 case Instruction::SExt: 5031 case Instruction::FPToUI: 5032 case Instruction::FPToSI: 5033 case Instruction::FPExt: 5034 case Instruction::PtrToInt: 5035 case Instruction::IntToPtr: 5036 case Instruction::SIToFP: 5037 case Instruction::UIToFP: 5038 case Instruction::Trunc: 5039 case Instruction::FPTrunc: 5040 case Instruction::BitCast: { 5041 auto *CI = cast<CastInst>(&I); 5042 setDebugLocFromInst(Builder, CI); 5043 5044 /// Vectorize casts. 5045 Type *DestTy = 5046 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 5047 5048 for (unsigned Part = 0; Part < UF; ++Part) { 5049 Value *A = State.get(User.getOperand(0), Part); 5050 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 5051 State.set(Def, Cast, Part); 5052 addMetadata(Cast, &I); 5053 } 5054 break; 5055 } 5056 default: 5057 // This instruction is not vectorized by simple widening. 5058 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 5059 llvm_unreachable("Unhandled instruction!"); 5060 } // end of switch. 5061 } 5062 5063 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 5064 VPUser &ArgOperands, 5065 VPTransformState &State) { 5066 assert(!isa<DbgInfoIntrinsic>(I) && 5067 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 5068 setDebugLocFromInst(Builder, &I); 5069 5070 Module *M = I.getParent()->getParent()->getParent(); 5071 auto *CI = cast<CallInst>(&I); 5072 5073 SmallVector<Type *, 4> Tys; 5074 for (Value *ArgOperand : CI->arg_operands()) 5075 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 5076 5077 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 5078 5079 // The flag shows whether we use Intrinsic or a usual Call for vectorized 5080 // version of the instruction. 5081 // Is it beneficial to perform intrinsic call compared to lib call? 5082 bool NeedToScalarize = false; 5083 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 5084 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 5085 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 5086 assert((UseVectorIntrinsic || !NeedToScalarize) && 5087 "Instruction should be scalarized elsewhere."); 5088 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 5089 "Either the intrinsic cost or vector call cost must be valid"); 5090 5091 for (unsigned Part = 0; Part < UF; ++Part) { 5092 SmallVector<Value *, 4> Args; 5093 for (auto &I : enumerate(ArgOperands.operands())) { 5094 // Some intrinsics have a scalar argument - don't replace it with a 5095 // vector. 
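// For example (a sketch; intrinsic name mangling omitted): for llvm.powi the
// exponent operand must stay scalar, so only the floating-point operand is
// widened and the exponent is taken from lane zero:
//
//   %r = call <4 x float> @llvm.powi(<4 x float> %base, i32 %exp)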
5096 Value *Arg; 5097 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5098 Arg = State.get(I.value(), Part); 5099 else 5100 Arg = State.get(I.value(), VPIteration(0, 0)); 5101 Args.push_back(Arg); 5102 } 5103 5104 Function *VectorF; 5105 if (UseVectorIntrinsic) { 5106 // Use vector version of the intrinsic. 5107 Type *TysForDecl[] = {CI->getType()}; 5108 if (VF.isVector()) 5109 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5110 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5111 assert(VectorF && "Can't retrieve vector intrinsic."); 5112 } else { 5113 // Use vector version of the function call. 5114 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5115 #ifndef NDEBUG 5116 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5117 "Can't create vector function."); 5118 #endif 5119 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5120 } 5121 SmallVector<OperandBundleDef, 1> OpBundles; 5122 CI->getOperandBundlesAsDefs(OpBundles); 5123 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5124 5125 if (isa<FPMathOperator>(V)) 5126 V->copyFastMathFlags(CI); 5127 5128 State.set(Def, V, Part); 5129 addMetadata(V, &I); 5130 } 5131 } 5132 5133 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5134 VPUser &Operands, 5135 bool InvariantCond, 5136 VPTransformState &State) { 5137 setDebugLocFromInst(Builder, &I); 5138 5139 // The condition can be loop invariant but still defined inside the 5140 // loop. This means that we can't just use the original 'cond' value. 5141 // We have to take the 'vectorized' value and pick the first lane. 5142 // Instcombine will make this a no-op. 5143 auto *InvarCond = InvariantCond 5144 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5145 : nullptr; 5146 5147 for (unsigned Part = 0; Part < UF; ++Part) { 5148 Value *Cond = 5149 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5150 Value *Op0 = State.get(Operands.getOperand(1), Part); 5151 Value *Op1 = State.get(Operands.getOperand(2), Part); 5152 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5153 State.set(VPDef, Sel, Part); 5154 addMetadata(Sel, &I); 5155 } 5156 } 5157 5158 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5159 // We should not collect Scalars more than once per VF. Right now, this 5160 // function is called from collectUniformsAndScalars(), which already does 5161 // this check. Collecting Scalars for VF=1 does not make any sense. 5162 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5163 "This function should not be visited twice for the same VF"); 5164 5165 SmallSetVector<Instruction *, 8> Worklist; 5166 5167 // These sets are used to seed the analysis with pointers used by memory 5168 // accesses that will remain scalar. 5169 SmallSetVector<Instruction *, 8> ScalarPtrs; 5170 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5171 auto *Latch = TheLoop->getLoopLatch(); 5172 5173 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5174 // The pointer operands of loads and stores will be scalar as long as the 5175 // memory access is not a gather or scatter operation. The value operand of a 5176 // store will remain scalar if the store is scalarized. 
5177 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5178 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5179 assert(WideningDecision != CM_Unknown && 5180 "Widening decision should be ready at this moment"); 5181 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5182 if (Ptr == Store->getValueOperand()) 5183 return WideningDecision == CM_Scalarize; 5184 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5185 "Ptr is neither a value or pointer operand"); 5186 return WideningDecision != CM_GatherScatter; 5187 }; 5188 5189 // A helper that returns true if the given value is a bitcast or 5190 // getelementptr instruction contained in the loop. 5191 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5192 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5193 isa<GetElementPtrInst>(V)) && 5194 !TheLoop->isLoopInvariant(V); 5195 }; 5196 5197 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5198 if (!isa<PHINode>(Ptr) || 5199 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5200 return false; 5201 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5202 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5203 return false; 5204 return isScalarUse(MemAccess, Ptr); 5205 }; 5206 5207 // A helper that evaluates a memory access's use of a pointer. If the 5208 // pointer is actually the pointer induction of a loop, it is being 5209 // inserted into Worklist. If the use will be a scalar use, and the 5210 // pointer is only used by memory accesses, we place the pointer in 5211 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5212 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5213 if (isScalarPtrInduction(MemAccess, Ptr)) { 5214 Worklist.insert(cast<Instruction>(Ptr)); 5215 Instruction *Update = cast<Instruction>( 5216 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5217 Worklist.insert(Update); 5218 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5219 << "\n"); 5220 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5221 << "\n"); 5222 return; 5223 } 5224 // We only care about bitcast and getelementptr instructions contained in 5225 // the loop. 5226 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5227 return; 5228 5229 // If the pointer has already been identified as scalar (e.g., if it was 5230 // also identified as uniform), there's nothing to do. 5231 auto *I = cast<Instruction>(Ptr); 5232 if (Worklist.count(I)) 5233 return; 5234 5235 // If the use of the pointer will be a scalar use, and all users of the 5236 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5237 // place the pointer in PossibleNonScalarPtrs. 5238 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5239 return isa<LoadInst>(U) || isa<StoreInst>(U); 5240 })) 5241 ScalarPtrs.insert(I); 5242 else 5243 PossibleNonScalarPtrs.insert(I); 5244 }; 5245 5246 // We seed the scalars analysis with three classes of instructions: (1) 5247 // instructions marked uniform-after-vectorization and (2) bitcast, 5248 // getelementptr and (pointer) phi instructions used by memory accesses 5249 // requiring a scalar use. 5250 // 5251 // (1) Add to the worklist all instructions that have been identified as 5252 // uniform-after-vectorization. 
5253 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5254 5255 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5256 // memory accesses requiring a scalar use. The pointer operands of loads and 5257 // stores will be scalar as long as the memory accesses is not a gather or 5258 // scatter operation. The value operand of a store will remain scalar if the 5259 // store is scalarized. 5260 for (auto *BB : TheLoop->blocks()) 5261 for (auto &I : *BB) { 5262 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5263 evaluatePtrUse(Load, Load->getPointerOperand()); 5264 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5265 evaluatePtrUse(Store, Store->getPointerOperand()); 5266 evaluatePtrUse(Store, Store->getValueOperand()); 5267 } 5268 } 5269 for (auto *I : ScalarPtrs) 5270 if (!PossibleNonScalarPtrs.count(I)) { 5271 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5272 Worklist.insert(I); 5273 } 5274 5275 // Insert the forced scalars. 5276 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5277 // induction variable when the PHI user is scalarized. 5278 auto ForcedScalar = ForcedScalars.find(VF); 5279 if (ForcedScalar != ForcedScalars.end()) 5280 for (auto *I : ForcedScalar->second) 5281 Worklist.insert(I); 5282 5283 // Expand the worklist by looking through any bitcasts and getelementptr 5284 // instructions we've already identified as scalar. This is similar to the 5285 // expansion step in collectLoopUniforms(); however, here we're only 5286 // expanding to include additional bitcasts and getelementptr instructions. 5287 unsigned Idx = 0; 5288 while (Idx != Worklist.size()) { 5289 Instruction *Dst = Worklist[Idx++]; 5290 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5291 continue; 5292 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5293 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5294 auto *J = cast<Instruction>(U); 5295 return !TheLoop->contains(J) || Worklist.count(J) || 5296 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5297 isScalarUse(J, Src)); 5298 })) { 5299 Worklist.insert(Src); 5300 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5301 } 5302 } 5303 5304 // An induction variable will remain scalar if all users of the induction 5305 // variable and induction variable update remain scalar. 5306 for (auto &Induction : Legal->getInductionVars()) { 5307 auto *Ind = Induction.first; 5308 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5309 5310 // If tail-folding is applied, the primary induction variable will be used 5311 // to feed a vector compare. 5312 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5313 continue; 5314 5315 // Determine if all users of the induction variable are scalar after 5316 // vectorization. 5317 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5318 auto *I = cast<Instruction>(U); 5319 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5320 }); 5321 if (!ScalarInd) 5322 continue; 5323 5324 // Determine if all users of the induction variable update instruction are 5325 // scalar after vectorization. 5326 auto ScalarIndUpdate = 5327 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5328 auto *I = cast<Instruction>(U); 5329 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5330 }); 5331 if (!ScalarIndUpdate) 5332 continue; 5333 5334 // The induction variable and its update instruction will remain scalar. 
5335 Worklist.insert(Ind); 5336 Worklist.insert(IndUpdate); 5337 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5338 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5339 << "\n"); 5340 } 5341 5342 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5343 } 5344 5345 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5346 if (!blockNeedsPredication(I->getParent())) 5347 return false; 5348 switch(I->getOpcode()) { 5349 default: 5350 break; 5351 case Instruction::Load: 5352 case Instruction::Store: { 5353 if (!Legal->isMaskRequired(I)) 5354 return false; 5355 auto *Ptr = getLoadStorePointerOperand(I); 5356 auto *Ty = getMemInstValueType(I); 5357 const Align Alignment = getLoadStoreAlignment(I); 5358 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5359 isLegalMaskedGather(Ty, Alignment)) 5360 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5361 isLegalMaskedScatter(Ty, Alignment)); 5362 } 5363 case Instruction::UDiv: 5364 case Instruction::SDiv: 5365 case Instruction::SRem: 5366 case Instruction::URem: 5367 return mayDivideByZero(*I); 5368 } 5369 return false; 5370 } 5371 5372 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5373 Instruction *I, ElementCount VF) { 5374 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5375 assert(getWideningDecision(I, VF) == CM_Unknown && 5376 "Decision should not be set yet."); 5377 auto *Group = getInterleavedAccessGroup(I); 5378 assert(Group && "Must have a group."); 5379 5380 // If the instruction's allocated size doesn't equal it's type size, it 5381 // requires padding and will be scalarized. 5382 auto &DL = I->getModule()->getDataLayout(); 5383 auto *ScalarTy = getMemInstValueType(I); 5384 if (hasIrregularType(ScalarTy, DL)) 5385 return false; 5386 5387 // Check if masking is required. 5388 // A Group may need masking for one of two reasons: it resides in a block that 5389 // needs predication, or it was decided to use masking to deal with gaps. 5390 bool PredicatedAccessRequiresMasking = 5391 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5392 bool AccessWithGapsRequiresMasking = 5393 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5394 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5395 return true; 5396 5397 // If masked interleaving is required, we expect that the user/target had 5398 // enabled it, because otherwise it either wouldn't have been created or 5399 // it should have been invalidated by the CostModel. 5400 assert(useMaskedInterleavedAccesses(TTI) && 5401 "Masked interleave-groups for predicated accesses are not enabled."); 5402 5403 auto *Ty = getMemInstValueType(I); 5404 const Align Alignment = getLoadStoreAlignment(I); 5405 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5406 : TTI.isLegalMaskedStore(Ty, Alignment); 5407 } 5408 5409 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5410 Instruction *I, ElementCount VF) { 5411 // Get and ensure we have a valid memory instruction. 5412 LoadInst *LI = dyn_cast<LoadInst>(I); 5413 StoreInst *SI = dyn_cast<StoreInst>(I); 5414 assert((LI || SI) && "Invalid memory instruction"); 5415 5416 auto *Ptr = getLoadStorePointerOperand(I); 5417 5418 // In order to be widened, the pointer should be consecutive, first of all. 
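// For example (a sketch): in
//
//   for (int i = 0; i < n; ++i)
//     sum += a[i];
//
// the pointer a + i is consecutive and the loads can be widened, whereas an
// access like a[2 * i] or a[idx[i]] is not consecutive and would instead be
// handled as a gather/scatter or scalarized.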
5419 if (!Legal->isConsecutivePtr(Ptr)) 5420 return false; 5421 5422 // If the instruction is a store located in a predicated block, it will be 5423 // scalarized. 5424 if (isScalarWithPredication(I)) 5425 return false; 5426 5427 // If the instruction's allocated size doesn't equal it's type size, it 5428 // requires padding and will be scalarized. 5429 auto &DL = I->getModule()->getDataLayout(); 5430 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5431 if (hasIrregularType(ScalarTy, DL)) 5432 return false; 5433 5434 return true; 5435 } 5436 5437 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5438 // We should not collect Uniforms more than once per VF. Right now, 5439 // this function is called from collectUniformsAndScalars(), which 5440 // already does this check. Collecting Uniforms for VF=1 does not make any 5441 // sense. 5442 5443 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5444 "This function should not be visited twice for the same VF"); 5445 5446 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5447 // not analyze again. Uniforms.count(VF) will return 1. 5448 Uniforms[VF].clear(); 5449 5450 // We now know that the loop is vectorizable! 5451 // Collect instructions inside the loop that will remain uniform after 5452 // vectorization. 5453 5454 // Global values, params and instructions outside of current loop are out of 5455 // scope. 5456 auto isOutOfScope = [&](Value *V) -> bool { 5457 Instruction *I = dyn_cast<Instruction>(V); 5458 return (!I || !TheLoop->contains(I)); 5459 }; 5460 5461 SetVector<Instruction *> Worklist; 5462 BasicBlock *Latch = TheLoop->getLoopLatch(); 5463 5464 // Instructions that are scalar with predication must not be considered 5465 // uniform after vectorization, because that would create an erroneous 5466 // replicating region where only a single instance out of VF should be formed. 5467 // TODO: optimize such seldom cases if found important, see PR40816. 5468 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5469 if (isOutOfScope(I)) { 5470 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5471 << *I << "\n"); 5472 return; 5473 } 5474 if (isScalarWithPredication(I)) { 5475 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5476 << *I << "\n"); 5477 return; 5478 } 5479 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5480 Worklist.insert(I); 5481 }; 5482 5483 // Start with the conditional branch. If the branch condition is an 5484 // instruction contained in the loop that is only used by the branch, it is 5485 // uniform. 5486 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5487 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5488 addToWorklistIfAllowed(Cmp); 5489 5490 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5491 InstWidening WideningDecision = getWideningDecision(I, VF); 5492 assert(WideningDecision != CM_Unknown && 5493 "Widening decision should be ready at this moment"); 5494 5495 // A uniform memory op is itself uniform. We exclude uniform stores 5496 // here as they demand the last lane, not the first one. 
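// For illustration (assumed access pattern): a load from a loop-invariant
// address such as `x = *p` only needs lane 0 of each vector iteration and is
// treated as uniform, while a store `*p = y` is excluded because its final
// value must come from the last active lane.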
5497 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5498 assert(WideningDecision == CM_Scalarize); 5499 return true; 5500 } 5501 5502 return (WideningDecision == CM_Widen || 5503 WideningDecision == CM_Widen_Reverse || 5504 WideningDecision == CM_Interleave); 5505 }; 5506 5507 5508 // Returns true if Ptr is the pointer operand of a memory access instruction 5509 // I, and I is known to not require scalarization. 5510 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5511 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5512 }; 5513 5514 // Holds a list of values which are known to have at least one uniform use. 5515 // Note that there may be other uses which aren't uniform. A "uniform use" 5516 // here is something which only demands lane 0 of the unrolled iterations; 5517 // it does not imply that all lanes produce the same value (e.g. this is not 5518 // the usual meaning of uniform) 5519 SetVector<Value *> HasUniformUse; 5520 5521 // Scan the loop for instructions which are either a) known to have only 5522 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5523 for (auto *BB : TheLoop->blocks()) 5524 for (auto &I : *BB) { 5525 // If there's no pointer operand, there's nothing to do. 5526 auto *Ptr = getLoadStorePointerOperand(&I); 5527 if (!Ptr) 5528 continue; 5529 5530 // A uniform memory op is itself uniform. We exclude uniform stores 5531 // here as they demand the last lane, not the first one. 5532 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5533 addToWorklistIfAllowed(&I); 5534 5535 if (isUniformDecision(&I, VF)) { 5536 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5537 HasUniformUse.insert(Ptr); 5538 } 5539 } 5540 5541 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5542 // demanding) users. Since loops are assumed to be in LCSSA form, this 5543 // disallows uses outside the loop as well. 5544 for (auto *V : HasUniformUse) { 5545 if (isOutOfScope(V)) 5546 continue; 5547 auto *I = cast<Instruction>(V); 5548 auto UsersAreMemAccesses = 5549 llvm::all_of(I->users(), [&](User *U) -> bool { 5550 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5551 }); 5552 if (UsersAreMemAccesses) 5553 addToWorklistIfAllowed(I); 5554 } 5555 5556 // Expand Worklist in topological order: whenever a new instruction 5557 // is added , its users should be already inside Worklist. It ensures 5558 // a uniform instruction will only be used by uniform instructions. 5559 unsigned idx = 0; 5560 while (idx != Worklist.size()) { 5561 Instruction *I = Worklist[idx++]; 5562 5563 for (auto OV : I->operand_values()) { 5564 // isOutOfScope operands cannot be uniform instructions. 5565 if (isOutOfScope(OV)) 5566 continue; 5567 // First order recurrence Phi's should typically be considered 5568 // non-uniform. 5569 auto *OP = dyn_cast<PHINode>(OV); 5570 if (OP && Legal->isFirstOrderRecurrence(OP)) 5571 continue; 5572 // If all the users of the operand are uniform, then add the 5573 // operand into the uniform worklist. 5574 auto *OI = cast<Instruction>(OV); 5575 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5576 auto *J = cast<Instruction>(U); 5577 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5578 })) 5579 addToWorklistIfAllowed(OI); 5580 } 5581 } 5582 5583 // For an instruction to be added into Worklist above, all its users inside 5584 // the loop should also be in Worklist. 
However, this condition cannot be 5585 // true for phi nodes that form a cyclic dependence. We must process phi 5586 // nodes separately. An induction variable will remain uniform if all users 5587 // of the induction variable and induction variable update remain uniform. 5588 // The code below handles both pointer and non-pointer induction variables. 5589 for (auto &Induction : Legal->getInductionVars()) { 5590 auto *Ind = Induction.first; 5591 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5592 5593 // Determine if all users of the induction variable are uniform after 5594 // vectorization. 5595 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5596 auto *I = cast<Instruction>(U); 5597 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5598 isVectorizedMemAccessUse(I, Ind); 5599 }); 5600 if (!UniformInd) 5601 continue; 5602 5603 // Determine if all users of the induction variable update instruction are 5604 // uniform after vectorization. 5605 auto UniformIndUpdate = 5606 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5607 auto *I = cast<Instruction>(U); 5608 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5609 isVectorizedMemAccessUse(I, IndUpdate); 5610 }); 5611 if (!UniformIndUpdate) 5612 continue; 5613 5614 // The induction variable and its update instruction will remain uniform. 5615 addToWorklistIfAllowed(Ind); 5616 addToWorklistIfAllowed(IndUpdate); 5617 } 5618 5619 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5620 } 5621 5622 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5623 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5624 5625 if (Legal->getRuntimePointerChecking()->Need) { 5626 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5627 "runtime pointer checks needed. Enable vectorization of this " 5628 "loop with '#pragma clang loop vectorize(enable)' when " 5629 "compiling with -Os/-Oz", 5630 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5631 return true; 5632 } 5633 5634 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5635 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5636 "runtime SCEV checks needed. Enable vectorization of this " 5637 "loop with '#pragma clang loop vectorize(enable)' when " 5638 "compiling with -Os/-Oz", 5639 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5640 return true; 5641 } 5642 5643 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5644 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5645 reportVectorizationFailure("Runtime stride check for small trip count", 5646 "runtime stride == 1 checks needed. 
Enable vectorization of " 5647 "this loop without such check by compiling with -Os/-Oz", 5648 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5649 return true; 5650 } 5651 5652 return false; 5653 } 5654 5655 ElementCount 5656 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5657 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5658 reportVectorizationInfo( 5659 "Disabling scalable vectorization, because target does not " 5660 "support scalable vectors.", 5661 "ScalableVectorsUnsupported", ORE, TheLoop); 5662 return ElementCount::getScalable(0); 5663 } 5664 5665 if (Hints->isScalableVectorizationDisabled()) { 5666 reportVectorizationInfo("Scalable vectorization is explicitly disabled", 5667 "ScalableVectorizationDisabled", ORE, TheLoop); 5668 return ElementCount::getScalable(0); 5669 } 5670 5671 auto MaxScalableVF = ElementCount::getScalable( 5672 std::numeric_limits<ElementCount::ScalarTy>::max()); 5673 5674 // Disable scalable vectorization if the loop contains unsupported reductions. 5675 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5676 // FIXME: While for scalable vectors this is currently sufficient, this should 5677 // be replaced by a more detailed mechanism that filters out specific VFs, 5678 // instead of invalidating vectorization for a whole set of VFs based on the 5679 // MaxVF. 5680 if (!canVectorizeReductions(MaxScalableVF)) { 5681 reportVectorizationInfo( 5682 "Scalable vectorization not supported for the reduction " 5683 "operations found in this loop.", 5684 "ScalableVFUnfeasible", ORE, TheLoop); 5685 return ElementCount::getScalable(0); 5686 } 5687 5688 if (Legal->isSafeForAnyVectorWidth()) 5689 return MaxScalableVF; 5690 5691 // Limit MaxScalableVF by the maximum safe dependence distance. 5692 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5693 MaxScalableVF = ElementCount::getScalable( 5694 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5695 if (!MaxScalableVF) 5696 reportVectorizationInfo( 5697 "Max legal vector width too small, scalable vectorization " 5698 "unfeasible.", 5699 "ScalableVFUnfeasible", ORE, TheLoop); 5700 5701 return MaxScalableVF; 5702 } 5703 5704 FixedScalableVFPair 5705 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5706 ElementCount UserVF) { 5707 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5708 unsigned SmallestType, WidestType; 5709 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5710 5711 // Get the maximum safe dependence distance in bits computed by LAA. 5712 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5713 // the memory accesses that is most restrictive (involved in the smallest 5714 // dependence distance). 5715 unsigned MaxSafeElements = 5716 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5717 5718 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5719 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5720 5721 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5722 << ".\n"); 5723 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5724 << ".\n"); 5725 5726 // First analyze the UserVF, fall back if the UserVF should be ignored. 5727 if (UserVF) { 5728 auto MaxSafeUserVF = 5729 UserVF.isScalable() ? 
MaxSafeScalableVF : MaxSafeFixedVF; 5730 5731 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) 5732 return UserVF; 5733 5734 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5735 5736 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5737 // is better to ignore the hint and let the compiler choose a suitable VF. 5738 if (!UserVF.isScalable()) { 5739 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5740 << " is unsafe, clamping to max safe VF=" 5741 << MaxSafeFixedVF << ".\n"); 5742 ORE->emit([&]() { 5743 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5744 TheLoop->getStartLoc(), 5745 TheLoop->getHeader()) 5746 << "User-specified vectorization factor " 5747 << ore::NV("UserVectorizationFactor", UserVF) 5748 << " is unsafe, clamping to maximum safe vectorization factor " 5749 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5750 }); 5751 return MaxSafeFixedVF; 5752 } 5753 5754 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5755 << " is unsafe. Ignoring scalable UserVF.\n"); 5756 ORE->emit([&]() { 5757 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5758 TheLoop->getStartLoc(), 5759 TheLoop->getHeader()) 5760 << "User-specified vectorization factor " 5761 << ore::NV("UserVectorizationFactor", UserVF) 5762 << " is unsafe. Ignoring the hint to let the compiler pick a " 5763 "suitable VF."; 5764 }); 5765 } 5766 5767 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5768 << " / " << WidestType << " bits.\n"); 5769 5770 FixedScalableVFPair Result(ElementCount::getFixed(1), 5771 ElementCount::getScalable(0)); 5772 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5773 WidestType, MaxSafeFixedVF)) 5774 Result.FixedVF = MaxVF; 5775 5776 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5777 WidestType, MaxSafeScalableVF)) 5778 if (MaxVF.isScalable()) { 5779 Result.ScalableVF = MaxVF; 5780 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5781 << "\n"); 5782 } 5783 5784 return Result; 5785 } 5786 5787 FixedScalableVFPair 5788 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5789 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5790 // TODO: It may by useful to do since it's still likely to be dynamically 5791 // uniform if the target can skip. 5792 reportVectorizationFailure( 5793 "Not inserting runtime ptr check for divergent target", 5794 "runtime pointer checks needed. 
Not enabled for divergent target",
5795 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5796 return FixedScalableVFPair::getNone();
5797 }
5798
5799 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5800 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5801 if (TC == 1) {
5802 reportVectorizationFailure("Single iteration (non) loop",
5803 "loop trip count is one, irrelevant for vectorization",
5804 "SingleIterationLoop", ORE, TheLoop);
5805 return FixedScalableVFPair::getNone();
5806 }
5807
5808 switch (ScalarEpilogueStatus) {
5809 case CM_ScalarEpilogueAllowed:
5810 return computeFeasibleMaxVF(TC, UserVF);
5811 case CM_ScalarEpilogueNotAllowedUsePredicate:
5812 LLVM_FALLTHROUGH;
5813 case CM_ScalarEpilogueNotNeededUsePredicate:
5814 LLVM_DEBUG(
5815 dbgs() << "LV: vector predicate hint/switch found.\n"
5816 << "LV: Not allowing scalar epilogue, creating predicated "
5817 << "vector loop.\n");
5818 break;
5819 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5820 // fallthrough as a special case of OptForSize
5821 case CM_ScalarEpilogueNotAllowedOptSize:
5822 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5823 LLVM_DEBUG(
5824 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5825 else
5826 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5827 << "count.\n");
5828
5829 // Bail if runtime checks are required, which are not good when optimizing
5830 // for size.
5831 if (runtimeChecksRequired())
5832 return FixedScalableVFPair::getNone();
5833
5834 break;
5835 }
5836
5837 // The only loops we can vectorize without a scalar epilogue are loops with
5838 // a bottom-test and a single exiting block. We'd have to handle the fact
5839 // that not every instruction executes on the last iteration. This will
5840 // require a lane mask which varies through the vector loop body. (TODO)
5841 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5842 // If there was a tail-folding hint/switch, but we can't fold the tail by
5843 // masking, fall back to a vectorization with a scalar epilogue.
5844 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5845 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5846 "scalar epilogue instead.\n");
5847 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5848 return computeFeasibleMaxVF(TC, UserVF);
5849 }
5850 return FixedScalableVFPair::getNone();
5851 }
5852
5853 // Now try tail folding.
5854
5855 // Invalidate interleave groups that require an epilogue if we can't mask
5856 // the interleave-group.
5857 if (!useMaskedInterleavedAccesses(TTI)) {
5858 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5859 "No decisions should have been taken at this point");
5860 // Note: There is no need to invalidate any cost modeling decisions here, as
5861 // none were taken so far.
5862 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5863 }
5864
5865 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5866 // Avoid tail folding if the trip count is known to be a multiple of any VF
5867 // we chose.
5868 // FIXME: The condition below pessimizes the case for fixed-width vectors,
5869 // when scalable VFs are also candidates for vectorization.
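// Worked example with assumed numbers: for a known trip count of 128,
// MaxFixedVF = 8 and no user interleave count, MaxVFtimesIC is 8 and
// 128 urem 8 == 0, so no tail remains and tail folding is not needed.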
5870 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) { 5871 ElementCount MaxFixedVF = MaxFactors.FixedVF; 5872 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) && 5873 "MaxFixedVF must be a power of 2"); 5874 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC 5875 : MaxFixedVF.getFixedValue(); 5876 ScalarEvolution *SE = PSE.getSE(); 5877 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5878 const SCEV *ExitCount = SE->getAddExpr( 5879 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5880 const SCEV *Rem = SE->getURemExpr( 5881 SE->applyLoopGuards(ExitCount, TheLoop), 5882 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5883 if (Rem->isZero()) { 5884 // Accept MaxFixedVF if we do not have a tail. 5885 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5886 return MaxFactors; 5887 } 5888 } 5889 5890 // If we don't know the precise trip count, or if the trip count that we 5891 // found modulo the vectorization factor is not zero, try to fold the tail 5892 // by masking. 5893 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5894 if (Legal->prepareToFoldTailByMasking()) { 5895 FoldTailByMasking = true; 5896 return MaxFactors; 5897 } 5898 5899 // If there was a tail-folding hint/switch, but we can't fold the tail by 5900 // masking, fallback to a vectorization with a scalar epilogue. 5901 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5902 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5903 "scalar epilogue instead.\n"); 5904 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5905 return MaxFactors; 5906 } 5907 5908 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5909 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5910 return FixedScalableVFPair::getNone(); 5911 } 5912 5913 if (TC == 0) { 5914 reportVectorizationFailure( 5915 "Unable to calculate the loop count due to complex control flow", 5916 "unable to calculate the loop count due to complex control flow", 5917 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5918 return FixedScalableVFPair::getNone(); 5919 } 5920 5921 reportVectorizationFailure( 5922 "Cannot optimize for size and vectorize at the same time.", 5923 "cannot optimize for size and vectorize at the same time. " 5924 "Enable vectorization of this loop with '#pragma clang loop " 5925 "vectorize(enable)' when compiling with -Os/-Oz", 5926 "NoTailLoopWithOptForSize", ORE, TheLoop); 5927 return FixedScalableVFPair::getNone(); 5928 } 5929 5930 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5931 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5932 const ElementCount &MaxSafeVF) { 5933 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5934 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5935 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5936 : TargetTransformInfo::RGK_FixedWidthVector); 5937 5938 // Convenience function to return the minimum of two ElementCounts. 5939 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5940 assert((LHS.isScalable() == RHS.isScalable()) && 5941 "Scalable flags must match"); 5942 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5943 }; 5944 5945 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5946 // Note that both WidestRegister and WidestType may not be a powers of 2. 
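// Assumed-numbers illustration: with a 128-bit widest register and a widest
// element type of 32 bits, PowerOf2Floor(128 / 32) yields 4 lanes; MinVF
// below then caps this by the dependence-distance bound in MaxSafeVF.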
5947 auto MaxVectorElementCount = ElementCount::get( 5948 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5949 ComputeScalableMaxVF); 5950 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5951 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5952 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5953 5954 if (!MaxVectorElementCount) { 5955 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5956 return ElementCount::getFixed(1); 5957 } 5958 5959 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5960 if (ConstTripCount && 5961 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5962 isPowerOf2_32(ConstTripCount)) { 5963 // We need to clamp the VF to be the ConstTripCount. There is no point in 5964 // choosing a higher viable VF as done in the loop below. If 5965 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5966 // the TC is less than or equal to the known number of lanes. 5967 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5968 << ConstTripCount << "\n"); 5969 return TripCountEC; 5970 } 5971 5972 ElementCount MaxVF = MaxVectorElementCount; 5973 if (TTI.shouldMaximizeVectorBandwidth() || 5974 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5975 auto MaxVectorElementCountMaxBW = ElementCount::get( 5976 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5977 ComputeScalableMaxVF); 5978 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5979 5980 // Collect all viable vectorization factors larger than the default MaxVF 5981 // (i.e. MaxVectorElementCount). 5982 SmallVector<ElementCount, 8> VFs; 5983 for (ElementCount VS = MaxVectorElementCount * 2; 5984 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5985 VFs.push_back(VS); 5986 5987 // For each VF calculate its register usage. 5988 auto RUs = calculateRegisterUsage(VFs); 5989 5990 // Select the largest VF which doesn't require more registers than existing 5991 // ones. 5992 for (int i = RUs.size() - 1; i >= 0; --i) { 5993 bool Selected = true; 5994 for (auto &pair : RUs[i].MaxLocalUsers) { 5995 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5996 if (pair.second > TargetNumRegisters) 5997 Selected = false; 5998 } 5999 if (Selected) { 6000 MaxVF = VFs[i]; 6001 break; 6002 } 6003 } 6004 if (ElementCount MinVF = 6005 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 6006 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 6007 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 6008 << ") with target's minimum: " << MinVF << '\n'); 6009 MaxVF = MinVF; 6010 } 6011 } 6012 } 6013 return MaxVF; 6014 } 6015 6016 bool LoopVectorizationCostModel::isMoreProfitable( 6017 const VectorizationFactor &A, const VectorizationFactor &B) const { 6018 InstructionCost::CostType CostA = *A.Cost.getValue(); 6019 InstructionCost::CostType CostB = *B.Cost.getValue(); 6020 6021 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 6022 6023 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 6024 MaxTripCount) { 6025 // If we are folding the tail and the trip count is a known (possibly small) 6026 // constant, the trip count will be rounded up to an integer number of 6027 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6028 // which we compare directly. 
When not folding the tail, the total cost will 6029 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 6030 // approximated with the per-lane cost below instead of using the tripcount 6031 // as here. 6032 int64_t RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6033 int64_t RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6034 return RTCostA < RTCostB; 6035 } 6036 6037 // To avoid the need for FP division: 6038 // (CostA / A.Width) < (CostB / B.Width) 6039 // <=> (CostA * B.Width) < (CostB * A.Width) 6040 return (CostA * B.Width.getKnownMinValue()) < 6041 (CostB * A.Width.getKnownMinValue()); 6042 } 6043 6044 VectorizationFactor 6045 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 6046 // FIXME: This can be fixed for scalable vectors later, because at this stage 6047 // the LoopVectorizer will only consider vectorizing a loop with scalable 6048 // vectors when the loop has a hint to enable vectorization for a given VF. 6049 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 6050 6051 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6052 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6053 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6054 6055 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6056 VectorizationFactor ChosenFactor = ScalarCost; 6057 6058 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6059 if (ForceVectorization && MaxVF.isVector()) { 6060 // Ignore scalar width, because the user explicitly wants vectorization. 6061 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6062 // evaluation. 6063 ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max(); 6064 } 6065 6066 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 6067 i *= 2) { 6068 // Notice that the vector loop needs to be executed less times, so 6069 // we need to divide the cost of the vector loops by the width of 6070 // the vector elements. 6071 VectorizationCostTy C = expectedCost(i); 6072 6073 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 6074 VectorizationFactor Candidate(i, C.first); 6075 LLVM_DEBUG( 6076 dbgs() << "LV: Vector loop of width " << i << " costs: " 6077 << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue()) 6078 << ".\n"); 6079 6080 if (!C.second && !ForceVectorization) { 6081 LLVM_DEBUG( 6082 dbgs() << "LV: Not considering vector loop of width " << i 6083 << " because it will not generate any vector instructions.\n"); 6084 continue; 6085 } 6086 6087 // If profitable add it to ProfitableVF list. 
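// Sketch of the comparison with assumed costs: a candidate of width 4 and
// per-iteration cost 30 beats a scalar cost of 10 because 30 * 1 < 10 * 4,
// i.e. it is cheaper per scalar iteration (isMoreProfitable above also has a
// tail-folded variant that uses the rounded-up trip count instead).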
6088 if (isMoreProfitable(Candidate, ScalarCost)) 6089 ProfitableVFs.push_back(Candidate); 6090 6091 if (isMoreProfitable(Candidate, ChosenFactor)) 6092 ChosenFactor = Candidate; 6093 } 6094 6095 if (!EnableCondStoresVectorization && NumPredStores) { 6096 reportVectorizationFailure("There are conditional stores.", 6097 "store that is conditionally executed prevents vectorization", 6098 "ConditionalStore", ORE, TheLoop); 6099 ChosenFactor = ScalarCost; 6100 } 6101 6102 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6103 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue()) 6104 dbgs() 6105 << "LV: Vectorization seems to be not beneficial, " 6106 << "but was forced by a user.\n"); 6107 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6108 return ChosenFactor; 6109 } 6110 6111 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6112 const Loop &L, ElementCount VF) const { 6113 // Cross iteration phis such as reductions need special handling and are 6114 // currently unsupported. 6115 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6116 return Legal->isFirstOrderRecurrence(&Phi) || 6117 Legal->isReductionVariable(&Phi); 6118 })) 6119 return false; 6120 6121 // Phis with uses outside of the loop require special handling and are 6122 // currently unsupported. 6123 for (auto &Entry : Legal->getInductionVars()) { 6124 // Look for uses of the value of the induction at the last iteration. 6125 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6126 for (User *U : PostInc->users()) 6127 if (!L.contains(cast<Instruction>(U))) 6128 return false; 6129 // Look for uses of penultimate value of the induction. 6130 for (User *U : Entry.first->users()) 6131 if (!L.contains(cast<Instruction>(U))) 6132 return false; 6133 } 6134 6135 // Induction variables that are widened require special handling that is 6136 // currently not supported. 6137 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6138 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6139 this->isProfitableToScalarize(Entry.first, VF)); 6140 })) 6141 return false; 6142 6143 return true; 6144 } 6145 6146 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6147 const ElementCount VF) const { 6148 // FIXME: We need a much better cost-model to take different parameters such 6149 // as register pressure, code size increase and cost of extra branches into 6150 // account. For now we apply a very crude heuristic and only consider loops 6151 // with vectorization factors larger than a certain value. 6152 // We also consider epilogue vectorization unprofitable for targets that don't 6153 // consider interleaving beneficial (eg. MVE). 
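// For example (assuming EpilogueVectorizationMinVF keeps its default of 16):
// a main loop VF of 8 is not considered worth an epilogue vector loop, and a
// target whose max interleave factor is 1 is rejected regardless of VF.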
6154 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6155 return false; 6156 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6157 return true; 6158 return false; 6159 } 6160 6161 VectorizationFactor 6162 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6163 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6164 VectorizationFactor Result = VectorizationFactor::Disabled(); 6165 if (!EnableEpilogueVectorization) { 6166 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6167 return Result; 6168 } 6169 6170 if (!isScalarEpilogueAllowed()) { 6171 LLVM_DEBUG( 6172 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6173 "allowed.\n";); 6174 return Result; 6175 } 6176 6177 // FIXME: This can be fixed for scalable vectors later, because at this stage 6178 // the LoopVectorizer will only consider vectorizing a loop with scalable 6179 // vectors when the loop has a hint to enable vectorization for a given VF. 6180 if (MainLoopVF.isScalable()) { 6181 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6182 "yet supported.\n"); 6183 return Result; 6184 } 6185 6186 // Not really a cost consideration, but check for unsupported cases here to 6187 // simplify the logic. 6188 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6189 LLVM_DEBUG( 6190 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6191 "not a supported candidate.\n";); 6192 return Result; 6193 } 6194 6195 if (EpilogueVectorizationForceVF > 1) { 6196 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6197 if (LVP.hasPlanWithVFs( 6198 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6199 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6200 else { 6201 LLVM_DEBUG( 6202 dbgs() 6203 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6204 return Result; 6205 } 6206 } 6207 6208 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6209 TheLoop->getHeader()->getParent()->hasMinSize()) { 6210 LLVM_DEBUG( 6211 dbgs() 6212 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6213 return Result; 6214 } 6215 6216 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6217 return Result; 6218 6219 for (auto &NextVF : ProfitableVFs) 6220 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6221 (Result.Width.getFixedValue() == 1 || 6222 isMoreProfitable(NextVF, Result)) && 6223 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6224 Result = NextVF; 6225 6226 if (Result != VectorizationFactor::Disabled()) 6227 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6228 << Result.Width.getFixedValue() << "\n";); 6229 return Result; 6230 } 6231 6232 std::pair<unsigned, unsigned> 6233 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6234 unsigned MinWidth = -1U; 6235 unsigned MaxWidth = 8; 6236 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6237 6238 // For each block. 6239 for (BasicBlock *BB : TheLoop->blocks()) { 6240 // For each instruction in the loop. 6241 for (Instruction &I : BB->instructionsWithoutDebug()) { 6242 Type *T = I.getType(); 6243 6244 // Skip ignored values. 6245 if (ValuesToIgnore.count(&I)) 6246 continue; 6247 6248 // Only examine Loads, Stores and PHINodes. 6249 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6250 continue; 6251 6252 // Examine PHI nodes that are reduction variables. Update the type to 6253 // account for the recurrence type. 
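// Hypothetical example: for an add reduction whose recurrence type has been
// narrowed to i8 (the accumulator's upper bits are never observed), T becomes
// i8 here, which can later permit a wider maximum VF than the phi's declared
// i32 type would suggest.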
6254 if (auto *PN = dyn_cast<PHINode>(&I)) { 6255 if (!Legal->isReductionVariable(PN)) 6256 continue; 6257 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6258 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6259 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6260 RdxDesc.getRecurrenceType(), 6261 TargetTransformInfo::ReductionFlags())) 6262 continue; 6263 T = RdxDesc.getRecurrenceType(); 6264 } 6265 6266 // Examine the stored values. 6267 if (auto *ST = dyn_cast<StoreInst>(&I)) 6268 T = ST->getValueOperand()->getType(); 6269 6270 // Ignore loaded pointer types and stored pointer types that are not 6271 // vectorizable. 6272 // 6273 // FIXME: The check here attempts to predict whether a load or store will 6274 // be vectorized. We only know this for certain after a VF has 6275 // been selected. Here, we assume that if an access can be 6276 // vectorized, it will be. We should also look at extending this 6277 // optimization to non-pointer types. 6278 // 6279 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6280 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6281 continue; 6282 6283 MinWidth = std::min(MinWidth, 6284 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6285 MaxWidth = std::max(MaxWidth, 6286 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6287 } 6288 } 6289 6290 return {MinWidth, MaxWidth}; 6291 } 6292 6293 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6294 unsigned LoopCost) { 6295 // -- The interleave heuristics -- 6296 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6297 // There are many micro-architectural considerations that we can't predict 6298 // at this level. For example, frontend pressure (on decode or fetch) due to 6299 // code size, or the number and capabilities of the execution ports. 6300 // 6301 // We use the following heuristics to select the interleave count: 6302 // 1. If the code has reductions, then we interleave to break the cross 6303 // iteration dependency. 6304 // 2. If the loop is really small, then we interleave to reduce the loop 6305 // overhead. 6306 // 3. We don't interleave if we think that we will spill registers to memory 6307 // due to the increased register pressure. 6308 6309 if (!isScalarEpilogueAllowed()) 6310 return 1; 6311 6312 // We used the distance for the interleave count. 6313 if (Legal->getMaxSafeDepDistBytes() != -1U) 6314 return 1; 6315 6316 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6317 const bool HasReductions = !Legal->getReductionVars().empty(); 6318 // Do not interleave loops with a relatively small known or estimated trip 6319 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6320 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6321 // because with the above conditions interleaving can expose ILP and break 6322 // cross iteration dependences for reductions. 6323 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6324 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6325 return 1; 6326 6327 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6328 // We divide by these constants so assume that we have at least one 6329 // instruction that uses at least one register. 6330 for (auto& pair : R.MaxLocalUsers) { 6331 pair.second = std::max(pair.second, 1U); 6332 } 6333 6334 // We calculate the interleave count using the following formula. 
6335 // Subtract the number of loop invariants from the number of available 6336 // registers. These registers are used by all of the interleaved instances. 6337 // Next, divide the remaining registers by the number of registers that is 6338 // required by the loop, in order to estimate how many parallel instances 6339 // fit without causing spills. All of this is rounded down if necessary to be 6340 // a power of two. We want power of two interleave count to simplify any 6341 // addressing operations or alignment considerations. 6342 // We also want power of two interleave counts to ensure that the induction 6343 // variable of the vector loop wraps to zero, when tail is folded by masking; 6344 // this currently happens when OptForSize, in which case IC is set to 1 above. 6345 unsigned IC = UINT_MAX; 6346 6347 for (auto& pair : R.MaxLocalUsers) { 6348 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6349 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6350 << " registers of " 6351 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6352 if (VF.isScalar()) { 6353 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6354 TargetNumRegisters = ForceTargetNumScalarRegs; 6355 } else { 6356 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6357 TargetNumRegisters = ForceTargetNumVectorRegs; 6358 } 6359 unsigned MaxLocalUsers = pair.second; 6360 unsigned LoopInvariantRegs = 0; 6361 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6362 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6363 6364 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6365 // Don't count the induction variable as interleaved. 6366 if (EnableIndVarRegisterHeur) { 6367 TmpIC = 6368 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6369 std::max(1U, (MaxLocalUsers - 1))); 6370 } 6371 6372 IC = std::min(IC, TmpIC); 6373 } 6374 6375 // Clamp the interleave ranges to reasonable counts. 6376 unsigned MaxInterleaveCount = 6377 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6378 6379 // Check if the user has overridden the max. 6380 if (VF.isScalar()) { 6381 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6382 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6383 } else { 6384 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6385 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6386 } 6387 6388 // If trip count is known or estimated compile time constant, limit the 6389 // interleave count to be less than the trip count divided by VF, provided it 6390 // is at least 1. 6391 // 6392 // For scalable vectors we can't know if interleaving is beneficial. It may 6393 // not be beneficial for small loops if none of the lanes in the second vector 6394 // iterations is enabled. However, for larger loops, there is likely to be a 6395 // similar benefit as for fixed-width vectors. For now, we choose to leave 6396 // the InterleaveCount as if vscale is '1', although if some information about 6397 // the vector is known (e.g. min vector size), we can make a better decision. 6398 if (BestKnownTC) { 6399 MaxInterleaveCount = 6400 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6401 // Make sure MaxInterleaveCount is greater than 0. 
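// (Assumed-numbers illustration of the clamp above: with an estimated trip
// count of 20 and VF = 4, the interleave count is limited to 20 / 4 = 5 so
// interleaving does not overshoot the remaining iterations.)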
6402 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6403 } 6404 6405 assert(MaxInterleaveCount > 0 && 6406 "Maximum interleave count must be greater than 0"); 6407 6408 // Clamp the calculated IC to be between the 1 and the max interleave count 6409 // that the target and trip count allows. 6410 if (IC > MaxInterleaveCount) 6411 IC = MaxInterleaveCount; 6412 else 6413 // Make sure IC is greater than 0. 6414 IC = std::max(1u, IC); 6415 6416 assert(IC > 0 && "Interleave count must be greater than 0."); 6417 6418 // If we did not calculate the cost for VF (because the user selected the VF) 6419 // then we calculate the cost of VF here. 6420 if (LoopCost == 0) { 6421 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6422 LoopCost = *expectedCost(VF).first.getValue(); 6423 } 6424 6425 assert(LoopCost && "Non-zero loop cost expected"); 6426 6427 // Interleave if we vectorized this loop and there is a reduction that could 6428 // benefit from interleaving. 6429 if (VF.isVector() && HasReductions) { 6430 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6431 return IC; 6432 } 6433 6434 // Note that if we've already vectorized the loop we will have done the 6435 // runtime check and so interleaving won't require further checks. 6436 bool InterleavingRequiresRuntimePointerCheck = 6437 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6438 6439 // We want to interleave small loops in order to reduce the loop overhead and 6440 // potentially expose ILP opportunities. 6441 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6442 << "LV: IC is " << IC << '\n' 6443 << "LV: VF is " << VF << '\n'); 6444 const bool AggressivelyInterleaveReductions = 6445 TTI.enableAggressiveInterleaving(HasReductions); 6446 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6447 // We assume that the cost overhead is 1 and we use the cost model 6448 // to estimate the cost of the loop and interleave until the cost of the 6449 // loop overhead is about 5% of the cost of the loop. 6450 unsigned SmallIC = 6451 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6452 6453 // Interleave until store/load ports (estimated by max interleave count) are 6454 // saturated. 6455 unsigned NumStores = Legal->getNumStores(); 6456 unsigned NumLoads = Legal->getNumLoads(); 6457 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6458 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6459 6460 // If we have a scalar reduction (vector reductions are already dealt with 6461 // by this point), we can increase the critical path length if the loop 6462 // we're interleaving is inside another loop. Limit, by default to 2, so the 6463 // critical path only gets increased by one reduction operation. 6464 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6465 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6466 SmallIC = std::min(SmallIC, F); 6467 StoresIC = std::min(StoresIC, F); 6468 LoadsIC = std::min(LoadsIC, F); 6469 } 6470 6471 if (EnableLoadStoreRuntimeInterleave && 6472 std::max(StoresIC, LoadsIC) > SmallIC) { 6473 LLVM_DEBUG( 6474 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6475 return std::max(StoresIC, LoadsIC); 6476 } 6477 6478 // If there are scalar reductions and TTI has enabled aggressive 6479 // interleaving for reductions, we will interleave to expose ILP. 
6480 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6481 AggressivelyInterleaveReductions) {
6482 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6483 // Interleave no less than SmallIC but not as aggressively as the normal IC
6484 // to handle the rare situation where resources are too limited.
6485 return std::max(IC / 2, SmallIC);
6486 } else {
6487 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6488 return SmallIC;
6489 }
6490 }
6491
6492 // Interleave if this is a large loop (small loops are already dealt with by
6493 // this point) that could benefit from interleaving.
6494 if (AggressivelyInterleaveReductions) {
6495 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6496 return IC;
6497 }
6498
6499 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6500 return 1;
6501 }
6502
6503 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6504 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6505 // This function calculates the register usage by measuring the highest number
6506 // of values that are alive at a single location. Obviously, this is a very
6507 // rough estimation. We scan the loop in topological order and
6508 // assign a number to each instruction. We use RPO to ensure that defs are
6509 // met before their users. We assume that each instruction that has in-loop
6510 // users starts an interval. We record every time that an in-loop value is
6511 // used, so we have a list of the first and last occurrences of each
6512 // instruction. Next, we transpose this data structure into a multi-map that
6513 // holds the list of intervals that *end* at a specific location. This
6514 // multi-map allows us to perform a linear search. We scan the instructions linearly
6515 // and record each time that a new interval starts, by placing it in a set.
6516 // If we find this value in the multi-map then we remove it from the set.
6517 // The max register usage is the maximum size of the set.
6518 // We also search for instructions that are defined outside the loop, but are
6519 // used inside the loop. We need this number separately from the max-interval
6520 // usage number because when we unroll, loop-invariant values do not take
6521 // more registers.
6522 LoopBlocksDFS DFS(TheLoop);
6523 DFS.perform(LI);
6524
6525 RegisterUsage RU;
6526
6527 // Each 'key' in the map opens a new interval. The values
6528 // of the map are the index of the 'last seen' usage of the
6529 // instruction that is the key.
6530 using IntervalMap = DenseMap<Instruction *, unsigned>;
6531
6532 // Maps instruction to its index.
6533 SmallVector<Instruction *, 64> IdxToInstr;
6534 // Marks the end of each interval.
6535 IntervalMap EndPoint;
6536 // Saves the list of instruction indices that are used in the loop.
6537 SmallPtrSet<Instruction *, 8> Ends;
6538 // Saves the list of values that are used in the loop but are
6539 // defined outside the loop, such as arguments and constants.
6540 SmallPtrSet<Value *, 8> LoopInvariants;
6541
6542 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6543 for (Instruction &I : BB->instructionsWithoutDebug()) {
6544 IdxToInstr.push_back(&I);
6545
6546 // Save the end location of each USE.
6547 for (Value *U : I.operands()) {
6548 auto *Instr = dyn_cast<Instruction>(U);
6549
6550 // Ignore non-instruction values such as arguments, constants, etc.
6551 if (!Instr)
6552 continue;
6553
6554 // If this instruction is outside the loop then record it and continue.
6555 if (!TheLoop->contains(Instr)) { 6556 LoopInvariants.insert(Instr); 6557 continue; 6558 } 6559 6560 // Overwrite previous end points. 6561 EndPoint[Instr] = IdxToInstr.size(); 6562 Ends.insert(Instr); 6563 } 6564 } 6565 } 6566 6567 // Saves the list of intervals that end with the index in 'key'. 6568 using InstrList = SmallVector<Instruction *, 2>; 6569 DenseMap<unsigned, InstrList> TransposeEnds; 6570 6571 // Transpose the EndPoints to a list of values that end at each index. 6572 for (auto &Interval : EndPoint) 6573 TransposeEnds[Interval.second].push_back(Interval.first); 6574 6575 SmallPtrSet<Instruction *, 8> OpenIntervals; 6576 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6577 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6578 6579 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6580 6581 // A lambda that gets the register usage for the given type and VF. 6582 const auto &TTICapture = TTI; 6583 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6584 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6585 return 0; 6586 return *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue(); 6587 }; 6588 6589 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6590 Instruction *I = IdxToInstr[i]; 6591 6592 // Remove all of the instructions that end at this location. 6593 InstrList &List = TransposeEnds[i]; 6594 for (Instruction *ToRemove : List) 6595 OpenIntervals.erase(ToRemove); 6596 6597 // Ignore instructions that are never used within the loop. 6598 if (!Ends.count(I)) 6599 continue; 6600 6601 // Skip ignored values. 6602 if (ValuesToIgnore.count(I)) 6603 continue; 6604 6605 // For each VF find the maximum usage of registers. 6606 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6607 // Count the number of live intervals. 6608 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6609 6610 if (VFs[j].isScalar()) { 6611 for (auto Inst : OpenIntervals) { 6612 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6613 if (RegUsage.find(ClassID) == RegUsage.end()) 6614 RegUsage[ClassID] = 1; 6615 else 6616 RegUsage[ClassID] += 1; 6617 } 6618 } else { 6619 collectUniformsAndScalars(VFs[j]); 6620 for (auto Inst : OpenIntervals) { 6621 // Skip ignored values for VF > 1. 6622 if (VecValuesToIgnore.count(Inst)) 6623 continue; 6624 if (isScalarAfterVectorization(Inst, VFs[j])) { 6625 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6626 if (RegUsage.find(ClassID) == RegUsage.end()) 6627 RegUsage[ClassID] = 1; 6628 else 6629 RegUsage[ClassID] += 1; 6630 } else { 6631 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6632 if (RegUsage.find(ClassID) == RegUsage.end()) 6633 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6634 else 6635 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6636 } 6637 } 6638 } 6639 6640 for (auto& pair : RegUsage) { 6641 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6642 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6643 else 6644 MaxUsages[j][pair.first] = pair.second; 6645 } 6646 } 6647 6648 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6649 << OpenIntervals.size() << '\n'); 6650 6651 // Add the current instruction to the list of open intervals. 
6652 OpenIntervals.insert(I);
6653 }
6654
6655 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6656 SmallMapVector<unsigned, unsigned, 4> Invariant;
6657
6658 for (auto Inst : LoopInvariants) {
6659 unsigned Usage =
6660 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6661 unsigned ClassID =
6662 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6663 if (Invariant.find(ClassID) == Invariant.end())
6664 Invariant[ClassID] = Usage;
6665 else
6666 Invariant[ClassID] += Usage;
6667 }
6668
6669 LLVM_DEBUG({
6670 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6671 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6672 << " item\n";
6673 for (const auto &pair : MaxUsages[i]) {
6674 dbgs() << "LV(REG): RegisterClass: "
6675 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6676 << " registers\n";
6677 }
6678 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6679 << " item\n";
6680 for (const auto &pair : Invariant) {
6681 dbgs() << "LV(REG): RegisterClass: "
6682 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6683 << " registers\n";
6684 }
6685 });
6686
6687 RU.LoopInvariantRegs = Invariant;
6688 RU.MaxLocalUsers = MaxUsages[i];
6689 RUs[i] = RU;
6690 }
6691
6692 return RUs;
6693 }
6694
6695 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6696 // TODO: Cost model for emulated masked load/store is completely
6697 // broken. This hack guides the cost model to use an artificially
6698 // high enough value to practically disable vectorization with such
6699 // operations, except where the previously deployed legality hack allowed
6700 // using very low cost values. This is to avoid regressions coming simply
6701 // from moving the "masked load/store" check from legality to the cost model.
6702 // Masked Load/Gather emulation was previously never allowed.
6703 // Only a limited amount of Masked Store/Scatter emulation was allowed.
6704 assert(isPredicatedInst(I) &&
6705 "Expecting a scalar emulated instruction");
6706 return isa<LoadInst>(I) ||
6707 (isa<StoreInst>(I) &&
6708 NumPredStores > NumberOfStoresToPredicate);
6709 }
6710
6711 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6712 // If we aren't vectorizing the loop, or if we've already collected the
6713 // instructions to scalarize, there's nothing to do. Collection may already
6714 // have occurred if we have a user-selected VF and are now computing the
6715 // expected cost for interleaving.
6716 if (VF.isScalar() || VF.isZero() ||
6717 InstsToScalarize.find(VF) != InstsToScalarize.end())
6718 return;
6719
6720 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6721 // not profitable to scalarize any instructions, the presence of VF in the
6722 // map will indicate that we've analyzed it already.
6723 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6724
6725 // Find all the instructions that are scalar with predication in the loop and
6726 // determine if it would be better not to if-convert the blocks they are in.
6727 // If so, we also record the instructions to scalarize.
6728 for (BasicBlock *BB : TheLoop->blocks()) {
6729 if (!blockNeedsPredication(BB))
6730 continue;
6731 for (Instruction &I : *BB)
6732 if (isScalarWithPredication(&I)) {
6733 ScalarCostsTy ScalarCosts;
6734 // Do not apply discount logic if hacked cost is needed
6735 // for emulated masked memrefs.
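// Assumed-cost sketch: if the if-converted (vector) form of a predicated
// chain costs 12 and its branch-predicated scalar form costs 8, the discount
// 12 - 8 is non-negative and the chain is recorded in ScalarCostsVF below.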
6736 if (!useEmulatedMaskMemRefHack(&I) && 6737 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6738 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6739 // Remember that BB will remain after vectorization. 6740 PredicatedBBsAfterVectorization.insert(BB); 6741 } 6742 } 6743 } 6744 6745 int LoopVectorizationCostModel::computePredInstDiscount( 6746 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6747 assert(!isUniformAfterVectorization(PredInst, VF) && 6748 "Instruction marked uniform-after-vectorization will be predicated"); 6749 6750 // Initialize the discount to zero, meaning that the scalar version and the 6751 // vector version cost the same. 6752 InstructionCost Discount = 0; 6753 6754 // Holds instructions to analyze. The instructions we visit are mapped in 6755 // ScalarCosts. Those instructions are the ones that would be scalarized if 6756 // we find that the scalar version costs less. 6757 SmallVector<Instruction *, 8> Worklist; 6758 6759 // Returns true if the given instruction can be scalarized. 6760 auto canBeScalarized = [&](Instruction *I) -> bool { 6761 // We only attempt to scalarize instructions forming a single-use chain 6762 // from the original predicated block that would otherwise be vectorized. 6763 // Although not strictly necessary, we give up on instructions we know will 6764 // already be scalar to avoid traversing chains that are unlikely to be 6765 // beneficial. 6766 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6767 isScalarAfterVectorization(I, VF)) 6768 return false; 6769 6770 // If the instruction is scalar with predication, it will be analyzed 6771 // separately. We ignore it within the context of PredInst. 6772 if (isScalarWithPredication(I)) 6773 return false; 6774 6775 // If any of the instruction's operands are uniform after vectorization, 6776 // the instruction cannot be scalarized. This prevents, for example, a 6777 // masked load from being scalarized. 6778 // 6779 // We assume we will only emit a value for lane zero of an instruction 6780 // marked uniform after vectorization, rather than VF identical values. 6781 // Thus, if we scalarize an instruction that uses a uniform, we would 6782 // create uses of values corresponding to the lanes we aren't emitting code 6783 // for. This behavior can be changed by allowing getScalarValue to clone 6784 // the lane zero values for uniforms rather than asserting. 6785 for (Use &U : I->operands()) 6786 if (auto *J = dyn_cast<Instruction>(U.get())) 6787 if (isUniformAfterVectorization(J, VF)) 6788 return false; 6789 6790 // Otherwise, we can scalarize the instruction. 6791 return true; 6792 }; 6793 6794 // Compute the expected cost discount from scalarizing the entire expression 6795 // feeding the predicated instruction. We currently only consider expressions 6796 // that are single-use instruction chains. 6797 Worklist.push_back(PredInst); 6798 while (!Worklist.empty()) { 6799 Instruction *I = Worklist.pop_back_val(); 6800 6801 // If we've already analyzed the instruction, there's nothing to do. 6802 if (ScalarCosts.find(I) != ScalarCosts.end()) 6803 continue; 6804 6805 // Compute the cost of the vector instruction. Note that this cost already 6806 // includes the scalarization overhead of the predicated instruction. 6807 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6808 6809 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6810 // the instruction as if it wasn't if-converted and instead remained in the 6811 // predicated block. We will scale this cost by block probability after 6812 // computing the scalarization overhead. 6813 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6814 InstructionCost ScalarCost = 6815 VF.getKnownMinValue() * 6816 getInstructionCost(I, ElementCount::getFixed(1)).first; 6817 6818 // Compute the scalarization overhead of needed insertelement instructions 6819 // and phi nodes. 6820 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6821 ScalarCost += TTI.getScalarizationOverhead( 6822 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6823 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6824 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6825 ScalarCost += 6826 VF.getKnownMinValue() * 6827 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6828 } 6829 6830 // Compute the scalarization overhead of needed extractelement 6831 // instructions. For each of the instruction's operands, if the operand can 6832 // be scalarized, add it to the worklist; otherwise, account for the 6833 // overhead. 6834 for (Use &U : I->operands()) 6835 if (auto *J = dyn_cast<Instruction>(U.get())) { 6836 assert(VectorType::isValidElementType(J->getType()) && 6837 "Instruction has non-scalar type"); 6838 if (canBeScalarized(J)) 6839 Worklist.push_back(J); 6840 else if (needsExtract(J, VF)) { 6841 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6842 ScalarCost += TTI.getScalarizationOverhead( 6843 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6844 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6845 } 6846 } 6847 6848 // Scale the total scalar cost by block probability. 6849 ScalarCost /= getReciprocalPredBlockProb(); 6850 6851 // Compute the discount. A non-negative discount means the vector version 6852 // of the instruction costs more, and scalarizing would be beneficial. 6853 Discount += VectorCost - ScalarCost; 6854 ScalarCosts[I] = ScalarCost; 6855 } 6856 6857 return *Discount.getValue(); 6858 } 6859 6860 LoopVectorizationCostModel::VectorizationCostTy 6861 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6862 VectorizationCostTy Cost; 6863 6864 // For each block. 6865 for (BasicBlock *BB : TheLoop->blocks()) { 6866 VectorizationCostTy BlockCost; 6867 6868 // For each instruction in the old loop. 6869 for (Instruction &I : BB->instructionsWithoutDebug()) { 6870 // Skip ignored values. 6871 if (ValuesToIgnore.count(&I) || 6872 (VF.isVector() && VecValuesToIgnore.count(&I))) 6873 continue; 6874 6875 VectorizationCostTy C = getInstructionCost(&I, VF); 6876 6877 // Check if we should override the cost. 6878 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6879 C.first = InstructionCost(ForceTargetInstructionCost); 6880 6881 BlockCost.first += C.first; 6882 BlockCost.second |= C.second; 6883 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6884 << " for VF " << VF << " For instruction: " << I 6885 << '\n'); 6886 } 6887 6888 // If we are vectorizing a predicated block, it will have been 6889 // if-converted. This means that the block's instructions (aside from 6890 // stores and instructions that may divide by zero) will now be 6891 // unconditionally executed. For the scalar case, we may not always execute 6892 // the predicated block, if it is an if-else block. Thus, scale the block's 6893 // cost by the probability of executing it. 
blockNeedsPredication from 6894 // Legal is used so as to not include all blocks in tail folded loops. 6895 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6896 BlockCost.first /= getReciprocalPredBlockProb(); 6897 6898 Cost.first += BlockCost.first; 6899 Cost.second |= BlockCost.second; 6900 } 6901 6902 return Cost; 6903 } 6904 6905 /// Gets Address Access SCEV after verifying that the access pattern 6906 /// is loop invariant except the induction variable dependence. 6907 /// 6908 /// This SCEV can be sent to the Target in order to estimate the address 6909 /// calculation cost. 6910 static const SCEV *getAddressAccessSCEV( 6911 Value *Ptr, 6912 LoopVectorizationLegality *Legal, 6913 PredicatedScalarEvolution &PSE, 6914 const Loop *TheLoop) { 6915 6916 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6917 if (!Gep) 6918 return nullptr; 6919 6920 // We are looking for a gep with all loop invariant indices except for one 6921 // which should be an induction variable. 6922 auto SE = PSE.getSE(); 6923 unsigned NumOperands = Gep->getNumOperands(); 6924 for (unsigned i = 1; i < NumOperands; ++i) { 6925 Value *Opd = Gep->getOperand(i); 6926 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6927 !Legal->isInductionVariable(Opd)) 6928 return nullptr; 6929 } 6930 6931 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6932 return PSE.getSCEV(Ptr); 6933 } 6934 6935 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6936 return Legal->hasStride(I->getOperand(0)) || 6937 Legal->hasStride(I->getOperand(1)); 6938 } 6939 6940 InstructionCost 6941 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6942 ElementCount VF) { 6943 assert(VF.isVector() && 6944 "Scalarization cost of instruction implies vectorization."); 6945 if (VF.isScalable()) 6946 return InstructionCost::getInvalid(); 6947 6948 Type *ValTy = getMemInstValueType(I); 6949 auto SE = PSE.getSE(); 6950 6951 unsigned AS = getLoadStoreAddressSpace(I); 6952 Value *Ptr = getLoadStorePointerOperand(I); 6953 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6954 6955 // Figure out whether the access is strided and get the stride value 6956 // if it's known in compile time 6957 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6958 6959 // Get the cost of the scalar memory instruction and address computation. 6960 InstructionCost Cost = 6961 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6962 6963 // Don't pass *I here, since it is scalar but will actually be part of a 6964 // vectorized loop where the user of it is a vectorized instruction. 6965 const Align Alignment = getLoadStoreAlignment(I); 6966 Cost += VF.getKnownMinValue() * 6967 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6968 AS, TTI::TCK_RecipThroughput); 6969 6970 // Get the overhead of the extractelement and insertelement instructions 6971 // we might create due to scalarization. 6972 Cost += getScalarizationOverhead(I, VF); 6973 6974 // If we have a predicated load/store, it will need extra i1 extracts and 6975 // conditional branches, but may not be executed for each vector lane. Scale 6976 // the cost by the probability of executing the predicated block. 
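// For example, assuming the usual 50% block-execution estimate (so that
// getReciprocalPredBlockProb() is expected to return 2): with VF = 4 and a
// per-lane cost of 1 (address) + 4 (load) = 5, the scalarized cost of
// 4 * 5 = 20 is halved to 10 here, and the i1-extract and branch costs
// below are then added unconditionally.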
6977 if (isPredicatedInst(I)) { 6978 Cost /= getReciprocalPredBlockProb(); 6979 6980 // Add the cost of an i1 extract and a branch 6981 auto *Vec_i1Ty = 6982 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6983 Cost += TTI.getScalarizationOverhead( 6984 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6985 /*Insert=*/false, /*Extract=*/true); 6986 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6987 6988 if (useEmulatedMaskMemRefHack(I)) 6989 // Artificially setting to a high enough value to practically disable 6990 // vectorization with such operations. 6991 Cost = 3000000; 6992 } 6993 6994 return Cost; 6995 } 6996 6997 InstructionCost 6998 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6999 ElementCount VF) { 7000 Type *ValTy = getMemInstValueType(I); 7001 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7002 Value *Ptr = getLoadStorePointerOperand(I); 7003 unsigned AS = getLoadStoreAddressSpace(I); 7004 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 7005 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7006 7007 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7008 "Stride should be 1 or -1 for consecutive memory access"); 7009 const Align Alignment = getLoadStoreAlignment(I); 7010 InstructionCost Cost = 0; 7011 if (Legal->isMaskRequired(I)) 7012 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7013 CostKind); 7014 else 7015 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7016 CostKind, I); 7017 7018 bool Reverse = ConsecutiveStride < 0; 7019 if (Reverse) 7020 Cost += 7021 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7022 return Cost; 7023 } 7024 7025 InstructionCost 7026 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7027 ElementCount VF) { 7028 assert(Legal->isUniformMemOp(*I)); 7029 7030 Type *ValTy = getMemInstValueType(I); 7031 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7032 const Align Alignment = getLoadStoreAlignment(I); 7033 unsigned AS = getLoadStoreAddressSpace(I); 7034 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7035 if (isa<LoadInst>(I)) { 7036 return TTI.getAddressComputationCost(ValTy) + 7037 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7038 CostKind) + 7039 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7040 } 7041 StoreInst *SI = cast<StoreInst>(I); 7042 7043 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7044 return TTI.getAddressComputationCost(ValTy) + 7045 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7046 CostKind) + 7047 (isLoopInvariantStoreValue 7048 ? 
0
7049 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
7050 VF.getKnownMinValue() - 1));
7051 }
7052
7053 InstructionCost
7054 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
7055 ElementCount VF) {
7056 Type *ValTy = getMemInstValueType(I);
7057 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7058 const Align Alignment = getLoadStoreAlignment(I);
7059 const Value *Ptr = getLoadStorePointerOperand(I);
7060
7061 return TTI.getAddressComputationCost(VectorTy) +
7062 TTI.getGatherScatterOpCost(
7063 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
7064 TargetTransformInfo::TCK_RecipThroughput, I);
7065 }
7066
7067 InstructionCost
7068 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
7069 ElementCount VF) {
7070 // TODO: Once we have support for interleaving with scalable vectors
7071 // we can calculate the cost properly here.
7072 if (VF.isScalable())
7073 return InstructionCost::getInvalid();
7074
7075 Type *ValTy = getMemInstValueType(I);
7076 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7077 unsigned AS = getLoadStoreAddressSpace(I);
7078
7079 auto Group = getInterleavedAccessGroup(I);
7080 assert(Group && "Fail to get an interleaved access group.");
7081
7082 unsigned InterleaveFactor = Group->getFactor();
7083 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
7084
7085 // Holds the indices of existing members in an interleaved load group.
7086 // An interleaved store group doesn't need this as it doesn't allow gaps.
7087 SmallVector<unsigned, 4> Indices;
7088 if (isa<LoadInst>(I)) {
7089 for (unsigned i = 0; i < InterleaveFactor; i++)
7090 if (Group->getMember(i))
7091 Indices.push_back(i);
7092 }
7093
7094 // Calculate the cost of the whole interleaved group.
7095 bool UseMaskForGaps =
7096 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
7097 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
7098 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
7099 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
7100
7101 if (Group->isReverse()) {
7102 // TODO: Add support for reversed masked interleaved access.
7103 assert(!Legal->isMaskRequired(I) &&
7104 "Reverse masked interleaved access not supported.");
7105 Cost +=
7106 Group->getNumMembers() *
7107 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7108 }
7109 return Cost;
7110 }
7111
7112 InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
7113 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
7114 // Early exit if there are no in-loop reductions.
7115 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
7116 return InstructionCost::getInvalid();
7117 auto *VectorTy = cast<VectorType>(Ty);
7118
7119 // We are looking for one of the following patterns, finding its minimal
7120 // acceptable cost:
7121 // reduce(mul(ext(A), ext(B))) or
7122 // reduce(mul(A, B)) or
7123 // reduce(ext(A)) or
7124 // reduce(A).
7125 // The basic idea is that we walk down the tree to do that, finding the root
7126 // reduction instruction in InLoopReductionImmediateChains. From there we find
7127 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
7128 // of the components. If the reduction cost is lower, we return it for the
7129 // reduction instruction and 0 for the other instructions in the pattern. If
7130 // it is not, we return an invalid cost specifying that the original cost
// method should be used.
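// As an illustration (hypothetical IR, not taken from a test), a
// reduce(mul(ext(A), ext(B))) chain that this walk is meant to recognize
// looks like:
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul   = mul i32 %a.ext, %b.ext
//   %rdx   = add i32 %mul, %rdx.phi
// Starting from either extend, RetI is advanced to %mul and then to %rdx
// before the InLoopReductionImmediateChains lookup below.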
7131 Instruction *RetI = I; 7132 if ((RetI->getOpcode() == Instruction::SExt || 7133 RetI->getOpcode() == Instruction::ZExt)) { 7134 if (!RetI->hasOneUser()) 7135 return InstructionCost::getInvalid(); 7136 RetI = RetI->user_back(); 7137 } 7138 if (RetI->getOpcode() == Instruction::Mul && 7139 RetI->user_back()->getOpcode() == Instruction::Add) { 7140 if (!RetI->hasOneUser()) 7141 return InstructionCost::getInvalid(); 7142 RetI = RetI->user_back(); 7143 } 7144 7145 // Test if the found instruction is a reduction, and if not return an invalid 7146 // cost specifying the parent to use the original cost modelling. 7147 if (!InLoopReductionImmediateChains.count(RetI)) 7148 return InstructionCost::getInvalid(); 7149 7150 // Find the reduction this chain is a part of and calculate the basic cost of 7151 // the reduction on its own. 7152 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7153 Instruction *ReductionPhi = LastChain; 7154 while (!isa<PHINode>(ReductionPhi)) 7155 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7156 7157 RecurrenceDescriptor RdxDesc = 7158 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7159 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7160 RdxDesc.getOpcode(), VectorTy, false, CostKind); 7161 7162 // Get the operand that was not the reduction chain and match it to one of the 7163 // patterns, returning the better cost if it is found. 7164 Instruction *RedOp = RetI->getOperand(1) == LastChain 7165 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7166 : dyn_cast<Instruction>(RetI->getOperand(1)); 7167 7168 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7169 7170 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 7171 !TheLoop->isLoopInvariant(RedOp)) { 7172 bool IsUnsigned = isa<ZExtInst>(RedOp); 7173 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7174 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7175 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7176 CostKind); 7177 7178 InstructionCost ExtCost = 7179 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7180 TTI::CastContextHint::None, CostKind, RedOp); 7181 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7182 return I == RetI ? *RedCost.getValue() : 0; 7183 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7184 Instruction *Mul = RedOp; 7185 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7186 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7187 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7188 Op0->getOpcode() == Op1->getOpcode() && 7189 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7190 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7191 bool IsUnsigned = isa<ZExtInst>(Op0); 7192 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7193 // reduce(mul(ext, ext)) 7194 InstructionCost ExtCost = 7195 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7196 TTI::CastContextHint::None, CostKind, Op0); 7197 InstructionCost MulCost = 7198 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7199 7200 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7201 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7202 CostKind); 7203 7204 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7205 return I == RetI ? 
*RedCost.getValue() : 0; 7206 } else { 7207 InstructionCost MulCost = 7208 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7209 7210 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7211 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7212 CostKind); 7213 7214 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7215 return I == RetI ? *RedCost.getValue() : 0; 7216 } 7217 } 7218 7219 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7220 } 7221 7222 InstructionCost 7223 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7224 ElementCount VF) { 7225 // Calculate scalar cost only. Vectorization cost should be ready at this 7226 // moment. 7227 if (VF.isScalar()) { 7228 Type *ValTy = getMemInstValueType(I); 7229 const Align Alignment = getLoadStoreAlignment(I); 7230 unsigned AS = getLoadStoreAddressSpace(I); 7231 7232 return TTI.getAddressComputationCost(ValTy) + 7233 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7234 TTI::TCK_RecipThroughput, I); 7235 } 7236 return getWideningCost(I, VF); 7237 } 7238 7239 LoopVectorizationCostModel::VectorizationCostTy 7240 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7241 ElementCount VF) { 7242 // If we know that this instruction will remain uniform, check the cost of 7243 // the scalar version. 7244 if (isUniformAfterVectorization(I, VF)) 7245 VF = ElementCount::getFixed(1); 7246 7247 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7248 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7249 7250 // Forced scalars do not have any scalarization overhead. 7251 auto ForcedScalar = ForcedScalars.find(VF); 7252 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7253 auto InstSet = ForcedScalar->second; 7254 if (InstSet.count(I)) 7255 return VectorizationCostTy( 7256 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7257 VF.getKnownMinValue()), 7258 false); 7259 } 7260 7261 Type *VectorTy; 7262 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7263 7264 bool TypeNotScalarized = 7265 VF.isVector() && VectorTy->isVectorTy() && 7266 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7267 return VectorizationCostTy(C, TypeNotScalarized); 7268 } 7269 7270 InstructionCost 7271 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7272 ElementCount VF) const { 7273 7274 if (VF.isScalable()) 7275 return InstructionCost::getInvalid(); 7276 7277 if (VF.isScalar()) 7278 return 0; 7279 7280 InstructionCost Cost = 0; 7281 Type *RetTy = ToVectorTy(I->getType(), VF); 7282 if (!RetTy->isVoidTy() && 7283 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7284 Cost += TTI.getScalarizationOverhead( 7285 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7286 true, false); 7287 7288 // Some targets keep addresses scalar. 7289 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7290 return Cost; 7291 7292 // Some targets support efficient element stores. 7293 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7294 return Cost; 7295 7296 // Collect operands to consider. 7297 CallInst *CI = dyn_cast<CallInst>(I); 7298 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7299 7300 // Skip operands that do not require extraction/scalarization and do not incur 7301 // any overhead. 
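// For instance (illustrative only): when a store is scalarized, its
// vectorized value operand typically needs one extractelement per lane,
// while an operand that stays scalar (e.g. a loop-invariant value) needs
// none; filterExtractingOperands is meant to keep only the former kind.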
7302 SmallVector<Type *> Tys;
7303 for (auto *V : filterExtractingOperands(Ops, VF))
7304 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7305 return Cost + TTI.getOperandsScalarizationOverhead(
7306 filterExtractingOperands(Ops, VF), Tys);
7307 }
7308
7309 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7310 if (VF.isScalar())
7311 return;
7312 NumPredStores = 0;
7313 for (BasicBlock *BB : TheLoop->blocks()) {
7314 // For each instruction in the old loop.
7315 for (Instruction &I : *BB) {
7316 Value *Ptr = getLoadStorePointerOperand(&I);
7317 if (!Ptr)
7318 continue;
7319
7320 // TODO: We should generate better code and update the cost model for
7321 // predicated uniform stores. Today they are treated as any other
7322 // predicated store (see added test cases in
7323 // invariant-store-vectorization.ll).
7324 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7325 NumPredStores++;
7326
7327 if (Legal->isUniformMemOp(I)) {
7328 // TODO: Avoid replicating loads and stores instead of
7329 // relying on instcombine to remove them.
7330 // Load: Scalar load + broadcast
7331 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7332 InstructionCost Cost = getUniformMemOpCost(&I, VF);
7333 setWideningDecision(&I, VF, CM_Scalarize, Cost);
7334 continue;
7335 }
7336
7337 // We assume that widening is the best solution when possible.
7338 if (memoryInstructionCanBeWidened(&I, VF)) {
7339 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7340 int ConsecutiveStride =
7341 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
7342 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7343 "Expected consecutive stride.");
7344 InstWidening Decision =
7345 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7346 setWideningDecision(&I, VF, Decision, Cost);
7347 continue;
7348 }
7349
7350 // Choose between Interleaving, Gather/Scatter or Scalarization.
7351 InstructionCost InterleaveCost = InstructionCost::getInvalid();
7352 unsigned NumAccesses = 1;
7353 if (isAccessInterleaved(&I)) {
7354 auto Group = getInterleavedAccessGroup(&I);
7355 assert(Group && "Fail to get an interleaved access group.");
7356
7357 // Make one decision for the whole group.
7358 if (getWideningDecision(&I, VF) != CM_Unknown)
7359 continue;
7360
7361 NumAccesses = Group->getNumMembers();
7362 if (interleavedAccessCanBeWidened(&I, VF))
7363 InterleaveCost = getInterleaveGroupCost(&I, VF);
7364 }
7365
7366 InstructionCost GatherScatterCost =
7367 isLegalGatherOrScatter(&I)
7368 ? getGatherScatterCost(&I, VF) * NumAccesses
7369 : InstructionCost::getInvalid();
7370
7371 InstructionCost ScalarizationCost =
7372 getMemInstScalarizationCost(&I, VF) * NumAccesses;
7373
7374 // Choose the best option for the current VF, record this decision,
7375 // and use it during vectorization.
7376 InstructionCost Cost;
7377 InstWidening Decision;
7378 if (InterleaveCost <= GatherScatterCost &&
7379 InterleaveCost < ScalarizationCost) {
7380 Decision = CM_Interleave;
7381 Cost = InterleaveCost;
7382 } else if (GatherScatterCost < ScalarizationCost) {
7383 Decision = CM_GatherScatter;
7384 Cost = GatherScatterCost;
7385 } else {
7386 assert(!VF.isScalable() &&
7387 "We cannot yet scalarise for scalable vectors");
7388 Decision = CM_Scalarize;
7389 Cost = ScalarizationCost;
7390 }
7391 // If the instruction belongs to an interleave group, the whole group
7392 // receives the same decision. The whole group receives the cost, but
7393 // the cost will actually be assigned to one instruction.
7394 if (auto Group = getInterleavedAccessGroup(&I))
7395 setWideningDecision(Group, VF, Decision, Cost);
7396 else
7397 setWideningDecision(&I, VF, Decision, Cost);
7398 }
7399 }
7400
7401 // Make sure that any load of an address and any other address computation
7402 // remain scalar unless there is gather/scatter support. This avoids
7403 // inevitable extracts into address registers, and also has the benefit of
7404 // activating LSR more, since that pass can't optimize vectorized
7405 // addresses.
7406 if (TTI.prefersVectorizedAddressing())
7407 return;
7408
7409 // Start with all scalar pointer uses.
7410 SmallPtrSet<Instruction *, 8> AddrDefs;
7411 for (BasicBlock *BB : TheLoop->blocks())
7412 for (Instruction &I : *BB) {
7413 Instruction *PtrDef =
7414 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7415 if (PtrDef && TheLoop->contains(PtrDef) &&
7416 getWideningDecision(&I, VF) != CM_GatherScatter)
7417 AddrDefs.insert(PtrDef);
7418 }
7419
7420 // Add all instructions used to generate the addresses.
7421 SmallVector<Instruction *, 4> Worklist;
7422 append_range(Worklist, AddrDefs);
7423 while (!Worklist.empty()) {
7424 Instruction *I = Worklist.pop_back_val();
7425 for (auto &Op : I->operands())
7426 if (auto *InstOp = dyn_cast<Instruction>(Op))
7427 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7428 AddrDefs.insert(InstOp).second)
7429 Worklist.push_back(InstOp);
7430 }
7431
7432 for (auto *I : AddrDefs) {
7433 if (isa<LoadInst>(I)) {
7434 // Setting the desired widening decision should ideally be handled
7435 // by the cost functions, but since this involves the task of finding out
7436 // if the loaded register is involved in an address computation, it is
7437 // instead changed here when we know this is the case.
7438 InstWidening Decision = getWideningDecision(I, VF);
7439 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7440 // Scalarize a widened load of address.
7441 setWideningDecision(
7442 I, VF, CM_Scalarize,
7443 (VF.getKnownMinValue() *
7444 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7445 else if (auto Group = getInterleavedAccessGroup(I)) {
7446 // Scalarize an interleave group of address loads.
7447 for (unsigned I = 0; I < Group->getFactor(); ++I) {
7448 if (Instruction *Member = Group->getMember(I))
7449 setWideningDecision(
7450 Member, VF, CM_Scalarize,
7451 (VF.getKnownMinValue() *
7452 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7453 }
7454 }
7455 } else
7456 // Make sure I gets scalarized and is given a cost estimate without
7457 // scalarization overhead.
7458 ForcedScalars[VF].insert(I);
7459 }
7460 }
7461
7462 InstructionCost
7463 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7464 Type *&VectorTy) {
7465 Type *RetTy = I->getType();
7466 if (canTruncateToMinimalBitwidth(I, VF))
7467 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7468 auto SE = PSE.getSE();
7469 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7470
7471 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7472 ElementCount VF) -> bool {
7473 if (VF.isScalar())
7474 return true;
7475
7476 auto Scalarized = InstsToScalarize.find(VF);
7477 assert(Scalarized != InstsToScalarize.end() &&
7478 "VF not yet analyzed for scalarization profitability");
7479 return !Scalarized->second.count(I) &&
7480 llvm::all_of(I->users(), [&](User *U) {
7481 auto *UI = cast<Instruction>(U);
7482 return !Scalarized->second.count(UI);
7483 });
7484 };
7485 (void) hasSingleCopyAfterVectorization;
7486
7487 if (isScalarAfterVectorization(I, VF)) {
7488 // With the exception of GEPs and PHIs, after scalarization there should
7489 // only be one copy of the instruction generated in the loop. This is
7490 // because the VF is either 1, or any instructions that need scalarizing
7491 // have already been dealt with by the time we get here. As a result,
7492 // we don't have to multiply the instruction cost by VF.
7493 assert(I->getOpcode() == Instruction::GetElementPtr ||
7494 I->getOpcode() == Instruction::PHI ||
7495 (I->getOpcode() == Instruction::BitCast &&
7496 I->getType()->isPointerTy()) ||
7497 hasSingleCopyAfterVectorization(I, VF));
7498 VectorTy = RetTy;
7499 } else
7500 VectorTy = ToVectorTy(RetTy, VF);
7501
7502 // TODO: We need to estimate the cost of intrinsic calls.
7503 switch (I->getOpcode()) {
7504 case Instruction::GetElementPtr:
7505 // We mark this instruction as zero-cost because the cost of GEPs in
7506 // vectorized code depends on whether the corresponding memory instruction
7507 // is scalarized or not. Therefore, we handle GEPs with the memory
7508 // instruction cost.
7509 return 0;
7510 case Instruction::Br: {
7511 // In cases of scalarized and predicated instructions, there will be VF
7512 // predicated blocks in the vectorized loop. Each branch around these
7513 // blocks also requires an extract of its vector compare i1 element.
7514 bool ScalarPredicatedBB = false;
7515 BranchInst *BI = cast<BranchInst>(I);
7516 if (VF.isVector() && BI->isConditional() &&
7517 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7518 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7519 ScalarPredicatedBB = true;
7520
7521 if (ScalarPredicatedBB) {
7522 // Return cost for branches around scalarized and predicated blocks.
7523 assert(!VF.isScalable() && "scalable vectors not yet supported.");
7524 auto *Vec_i1Ty =
7525 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7526 return (TTI.getScalarizationOverhead(
7527 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7528 false, true) +
7529 (TTI.getCFInstrCost(Instruction::Br, CostKind) *
7530 VF.getKnownMinValue()));
7531 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7532 // The back-edge branch will remain, as will all scalar branches.
7533 return TTI.getCFInstrCost(Instruction::Br, CostKind);
7534 else
7535 // This branch will be eliminated by if-conversion.
7536 return 0; 7537 // Note: We currently assume zero cost for an unconditional branch inside 7538 // a predicated block since it will become a fall-through, although we 7539 // may decide in the future to call TTI for all branches. 7540 } 7541 case Instruction::PHI: { 7542 auto *Phi = cast<PHINode>(I); 7543 7544 // First-order recurrences are replaced by vector shuffles inside the loop. 7545 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7546 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7547 return TTI.getShuffleCost( 7548 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7549 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7550 7551 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7552 // converted into select instructions. We require N - 1 selects per phi 7553 // node, where N is the number of incoming values. 7554 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7555 return (Phi->getNumIncomingValues() - 1) * 7556 TTI.getCmpSelInstrCost( 7557 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7558 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7559 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7560 7561 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7562 } 7563 case Instruction::UDiv: 7564 case Instruction::SDiv: 7565 case Instruction::URem: 7566 case Instruction::SRem: 7567 // If we have a predicated instruction, it may not be executed for each 7568 // vector lane. Get the scalarization cost and scale this amount by the 7569 // probability of executing the predicated block. If the instruction is not 7570 // predicated, we fall through to the next case. 7571 if (VF.isVector() && isScalarWithPredication(I)) { 7572 InstructionCost Cost = 0; 7573 7574 // These instructions have a non-void type, so account for the phi nodes 7575 // that we will create. This cost is likely to be zero. The phi node 7576 // cost, if any, should be scaled by the block probability because it 7577 // models a copy at the end of each predicated block. 7578 Cost += VF.getKnownMinValue() * 7579 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7580 7581 // The cost of the non-predicated instruction. 7582 Cost += VF.getKnownMinValue() * 7583 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7584 7585 // The cost of insertelement and extractelement instructions needed for 7586 // scalarization. 7587 Cost += getScalarizationOverhead(I, VF); 7588 7589 // Scale the cost by the probability of executing the predicated blocks. 7590 // This assumes the predicated block for each vector lane is equally 7591 // likely. 7592 return Cost / getReciprocalPredBlockProb(); 7593 } 7594 LLVM_FALLTHROUGH; 7595 case Instruction::Add: 7596 case Instruction::FAdd: 7597 case Instruction::Sub: 7598 case Instruction::FSub: 7599 case Instruction::Mul: 7600 case Instruction::FMul: 7601 case Instruction::FDiv: 7602 case Instruction::FRem: 7603 case Instruction::Shl: 7604 case Instruction::LShr: 7605 case Instruction::AShr: 7606 case Instruction::And: 7607 case Instruction::Or: 7608 case Instruction::Xor: { 7609 // Since we will replace the stride by 1 the multiplication should go away. 
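// For example (hypothetical IR): when the loop is versioned on the symbolic
// stride being 1, an address computation such as
//   %offset = mul i64 %i, %stride
// collapses to %i in the versioned loop, which is why the multiply is given
// a zero cost below.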
7610 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7611 return 0; 7612 7613 // Detect reduction patterns 7614 InstructionCost RedCost; 7615 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7616 .isValid()) 7617 return RedCost; 7618 7619 // Certain instructions can be cheaper to vectorize if they have a constant 7620 // second vector operand. One example of this are shifts on x86. 7621 Value *Op2 = I->getOperand(1); 7622 TargetTransformInfo::OperandValueProperties Op2VP; 7623 TargetTransformInfo::OperandValueKind Op2VK = 7624 TTI.getOperandInfo(Op2, Op2VP); 7625 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7626 Op2VK = TargetTransformInfo::OK_UniformValue; 7627 7628 SmallVector<const Value *, 4> Operands(I->operand_values()); 7629 return TTI.getArithmeticInstrCost( 7630 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7631 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7632 } 7633 case Instruction::FNeg: { 7634 return TTI.getArithmeticInstrCost( 7635 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7636 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7637 TargetTransformInfo::OP_None, I->getOperand(0), I); 7638 } 7639 case Instruction::Select: { 7640 SelectInst *SI = cast<SelectInst>(I); 7641 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7642 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7643 7644 const Value *Op0, *Op1; 7645 using namespace llvm::PatternMatch; 7646 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7647 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7648 // select x, y, false --> x & y 7649 // select x, true, y --> x | y 7650 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7651 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7652 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7653 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7654 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7655 Op1->getType()->getScalarSizeInBits() == 1); 7656 7657 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7658 return TTI.getArithmeticInstrCost( 7659 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7660 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7661 } 7662 7663 Type *CondTy = SI->getCondition()->getType(); 7664 if (!ScalarCond) 7665 CondTy = VectorType::get(CondTy, VF); 7666 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7667 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7668 } 7669 case Instruction::ICmp: 7670 case Instruction::FCmp: { 7671 Type *ValTy = I->getOperand(0)->getType(); 7672 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7673 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7674 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7675 VectorTy = ToVectorTy(ValTy, VF); 7676 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7677 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7678 } 7679 case Instruction::Store: 7680 case Instruction::Load: { 7681 ElementCount Width = VF; 7682 if (Width.isVector()) { 7683 InstWidening Decision = getWideningDecision(I, Width); 7684 assert(Decision != CM_Unknown && 7685 "CM decision should be taken at this point"); 7686 if (Decision == CM_Scalarize) 7687 Width = ElementCount::getFixed(1); 7688 } 7689 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7690 return getMemoryInstructionCost(I, VF); 7691 } 7692 case Instruction::BitCast: 7693 if (I->getType()->isPointerTy()) 7694 return 0; 7695 LLVM_FALLTHROUGH; 7696 case Instruction::ZExt: 7697 case Instruction::SExt: 7698 case Instruction::FPToUI: 7699 case Instruction::FPToSI: 7700 case Instruction::FPExt: 7701 case Instruction::PtrToInt: 7702 case Instruction::IntToPtr: 7703 case Instruction::SIToFP: 7704 case Instruction::UIToFP: 7705 case Instruction::Trunc: 7706 case Instruction::FPTrunc: { 7707 // Computes the CastContextHint from a Load/Store instruction. 7708 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7709 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7710 "Expected a load or a store!"); 7711 7712 if (VF.isScalar() || !TheLoop->contains(I)) 7713 return TTI::CastContextHint::Normal; 7714 7715 switch (getWideningDecision(I, VF)) { 7716 case LoopVectorizationCostModel::CM_GatherScatter: 7717 return TTI::CastContextHint::GatherScatter; 7718 case LoopVectorizationCostModel::CM_Interleave: 7719 return TTI::CastContextHint::Interleave; 7720 case LoopVectorizationCostModel::CM_Scalarize: 7721 case LoopVectorizationCostModel::CM_Widen: 7722 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7723 : TTI::CastContextHint::Normal; 7724 case LoopVectorizationCostModel::CM_Widen_Reverse: 7725 return TTI::CastContextHint::Reversed; 7726 case LoopVectorizationCostModel::CM_Unknown: 7727 llvm_unreachable("Instr did not go through cost modelling?"); 7728 } 7729 7730 llvm_unreachable("Unhandled case!"); 7731 }; 7732 7733 unsigned Opcode = I->getOpcode(); 7734 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7735 // For Trunc, the context is the only user, which must be a StoreInst. 7736 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7737 if (I->hasOneUse()) 7738 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7739 CCH = ComputeCCH(Store); 7740 } 7741 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7742 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7743 Opcode == Instruction::FPExt) { 7744 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7745 CCH = ComputeCCH(Load); 7746 } 7747 7748 // We optimize the truncation of induction variables having constant 7749 // integer steps. The cost of these truncations is the same as the scalar 7750 // operation. 7751 if (isOptimizableIVTruncate(I, VF)) { 7752 auto *Trunc = cast<TruncInst>(I); 7753 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7754 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7755 } 7756 7757 // Detect reduction patterns 7758 InstructionCost RedCost; 7759 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7760 .isValid()) 7761 return RedCost; 7762 7763 Type *SrcScalarTy = I->getOperand(0)->getType(); 7764 Type *SrcVecTy = 7765 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7766 if (canTruncateToMinimalBitwidth(I, VF)) { 7767 // This cast is going to be shrunk. This may remove the cast or it might 7768 // turn it into slightly different cast. For example, if MinBW == 16, 7769 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7770 // 7771 // Calculate the modified src and dest types. 7772 Type *MinVecTy = VectorTy; 7773 if (Opcode == Instruction::Trunc) { 7774 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7775 VectorTy = 7776 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7777 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7778 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7779 VectorTy = 7780 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7781 } 7782 } 7783 7784 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7785 } 7786 case Instruction::Call: { 7787 bool NeedToScalarize; 7788 CallInst *CI = cast<CallInst>(I); 7789 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7790 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7791 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7792 return std::min(CallCost, IntrinsicCost); 7793 } 7794 return CallCost; 7795 } 7796 case Instruction::ExtractValue: 7797 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7798 default: 7799 // This opcode is unknown. Assume that it is the same as 'mul'. 7800 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7801 } // end of switch. 
7802 } 7803 7804 char LoopVectorize::ID = 0; 7805 7806 static const char lv_name[] = "Loop Vectorization"; 7807 7808 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7809 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7810 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7811 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7812 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7813 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7814 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7815 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7816 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7817 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7818 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7819 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7820 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7821 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7822 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7823 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7824 7825 namespace llvm { 7826 7827 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7828 7829 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7830 bool VectorizeOnlyWhenForced) { 7831 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7832 } 7833 7834 } // end namespace llvm 7835 7836 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7837 // Check if the pointer operand of a load or store instruction is 7838 // consecutive. 7839 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7840 return Legal->isConsecutivePtr(Ptr); 7841 return false; 7842 } 7843 7844 void LoopVectorizationCostModel::collectValuesToIgnore() { 7845 // Ignore ephemeral values. 7846 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7847 7848 // Ignore type-promoting instructions we identified during reduction 7849 // detection. 7850 for (auto &Reduction : Legal->getReductionVars()) { 7851 RecurrenceDescriptor &RedDes = Reduction.second; 7852 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7853 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7854 } 7855 // Ignore type-casting instructions we identified during induction 7856 // detection. 7857 for (auto &Induction : Legal->getInductionVars()) { 7858 InductionDescriptor &IndDes = Induction.second; 7859 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7860 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7861 } 7862 } 7863 7864 void LoopVectorizationCostModel::collectInLoopReductions() { 7865 for (auto &Reduction : Legal->getReductionVars()) { 7866 PHINode *Phi = Reduction.first; 7867 RecurrenceDescriptor &RdxDesc = Reduction.second; 7868 7869 // We don't collect reductions that are type promoted (yet). 7870 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7871 continue; 7872 7873 // If the target would prefer this reduction to happen "in-loop", then we 7874 // want to record it as such. 7875 unsigned Opcode = RdxDesc.getOpcode(); 7876 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7877 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7878 TargetTransformInfo::ReductionFlags())) 7879 continue; 7880 7881 // Check that we can correctly put the reductions into the loop, by 7882 // finding the chain of operations that leads from the phi to the loop 7883 // exit value. 
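// As a sketch (illustrative IR): for a simple integer add reduction
//   %rdx.phi  = phi i32 [ 0, %preheader ], [ %rdx.next, %latch ]
//   %rdx.next = add i32 %rdx.phi, %val
// the chain computed below would be expected to contain just %rdx.next;
// an empty chain means the reduction cannot be evaluated in-loop.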
7884 SmallVector<Instruction *, 4> ReductionOperations =
7885 RdxDesc.getReductionOpChain(Phi, TheLoop);
7886 bool InLoop = !ReductionOperations.empty();
7887 if (InLoop) {
7888 InLoopReductionChains[Phi] = ReductionOperations;
7889 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7890 Instruction *LastChain = Phi;
7891 for (auto *I : ReductionOperations) {
7892 InLoopReductionImmediateChains[I] = LastChain;
7893 LastChain = I;
7894 }
7895 }
7896 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7897 << " reduction for phi: " << *Phi << "\n");
7898 }
7899 }
7900
7901 // TODO: we could return a pair of values that specify the max VF and
7902 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7903 // `buildVPlans(VF, VF)`. We cannot do it yet because VPlan does not
7904 // have a cost model that can choose which plan to execute if
7905 // more than one is generated.
7906 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7907 LoopVectorizationCostModel &CM) {
7908 unsigned WidestType;
7909 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7910 return WidestVectorRegBits / WidestType;
7911 }
7912
7913 VectorizationFactor
7914 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7915 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7916 ElementCount VF = UserVF;
7917 // Outer loop handling: outer loops may require CFG and instruction level
7918 // transformations before even evaluating whether vectorization is profitable.
7919 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7920 // the vectorization pipeline.
7921 if (!OrigLoop->isInnermost()) {
7922 // If the user doesn't provide a vectorization factor, determine a
7923 // reasonable one.
7924 if (UserVF.isZero()) {
7925 VF = ElementCount::getFixed(determineVPlanVF(
7926 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7927 .getFixedSize(),
7928 CM));
7929 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7930
7931 // Make sure we have a VF > 1 for stress testing.
7932 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7933 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7934 << "overriding computed VF.\n");
7935 VF = ElementCount::getFixed(4);
7936 }
7937 }
7938 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7939 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7940 "VF needs to be a power of two");
7941 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7942 << "VF " << VF << " to build VPlans.\n");
7943 buildVPlans(VF, VF);
7944
7945 // For VPlan build stress testing, we bail out after VPlan construction.
7946 if (VPlanBuildStressTest)
7947 return VectorizationFactor::Disabled();
7948
7949 return {VF, 0 /*Cost*/};
7950 }
7951
7952 LLVM_DEBUG(
7953 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7954 "VPlan-native path.\n");
7955 return VectorizationFactor::Disabled();
7956 }
7957
7958 Optional<VectorizationFactor>
7959 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7960 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7961 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7962 if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7963 return None;
7964
7965 // Invalidate interleave groups if all blocks of the loop will be predicated.
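// (If the loop header itself needs predication here, that is because the
// tail is being folded by masking, which is what makes every block of the
// loop predicated.)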
7966 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7967 !useMaskedInterleavedAccesses(*TTI)) { 7968 LLVM_DEBUG( 7969 dbgs() 7970 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7971 "which requires masked-interleaved support.\n"); 7972 if (CM.InterleaveInfo.invalidateGroups()) 7973 // Invalidating interleave groups also requires invalidating all decisions 7974 // based on them, which includes widening decisions and uniform and scalar 7975 // values. 7976 CM.invalidateCostModelingDecisions(); 7977 } 7978 7979 ElementCount MaxUserVF = 7980 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7981 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7982 if (!UserVF.isZero() && UserVFIsLegal) { 7983 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7984 << " VF " << UserVF << ".\n"); 7985 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7986 "VF needs to be a power of two"); 7987 // Collect the instructions (and their associated costs) that will be more 7988 // profitable to scalarize. 7989 CM.selectUserVectorizationFactor(UserVF); 7990 CM.collectInLoopReductions(); 7991 buildVPlansWithVPRecipes({UserVF}, {UserVF}); 7992 LLVM_DEBUG(printPlans(dbgs())); 7993 return {{UserVF, 0}}; 7994 } 7995 7996 ElementCount MaxVF = MaxFactors.FixedVF; 7997 assert(!MaxVF.isScalable() && 7998 "Scalable vectors not yet supported beyond this point"); 7999 8000 for (ElementCount VF = ElementCount::getFixed(1); 8001 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 8002 // Collect Uniform and Scalar instructions after vectorization with VF. 8003 CM.collectUniformsAndScalars(VF); 8004 8005 // Collect the instructions (and their associated costs) that will be more 8006 // profitable to scalarize. 8007 if (VF.isVector()) 8008 CM.collectInstsToScalarize(VF); 8009 } 8010 8011 CM.collectInLoopReductions(); 8012 8013 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 8014 LLVM_DEBUG(printPlans(dbgs())); 8015 if (!MaxFactors.hasVector()) 8016 return VectorizationFactor::Disabled(); 8017 8018 // Select the optimal vectorization factor. 8019 auto SelectedVF = CM.selectVectorizationFactor(MaxVF); 8020 8021 // Check if it is profitable to vectorize with runtime checks. 
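// For a rough sense of scale (the defaults may vary): the plain threshold
// VectorizerParams::RuntimeMemoryCheckThreshold is typically small (around
// 8), while the pragma threshold is much larger. A loop needing, say, 12
// pointer checks therefore only survives the check below if reordering is
// explicitly allowed and the pragma threshold is not exceeded.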
8022 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8023 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8024 bool PragmaThresholdReached = 8025 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8026 bool ThresholdReached = 8027 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8028 if ((ThresholdReached && !Hints.allowReordering()) || 8029 PragmaThresholdReached) { 8030 ORE->emit([&]() { 8031 return OptimizationRemarkAnalysisAliasing( 8032 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8033 OrigLoop->getHeader()) 8034 << "loop not vectorized: cannot prove it is safe to reorder " 8035 "memory operations"; 8036 }); 8037 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8038 Hints.emitRemarkWithHints(); 8039 return VectorizationFactor::Disabled(); 8040 } 8041 } 8042 return SelectedVF; 8043 } 8044 8045 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 8046 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 8047 << '\n'); 8048 BestVF = VF; 8049 BestUF = UF; 8050 8051 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 8052 return !Plan->hasVF(VF); 8053 }); 8054 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 8055 } 8056 8057 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 8058 DominatorTree *DT) { 8059 // Perform the actual loop transformation. 8060 8061 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8062 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 8063 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 8064 8065 VPTransformState State{ 8066 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 8067 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8068 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8069 State.CanonicalIV = ILV.Induction; 8070 8071 ILV.printDebugTracesAtStart(); 8072 8073 //===------------------------------------------------===// 8074 // 8075 // Notice: any optimization or new instruction that go 8076 // into the code below should also be implemented in 8077 // the cost-model. 8078 // 8079 //===------------------------------------------------===// 8080 8081 // 2. Copy and widen instructions from the old loop into the new loop. 8082 VPlans.front()->execute(&State); 8083 8084 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8085 // predication, updating analyses. 
8086 ILV.fixVectorizedLoop(State);
8087
8088 ILV.printDebugTracesAtEnd();
8089 }
8090
8091 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
8092 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
8093 for (const auto &Plan : VPlans)
8094 if (PrintVPlansInDotFormat)
8095 Plan->printDOT(O);
8096 else
8097 Plan->print(O);
8098 }
8099 #endif
8100
8101 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
8102 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
8103
8104 // We create new control-flow for the vectorized loop, so the original exit
8105 // conditions will be dead after vectorization if they are only used by the
8106 // terminators.
8107 SmallVector<BasicBlock*> ExitingBlocks;
8108 OrigLoop->getExitingBlocks(ExitingBlocks);
8109 for (auto *BB : ExitingBlocks) {
8110 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8111 if (!Cmp || !Cmp->hasOneUse())
8112 continue;
8113
8114 // TODO: we should introduce a getUniqueExitingBlocks on Loop
8115 if (!DeadInstructions.insert(Cmp).second)
8116 continue;
8117
8118 // An operand of the icmp is often a dead trunc, used by IndUpdate.
8119 // TODO: can recurse through operands in general
8120 for (Value *Op : Cmp->operands()) {
8121 if (isa<TruncInst>(Op) && Op->hasOneUse())
8122 DeadInstructions.insert(cast<Instruction>(Op));
8123 }
8124 }
8125
8126 // We create new "steps" for induction variable updates to which the original
8127 // induction variables map. An original update instruction will be dead if
8128 // all its users except the induction variable are dead.
8129 auto *Latch = OrigLoop->getLoopLatch();
8130 for (auto &Induction : Legal->getInductionVars()) {
8131 PHINode *Ind = Induction.first;
8132 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8133
8134 // If the tail is to be folded by masking, the primary induction variable,
8135 // if it exists, isn't dead: it will be used for masking. Don't kill it.
8136 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8137 continue;
8138
8139 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8140 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8141 }))
8142 DeadInstructions.insert(IndUpdate);
8143
8144 // We also record as "Dead" the type-casting instructions we had identified
8145 // during induction analysis. We don't need any handling for them in the
8146 // vectorized loop because we have proven that, under a proper runtime
8147 // test guarding the vectorized loop, the value of the phi, and the casted
8148 // value of the phi, are the same. The last instruction in this casting chain
8149 // will get its scalar/vector/widened def from the scalar/vector/widened def
8150 // of the respective phi node. Any other casts in the induction def-use chain
8151 // have no other uses outside the phi update chain, and will be ignored.
8152 InductionDescriptor &IndDes = Induction.second;
8153 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
8154 DeadInstructions.insert(Casts.begin(), Casts.end());
8155 }
8156 }
8157
8158 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8159
8160 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8161
8162 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
8163 Instruction::BinaryOps BinOp) {
8164 // When unrolling and the VF is 1, we only need to add a simple scalar.
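// For instance (illustrative): with an integer Val = %iv, StartIdx = 2 and
// Step = %step, the integer path below produces
//   %induction = add %iv, (2 * %step)
// and the floating-point path forms the analogous value with an fmul
// followed by the requested BinOp.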
8165 Type *Ty = Val->getType(); 8166 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8167 8168 if (Ty->isFloatingPointTy()) { 8169 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8170 8171 // Floating-point operations inherit FMF via the builder's flags. 8172 Value *MulOp = Builder.CreateFMul(C, Step); 8173 return Builder.CreateBinOp(BinOp, Val, MulOp); 8174 } 8175 Constant *C = ConstantInt::get(Ty, StartIdx); 8176 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8177 } 8178 8179 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8180 SmallVector<Metadata *, 4> MDs; 8181 // Reserve first location for self reference to the LoopID metadata node. 8182 MDs.push_back(nullptr); 8183 bool IsUnrollMetadata = false; 8184 MDNode *LoopID = L->getLoopID(); 8185 if (LoopID) { 8186 // First find existing loop unrolling disable metadata. 8187 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8188 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8189 if (MD) { 8190 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8191 IsUnrollMetadata = 8192 S && S->getString().startswith("llvm.loop.unroll.disable"); 8193 } 8194 MDs.push_back(LoopID->getOperand(i)); 8195 } 8196 } 8197 8198 if (!IsUnrollMetadata) { 8199 // Add runtime unroll disable metadata. 8200 LLVMContext &Context = L->getHeader()->getContext(); 8201 SmallVector<Metadata *, 1> DisableOperands; 8202 DisableOperands.push_back( 8203 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8204 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8205 MDs.push_back(DisableNode); 8206 MDNode *NewLoopID = MDNode::get(Context, MDs); 8207 // Set operand 0 to refer to the loop id itself. 8208 NewLoopID->replaceOperandWith(0, NewLoopID); 8209 L->setLoopID(NewLoopID); 8210 } 8211 } 8212 8213 //===--------------------------------------------------------------------===// 8214 // EpilogueVectorizerMainLoop 8215 //===--------------------------------------------------------------------===// 8216 8217 /// This function is partially responsible for generating the control flow 8218 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8219 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8220 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8221 Loop *Lp = createVectorLoopSkeleton(""); 8222 8223 // Generate the code to check the minimum iteration count of the vector 8224 // epilogue (see below). 8225 EPI.EpilogueIterationCountCheck = 8226 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8227 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8228 8229 // Generate the code to check any assumptions that we've made for SCEV 8230 // expressions. 8231 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8232 8233 // Generate the code that checks at runtime if arrays overlap. We put the 8234 // checks into a separate block to make the more common case of few elements 8235 // faster. 8236 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8237 8238 // Generate the iteration count check for the main loop, *after* the check 8239 // for the epilogue loop, so that the path-length is shorter for the case 8240 // that goes directly through the vector epilogue. The longer-path length for 8241 // the main loop is compensated for, by the gain from vectorizing the larger 8242 // trip count. Note: the branch will get updated later on when we vectorize 8243 // the epilogue. 
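// In other words (sketch of the intended flow): a trip count that is too
// small even for the epilogue VF leaves through the "iter.check" block
// above and never reaches this check, while a trip count that only fits
// the epilogue VF fails the check generated here and, once its branch is
// rewired in the second pass, continues directly to the vector epilogue.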
8244 EPI.MainLoopIterationCountCheck = 8245 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8246 8247 // Generate the induction variable. 8248 OldInduction = Legal->getPrimaryInduction(); 8249 Type *IdxTy = Legal->getWidestInductionType(); 8250 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8251 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8252 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8253 EPI.VectorTripCount = CountRoundDown; 8254 Induction = 8255 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8256 getDebugLocFromInstOrOperands(OldInduction)); 8257 8258 // Skip induction resume value creation here because they will be created in 8259 // the second pass. If we created them here, they wouldn't be used anyway, 8260 // because the vplan in the second pass still contains the inductions from the 8261 // original loop. 8262 8263 return completeLoopSkeleton(Lp, OrigLoopID); 8264 } 8265 8266 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8267 LLVM_DEBUG({ 8268 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8269 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8270 << ", Main Loop UF:" << EPI.MainLoopUF 8271 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8272 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8273 }); 8274 } 8275 8276 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8277 DEBUG_WITH_TYPE(VerboseDebug, { 8278 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8279 }); 8280 } 8281 8282 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8283 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8284 assert(L && "Expected valid Loop."); 8285 assert(Bypass && "Expected valid bypass basic block."); 8286 unsigned VFactor = 8287 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8288 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8289 Value *Count = getOrCreateTripCount(L); 8290 // Reuse existing vector loop preheader for TC checks. 8291 // Note that new preheader block is generated for vector loop. 8292 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8293 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8294 8295 // Generate code to check if the loop's trip count is less than VF * UF of the 8296 // main vector loop. 8297 auto P = 8298 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8299 8300 Value *CheckMinIters = Builder.CreateICmp( 8301 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8302 "min.iters.check"); 8303 8304 if (!ForEpilogue) 8305 TCCheckBlock->setName("vector.main.loop.iter.check"); 8306 8307 // Create new preheader for vector loop. 8308 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8309 DT, LI, nullptr, "vector.ph"); 8310 8311 if (ForEpilogue) { 8312 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8313 DT->getNode(Bypass)->getIDom()) && 8314 "TC check is expected to dominate Bypass"); 8315 8316 // Update dominator for Bypass & LoopExit. 8317 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8318 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8319 8320 LoopBypassBlocks.push_back(TCCheckBlock); 8321 8322 // Save the trip count so we don't have to regenerate it in the 8323 // vec.epilog.iter.check. This is safe to do because the trip count 8324 // generated here dominates the vector epilog iter check. 
8325 EPI.TripCount = Count; 8326 } 8327 8328 ReplaceInstWithInst( 8329 TCCheckBlock->getTerminator(), 8330 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8331 8332 return TCCheckBlock; 8333 } 8334 8335 //===--------------------------------------------------------------------===// 8336 // EpilogueVectorizerEpilogueLoop 8337 //===--------------------------------------------------------------------===// 8338 8339 /// This function is partially responsible for generating the control flow 8340 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8341 BasicBlock * 8342 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8343 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8344 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8345 8346 // Now, compare the remaining count and if there aren't enough iterations to 8347 // execute the vectorized epilogue skip to the scalar part. 8348 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8349 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8350 LoopVectorPreHeader = 8351 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8352 LI, nullptr, "vec.epilog.ph"); 8353 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8354 VecEpilogueIterationCountCheck); 8355 8356 // Adjust the control flow taking the state info from the main loop 8357 // vectorization into account. 8358 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8359 "expected this to be saved from the previous pass."); 8360 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8361 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8362 8363 DT->changeImmediateDominator(LoopVectorPreHeader, 8364 EPI.MainLoopIterationCountCheck); 8365 8366 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8367 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8368 8369 if (EPI.SCEVSafetyCheck) 8370 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8371 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8372 if (EPI.MemSafetyCheck) 8373 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8374 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8375 8376 DT->changeImmediateDominator( 8377 VecEpilogueIterationCountCheck, 8378 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8379 8380 DT->changeImmediateDominator(LoopScalarPreHeader, 8381 EPI.EpilogueIterationCountCheck); 8382 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8383 8384 // Keep track of bypass blocks, as they feed start values to the induction 8385 // phis in the scalar loop preheader. 8386 if (EPI.SCEVSafetyCheck) 8387 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8388 if (EPI.MemSafetyCheck) 8389 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8390 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8391 8392 // Generate a resume induction for the vector epilogue and put it in the 8393 // vector epilogue preheader 8394 Type *IdxTy = Legal->getWidestInductionType(); 8395 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8396 LoopVectorPreHeader->getFirstNonPHI()); 8397 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8398 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8399 EPI.MainLoopIterationCountCheck); 8400 8401 // Generate the induction variable. 
8402   OldInduction = Legal->getPrimaryInduction();
8403   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8404   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8405   Value *StartIdx = EPResumeVal;
8406   Induction =
8407       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8408                               getDebugLocFromInstOrOperands(OldInduction));
8409
8410   // Generate induction resume values. These variables save the new starting
8411   // indexes for the scalar loop. They are used to test if there are any tail
8412   // iterations left once the vector loop has completed.
8413   // Note that when the vectorized epilogue is skipped due to the iteration
8414   // count check, the resume value for the induction variable comes from
8415   // the trip count of the main vector loop, hence passing the AdditionalBypass
8416   // argument.
8417   createInductionResumeValues(Lp, CountRoundDown,
8418                               {VecEpilogueIterationCountCheck,
8419                                EPI.VectorTripCount} /* AdditionalBypass */);
8420
8421   AddRuntimeUnrollDisableMetaData(Lp);
8422   return completeLoopSkeleton(Lp, OrigLoopID);
8423 }
8424
8425 BasicBlock *
8426 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8427     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8428
8429   assert(EPI.TripCount &&
8430          "Expected trip count to have been saved in the first pass.");
8431   assert(
8432       (!isa<Instruction>(EPI.TripCount) ||
8433        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8434       "saved trip count does not dominate insertion point.");
8435   Value *TC = EPI.TripCount;
8436   IRBuilder<> Builder(Insert->getTerminator());
8437   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8438
8439   // Generate code to check if the loop's trip count is less than VF * UF of the
8440   // vector epilogue loop.
8441   auto P =
8442       Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8443 8444 Value *CheckMinIters = Builder.CreateICmp( 8445 P, Count, 8446 ConstantInt::get(Count->getType(), 8447 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8448 "min.epilog.iters.check"); 8449 8450 ReplaceInstWithInst( 8451 Insert->getTerminator(), 8452 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8453 8454 LoopBypassBlocks.push_back(Insert); 8455 return Insert; 8456 } 8457 8458 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8459 LLVM_DEBUG({ 8460 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8461 << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8462 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8463 }); 8464 } 8465 8466 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8467 DEBUG_WITH_TYPE(VerboseDebug, { 8468 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8469 }); 8470 } 8471 8472 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8473 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8474 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8475 bool PredicateAtRangeStart = Predicate(Range.Start); 8476 8477 for (ElementCount TmpVF = Range.Start * 2; 8478 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8479 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8480 Range.End = TmpVF; 8481 break; 8482 } 8483 8484 return PredicateAtRangeStart; 8485 } 8486 8487 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8488 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8489 /// of VF's starting at a given VF and extending it as much as possible. Each 8490 /// vectorization decision can potentially shorten this sub-range during 8491 /// buildVPlan(). 8492 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8493 ElementCount MaxVF) { 8494 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8495 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8496 VFRange SubRange = {VF, MaxVFPlusOne}; 8497 VPlans.push_back(buildVPlan(SubRange)); 8498 VF = SubRange.End; 8499 } 8500 } 8501 8502 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8503 VPlanPtr &Plan) { 8504 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8505 8506 // Look for cached value. 8507 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8508 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8509 if (ECEntryIt != EdgeMaskCache.end()) 8510 return ECEntryIt->second; 8511 8512 VPValue *SrcMask = createBlockInMask(Src, Plan); 8513 8514 // The terminator has to be a branch inst! 8515 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8516 assert(BI && "Unexpected terminator found"); 8517 8518 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8519 return EdgeMaskCache[Edge] = SrcMask; 8520 8521 // If source is an exiting block, we know the exit edge is dynamically dead 8522 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8523 // adding uses of an otherwise potentially dead instruction. 
8524 if (OrigLoop->isLoopExiting(Src)) 8525 return EdgeMaskCache[Edge] = SrcMask; 8526 8527 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8528 assert(EdgeMask && "No Edge Mask found for condition"); 8529 8530 if (BI->getSuccessor(0) != Dst) 8531 EdgeMask = Builder.createNot(EdgeMask); 8532 8533 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8534 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8535 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8536 // The select version does not introduce new UB if SrcMask is false and 8537 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8538 VPValue *False = Plan->getOrAddVPValue( 8539 ConstantInt::getFalse(BI->getCondition()->getType())); 8540 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8541 } 8542 8543 return EdgeMaskCache[Edge] = EdgeMask; 8544 } 8545 8546 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8547 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8548 8549 // Look for cached value. 8550 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8551 if (BCEntryIt != BlockMaskCache.end()) 8552 return BCEntryIt->second; 8553 8554 // All-one mask is modelled as no-mask following the convention for masked 8555 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8556 VPValue *BlockMask = nullptr; 8557 8558 if (OrigLoop->getHeader() == BB) { 8559 if (!CM.blockNeedsPredication(BB)) 8560 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8561 8562 // Create the block in mask as the first non-phi instruction in the block. 8563 VPBuilder::InsertPointGuard Guard(Builder); 8564 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8565 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8566 8567 // Introduce the early-exit compare IV <= BTC to form header block mask. 8568 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8569 // Start by constructing the desired canonical IV. 8570 VPValue *IV = nullptr; 8571 if (Legal->getPrimaryInduction()) 8572 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8573 else { 8574 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8575 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8576 IV = IVRecipe->getVPSingleValue(); 8577 } 8578 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8579 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8580 8581 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8582 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8583 // as a second argument, we only pass the IV here and extract the 8584 // tripcount from the transform state where codegen of the VP instructions 8585 // happen. 8586 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8587 } else { 8588 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8589 } 8590 return BlockMaskCache[BB] = BlockMask; 8591 } 8592 8593 // This is the block mask. We OR all incoming edges. 8594 for (auto *Predecessor : predecessors(BB)) { 8595 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8596 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8597 return BlockMaskCache[BB] = EdgeMask; 8598 8599 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8600 BlockMask = EdgeMask; 8601 continue; 8602 } 8603 8604 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8605 } 8606 8607 return BlockMaskCache[BB] = BlockMask; 8608 } 8609 8610 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8611 ArrayRef<VPValue *> Operands, 8612 VFRange &Range, 8613 VPlanPtr &Plan) { 8614 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8615 "Must be called with either a load or store"); 8616 8617 auto willWiden = [&](ElementCount VF) -> bool { 8618 if (VF.isScalar()) 8619 return false; 8620 LoopVectorizationCostModel::InstWidening Decision = 8621 CM.getWideningDecision(I, VF); 8622 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8623 "CM decision should be taken at this point."); 8624 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8625 return true; 8626 if (CM.isScalarAfterVectorization(I, VF) || 8627 CM.isProfitableToScalarize(I, VF)) 8628 return false; 8629 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8630 }; 8631 8632 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8633 return nullptr; 8634 8635 VPValue *Mask = nullptr; 8636 if (Legal->isMaskRequired(I)) 8637 Mask = createBlockInMask(I->getParent(), Plan); 8638 8639 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8640 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8641 8642 StoreInst *Store = cast<StoreInst>(I); 8643 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8644 Mask); 8645 } 8646 8647 VPWidenIntOrFpInductionRecipe * 8648 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8649 ArrayRef<VPValue *> Operands) const { 8650 // Check if this is an integer or fp induction. If so, build the recipe that 8651 // produces its scalar and vector values. 8652 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8653 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8654 II.getKind() == InductionDescriptor::IK_FpInduction) { 8655 assert(II.getStartValue() == 8656 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8657 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8658 return new VPWidenIntOrFpInductionRecipe( 8659 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8660 } 8661 8662 return nullptr; 8663 } 8664 8665 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8666 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8667 VPlan &Plan) const { 8668 // Optimize the special case where the source is a constant integer 8669 // induction variable. Notice that we can only optimize the 'trunc' case 8670 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8671 // (c) other casts depend on pointer size. 8672 8673 // Determine whether \p K is a truncation based on an induction variable that 8674 // can be optimized. 
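  // For example (a sketch; names are made up), given
  //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %t  = trunc i64 %iv to i32
  // the truncation can be folded into the widened induction itself, so the
  // recipe built below produces a <VF x i32> induction directly rather than a
  // wide trunc of a <VF x i64> induction.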
8675 auto isOptimizableIVTruncate = 8676 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8677 return [=](ElementCount VF) -> bool { 8678 return CM.isOptimizableIVTruncate(K, VF); 8679 }; 8680 }; 8681 8682 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8683 isOptimizableIVTruncate(I), Range)) { 8684 8685 InductionDescriptor II = 8686 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8687 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8688 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8689 Start, nullptr, I); 8690 } 8691 return nullptr; 8692 } 8693 8694 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8695 ArrayRef<VPValue *> Operands, 8696 VPlanPtr &Plan) { 8697 // If all incoming values are equal, the incoming VPValue can be used directly 8698 // instead of creating a new VPBlendRecipe. 8699 VPValue *FirstIncoming = Operands[0]; 8700 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8701 return FirstIncoming == Inc; 8702 })) { 8703 return Operands[0]; 8704 } 8705 8706 // We know that all PHIs in non-header blocks are converted into selects, so 8707 // we don't have to worry about the insertion order and we can just use the 8708 // builder. At this point we generate the predication tree. There may be 8709 // duplications since this is a simple recursive scan, but future 8710 // optimizations will clean it up. 8711 SmallVector<VPValue *, 2> OperandsWithMask; 8712 unsigned NumIncoming = Phi->getNumIncomingValues(); 8713 8714 for (unsigned In = 0; In < NumIncoming; In++) { 8715 VPValue *EdgeMask = 8716 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8717 assert((EdgeMask || NumIncoming == 1) && 8718 "Multiple predecessors with one having a full mask"); 8719 OperandsWithMask.push_back(Operands[In]); 8720 if (EdgeMask) 8721 OperandsWithMask.push_back(EdgeMask); 8722 } 8723 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8724 } 8725 8726 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8727 ArrayRef<VPValue *> Operands, 8728 VFRange &Range) const { 8729 8730 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8731 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8732 Range); 8733 8734 if (IsPredicated) 8735 return nullptr; 8736 8737 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8738 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8739 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8740 ID == Intrinsic::pseudoprobe || 8741 ID == Intrinsic::experimental_noalias_scope_decl)) 8742 return nullptr; 8743 8744 auto willWiden = [&](ElementCount VF) -> bool { 8745 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8746 // The following case may be scalarized depending on the VF. 8747 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8748 // version of the instruction. 8749 // Is it beneficial to perform intrinsic call compared to lib call? 8750 bool NeedToScalarize = false; 8751 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8752 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8753 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8754 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 8755 "Either the intrinsic cost or vector call cost must be valid"); 8756 return UseVectorIntrinsic || !NeedToScalarize; 8757 }; 8758 8759 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8760 return nullptr; 8761 8762 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8763 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8764 } 8765 8766 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8767 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8768 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8769 // Instruction should be widened, unless it is scalar after vectorization, 8770 // scalarization is profitable or it is predicated. 8771 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8772 return CM.isScalarAfterVectorization(I, VF) || 8773 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8774 }; 8775 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8776 Range); 8777 } 8778 8779 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8780 ArrayRef<VPValue *> Operands) const { 8781 auto IsVectorizableOpcode = [](unsigned Opcode) { 8782 switch (Opcode) { 8783 case Instruction::Add: 8784 case Instruction::And: 8785 case Instruction::AShr: 8786 case Instruction::BitCast: 8787 case Instruction::FAdd: 8788 case Instruction::FCmp: 8789 case Instruction::FDiv: 8790 case Instruction::FMul: 8791 case Instruction::FNeg: 8792 case Instruction::FPExt: 8793 case Instruction::FPToSI: 8794 case Instruction::FPToUI: 8795 case Instruction::FPTrunc: 8796 case Instruction::FRem: 8797 case Instruction::FSub: 8798 case Instruction::ICmp: 8799 case Instruction::IntToPtr: 8800 case Instruction::LShr: 8801 case Instruction::Mul: 8802 case Instruction::Or: 8803 case Instruction::PtrToInt: 8804 case Instruction::SDiv: 8805 case Instruction::Select: 8806 case Instruction::SExt: 8807 case Instruction::Shl: 8808 case Instruction::SIToFP: 8809 case Instruction::SRem: 8810 case Instruction::Sub: 8811 case Instruction::Trunc: 8812 case Instruction::UDiv: 8813 case Instruction::UIToFP: 8814 case Instruction::URem: 8815 case Instruction::Xor: 8816 case Instruction::ZExt: 8817 return true; 8818 } 8819 return false; 8820 }; 8821 8822 if (!IsVectorizableOpcode(I->getOpcode())) 8823 return nullptr; 8824 8825 // Success: widen this instruction. 
8826 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8827 } 8828 8829 void VPRecipeBuilder::fixHeaderPhis() { 8830 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8831 for (VPWidenPHIRecipe *R : PhisToFix) { 8832 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8833 VPRecipeBase *IncR = 8834 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8835 R->addOperand(IncR->getVPSingleValue()); 8836 } 8837 } 8838 8839 VPBasicBlock *VPRecipeBuilder::handleReplication( 8840 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8841 VPlanPtr &Plan) { 8842 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8843 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8844 Range); 8845 8846 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8847 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 8848 8849 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8850 IsUniform, IsPredicated); 8851 setRecipe(I, Recipe); 8852 Plan->addVPValue(I, Recipe); 8853 8854 // Find if I uses a predicated instruction. If so, it will use its scalar 8855 // value. Avoid hoisting the insert-element which packs the scalar value into 8856 // a vector value, as that happens iff all users use the vector value. 8857 for (VPValue *Op : Recipe->operands()) { 8858 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8859 if (!PredR) 8860 continue; 8861 auto *RepR = 8862 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8863 assert(RepR->isPredicated() && 8864 "expected Replicate recipe to be predicated"); 8865 RepR->setAlsoPack(false); 8866 } 8867 8868 // Finalize the recipe for Instr, first if it is not predicated. 8869 if (!IsPredicated) { 8870 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8871 VPBB->appendRecipe(Recipe); 8872 return VPBB; 8873 } 8874 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8875 assert(VPBB->getSuccessors().empty() && 8876 "VPBB has successors when handling predicated replication."); 8877 // Record predicated instructions for above packing optimizations. 8878 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8879 VPBlockUtils::insertBlockAfter(Region, VPBB); 8880 auto *RegSucc = new VPBasicBlock(); 8881 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8882 return RegSucc; 8883 } 8884 8885 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8886 VPRecipeBase *PredRecipe, 8887 VPlanPtr &Plan) { 8888 // Instructions marked for predication are replicated and placed under an 8889 // if-then construct to prevent side-effects. 8890 8891 // Generate recipes to compute the block mask for this region. 8892 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8893 8894 // Build the triangular if-then region. 8895 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8896 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8897 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8898 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8899 auto *PHIRecipe = Instr->getType()->isVoidTy() 8900 ? 
nullptr 8901 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8902 if (PHIRecipe) { 8903 Plan->removeVPValueFor(Instr); 8904 Plan->addVPValue(Instr, PHIRecipe); 8905 } 8906 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8907 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8908 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8909 8910 // Note: first set Entry as region entry and then connect successors starting 8911 // from it in order, to propagate the "parent" of each VPBasicBlock. 8912 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8913 VPBlockUtils::connectBlocks(Pred, Exit); 8914 8915 return Region; 8916 } 8917 8918 VPRecipeOrVPValueTy 8919 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8920 ArrayRef<VPValue *> Operands, 8921 VFRange &Range, VPlanPtr &Plan) { 8922 // First, check for specific widening recipes that deal with calls, memory 8923 // operations, inductions and Phi nodes. 8924 if (auto *CI = dyn_cast<CallInst>(Instr)) 8925 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8926 8927 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8928 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8929 8930 VPRecipeBase *Recipe; 8931 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8932 if (Phi->getParent() != OrigLoop->getHeader()) 8933 return tryToBlend(Phi, Operands, Plan); 8934 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8935 return toVPRecipeResult(Recipe); 8936 8937 if (Legal->isReductionVariable(Phi)) { 8938 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8939 assert(RdxDesc.getRecurrenceStartValue() == 8940 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8941 VPValue *StartV = Operands[0]; 8942 8943 auto *PhiRecipe = new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); 8944 PhisToFix.push_back(PhiRecipe); 8945 // Record the incoming value from the backedge, so we can add the incoming 8946 // value from the backedge after all recipes have been created. 8947 recordRecipeOf(cast<Instruction>( 8948 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8949 return toVPRecipeResult(PhiRecipe); 8950 } 8951 8952 return toVPRecipeResult(new VPWidenPHIRecipe(Phi)); 8953 } 8954 8955 if (isa<TruncInst>(Instr) && 8956 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8957 Range, *Plan))) 8958 return toVPRecipeResult(Recipe); 8959 8960 if (!shouldWiden(Instr, Range)) 8961 return nullptr; 8962 8963 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8964 return toVPRecipeResult(new VPWidenGEPRecipe( 8965 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8966 8967 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8968 bool InvariantCond = 8969 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8970 return toVPRecipeResult(new VPWidenSelectRecipe( 8971 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8972 } 8973 8974 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8975 } 8976 8977 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8978 ElementCount MaxVF) { 8979 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8980 8981 // Collect instructions from the original loop that will become trivially dead 8982 // in the vectorized loop. We don't need to vectorize these instructions. 
For
8983   // example, original induction update instructions can become dead because we
8984   // separately emit induction "steps" when generating code for the new loop.
8985   // Similarly, we create a new latch condition when setting up the structure
8986   // of the new loop, so the old one can become dead.
8987   SmallPtrSet<Instruction *, 4> DeadInstructions;
8988   collectTriviallyDeadInstructions(DeadInstructions);
8989
8990   // Add assume instructions we need to drop to DeadInstructions, to prevent
8991   // them from being added to the VPlan.
8992   // TODO: We only need to drop assumes in blocks that get flattened. If the
8993   // control flow is preserved, we should keep them.
8994   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8995   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8996
8997   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8998   // Dead instructions do not need sinking. Remove them from SinkAfter.
8999   for (Instruction *I : DeadInstructions)
9000     SinkAfter.erase(I);
9001
9002   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
9003   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
9004     VFRange SubRange = {VF, MaxVFPlusOne};
9005     VPlans.push_back(
9006         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
9007     VF = SubRange.End;
9008   }
9009 }
9010
9011 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
9012     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
9013     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
9014
9015   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9016
9017   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
9018
9019   // ---------------------------------------------------------------------------
9020   // Pre-construction: record ingredients whose recipes we'll need to further
9021   // process after constructing the initial VPlan.
9022   // ---------------------------------------------------------------------------
9023
9024   // Mark instructions we'll need to sink later and their targets as
9025   // ingredients whose recipe we'll need to record.
9026   for (auto &Entry : SinkAfter) {
9027     RecipeBuilder.recordRecipeOf(Entry.first);
9028     RecipeBuilder.recordRecipeOf(Entry.second);
9029   }
9030   for (auto &Reduction : CM.getInLoopReductionChains()) {
9031     PHINode *Phi = Reduction.first;
9032     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
9033     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9034
9035     RecipeBuilder.recordRecipeOf(Phi);
9036     for (auto &R : ReductionOperations) {
9037       RecipeBuilder.recordRecipeOf(R);
9038       // For min/max reductions, where we have a pair of icmp/select, we also
9039       // need to record the ICmp recipe, so it can be removed later.
9040       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9041         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9042     }
9043   }
9044
9045   // For each interleave group which is relevant for this (possibly trimmed)
9046   // Range, add it to the set of groups to be later applied to the VPlan and add
9047   // placeholders for its members' Recipes which we'll be replacing with a
9048   // single VPInterleaveRecipe.
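  // For example (illustrative), the two accesses below form one interleave
  // group with factor 2; both member loads get placeholder recipes now and are
  // later replaced by a single wide, de-interleaved load:
  //   for (i = 0; i < n; ++i) {
  //     x = a[2 * i];      // member 0
  //     y = a[2 * i + 1];  // member 1
  //   }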
9049   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9050     auto applyIG = [IG, this](ElementCount VF) -> bool {
9051       return (VF.isVector() && // Query is illegal for VF == 1
9052               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9053                   LoopVectorizationCostModel::CM_Interleave);
9054     };
9055     if (!getDecisionAndClampRange(applyIG, Range))
9056       continue;
9057     InterleaveGroups.insert(IG);
9058     for (unsigned i = 0; i < IG->getFactor(); i++)
9059       if (Instruction *Member = IG->getMember(i))
9060         RecipeBuilder.recordRecipeOf(Member);
9061   }
9062
9063   // ---------------------------------------------------------------------------
9064   // Build initial VPlan: Scan the body of the loop in a topological order to
9065   // visit each basic block after having visited its predecessor basic blocks.
9066   // ---------------------------------------------------------------------------
9067
9068   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
9069   auto Plan = std::make_unique<VPlan>();
9070   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
9071   Plan->setEntry(VPBB);
9072
9073   // Scan the body of the loop in a topological order to visit each basic block
9074   // after having visited its predecessor basic blocks.
9075   LoopBlocksDFS DFS(OrigLoop);
9076   DFS.perform(LI);
9077
9078   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9079     // Relevant instructions from basic block BB will be grouped into VPRecipe
9080     // ingredients and fill a new VPBasicBlock.
9081     unsigned VPBBsForBB = 0;
9082     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9083     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9084     VPBB = FirstVPBBForBB;
9085     Builder.setInsertPoint(VPBB);
9086
9087     // Introduce each ingredient into VPlan.
9088     // TODO: Model and preserve debug intrinsics in VPlan.
9089     for (Instruction &I : BB->instructionsWithoutDebug()) {
9090       Instruction *Instr = &I;
9091
9092       // First filter out irrelevant instructions, to ensure no recipes are
9093       // built for them.
9094       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9095         continue;
9096
9097       SmallVector<VPValue *, 4> Operands;
9098       auto *Phi = dyn_cast<PHINode>(Instr);
9099       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9100         Operands.push_back(Plan->getOrAddVPValue(
9101             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9102       } else {
9103         auto OpRange = Plan->mapToVPValues(Instr->operands());
9104         Operands = {OpRange.begin(), OpRange.end()};
9105       }
9106       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9107               Instr, Operands, Range, Plan)) {
9108         // If Instr can be simplified to an existing VPValue, use it.
9109         if (RecipeOrValue.is<VPValue *>()) {
9110           auto *VPV = RecipeOrValue.get<VPValue *>();
9111           Plan->addVPValue(Instr, VPV);
9112           // If the re-used value is a recipe, register the recipe for the
9113           // instruction, in case the recipe for Instr needs to be recorded.
9114           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9115             RecipeBuilder.setRecipe(Instr, R);
9116           continue;
9117         }
9118         // Otherwise, add the new recipe.
9119         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9120         for (auto *Def : Recipe->definedValues()) {
9121           auto *UV = Def->getUnderlyingValue();
9122           Plan->addVPValue(UV, Def);
9123         }
9124
9125         RecipeBuilder.setRecipe(Instr, Recipe);
9126         VPBB->appendRecipe(Recipe);
9127         continue;
9128       }
9129
9130       // Otherwise, if all widening options failed, the instruction is to be
9131       // replicated. This may create a successor for VPBB.
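      // For example (a sketch): a predicated, possibly-trapping instruction
      // such as a conditional sdiv is wrapped in a triangle-shaped replicate
      // region
      //   pred.sdiv.entry -> pred.sdiv.if -> pred.sdiv.continue
      // and subsequent recipes are appended to a fresh successor VPBasicBlock.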
9132 VPBasicBlock *NextVPBB = 9133 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9134 if (NextVPBB != VPBB) { 9135 VPBB = NextVPBB; 9136 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9137 : ""); 9138 } 9139 } 9140 } 9141 9142 RecipeBuilder.fixHeaderPhis(); 9143 9144 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9145 // may also be empty, such as the last one VPBB, reflecting original 9146 // basic-blocks with no recipes. 9147 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9148 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9149 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9150 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9151 delete PreEntry; 9152 9153 // --------------------------------------------------------------------------- 9154 // Transform initial VPlan: Apply previously taken decisions, in order, to 9155 // bring the VPlan to its final state. 9156 // --------------------------------------------------------------------------- 9157 9158 // Apply Sink-After legal constraints. 9159 for (auto &Entry : SinkAfter) { 9160 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9161 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9162 9163 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9164 auto *Region = 9165 dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9166 if (Region && Region->isReplicator()) 9167 return Region; 9168 return nullptr; 9169 }; 9170 9171 // If the target is in a replication region, make sure to move Sink to the 9172 // block after it, not into the replication region itself. 9173 if (auto *TargetRegion = GetReplicateRegion(Target)) { 9174 assert(TargetRegion->getNumSuccessors() == 1 && "Expected SESE region!"); 9175 assert(!GetReplicateRegion(Sink) && 9176 "cannot sink a region into another region yet"); 9177 VPBasicBlock *NextBlock = 9178 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9179 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9180 continue; 9181 } 9182 9183 auto *SinkRegion = GetReplicateRegion(Sink); 9184 // Unless the sink source is in a replicate region, sink the recipe 9185 // directly. 9186 if (!SinkRegion) { 9187 Sink->moveAfter(Target); 9188 continue; 9189 } 9190 9191 // If the sink source is in a replicate region, we need to move the whole 9192 // replicate region, which should only contain a single recipe in the main 9193 // block. 9194 assert(Sink->getParent()->size() == 1 && 9195 "parent must be a replicator with a single recipe"); 9196 auto *SplitBlock = 9197 Target->getParent()->splitAt(std::next(Target->getIterator())); 9198 9199 auto *Pred = SinkRegion->getSinglePredecessor(); 9200 auto *Succ = SinkRegion->getSingleSuccessor(); 9201 VPBlockUtils::disconnectBlocks(Pred, SinkRegion); 9202 VPBlockUtils::disconnectBlocks(SinkRegion, Succ); 9203 VPBlockUtils::connectBlocks(Pred, Succ); 9204 9205 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9206 9207 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9208 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9209 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9210 if (VPBB == SplitPred) 9211 VPBB = SplitBlock; 9212 } 9213 9214 // Interleave memory: for each Interleave Group we marked earlier as relevant 9215 // for this VPlan, replace the Recipes widening its memory instructions with a 9216 // single VPInterleaveRecipe at its insertion point. 
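  // Continuing the factor-2 load example above (a sketch of the VPlan edit):
  //   before:  WIDEN load a[2*i];  WIDEN load a[2*i+1]
  //   after:   INTERLEAVE-GROUP with factor 2 at a[2*i]
  // The old widened-load recipes are erased and their users are rewired to the
  // values defined by the new interleave recipe.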
9217 for (auto IG : InterleaveGroups) { 9218 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9219 RecipeBuilder.getRecipe(IG->getInsertPos())); 9220 SmallVector<VPValue *, 4> StoredValues; 9221 for (unsigned i = 0; i < IG->getFactor(); ++i) 9222 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 9223 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 9224 9225 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9226 Recipe->getMask()); 9227 VPIG->insertBefore(Recipe); 9228 unsigned J = 0; 9229 for (unsigned i = 0; i < IG->getFactor(); ++i) 9230 if (Instruction *Member = IG->getMember(i)) { 9231 if (!Member->getType()->isVoidTy()) { 9232 VPValue *OriginalV = Plan->getVPValue(Member); 9233 Plan->removeVPValueFor(Member); 9234 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9235 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9236 J++; 9237 } 9238 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9239 } 9240 } 9241 9242 // Adjust the recipes for any inloop reductions. 9243 if (Range.Start.isVector()) 9244 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 9245 9246 // Finally, if tail is folded by masking, introduce selects between the phi 9247 // and the live-out instruction of each reduction, at the end of the latch. 9248 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 9249 Builder.setInsertPoint(VPBB); 9250 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9251 for (auto &Reduction : Legal->getReductionVars()) { 9252 if (CM.isInLoopReduction(Reduction.first)) 9253 continue; 9254 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 9255 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 9256 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 9257 } 9258 } 9259 9260 VPlanTransforms::sinkScalarOperands(*Plan); 9261 9262 std::string PlanName; 9263 raw_string_ostream RSO(PlanName); 9264 ElementCount VF = Range.Start; 9265 Plan->addVF(VF); 9266 RSO << "Initial VPlan for VF={" << VF; 9267 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9268 Plan->addVF(VF); 9269 RSO << "," << VF; 9270 } 9271 RSO << "},UF>=1"; 9272 RSO.flush(); 9273 Plan->setName(PlanName); 9274 9275 return Plan; 9276 } 9277 9278 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9279 // Outer loop handling: They may require CFG and instruction level 9280 // transformations before even evaluating whether vectorization is profitable. 9281 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9282 // the vectorization pipeline. 9283 assert(!OrigLoop->isInnermost()); 9284 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9285 9286 // Create new empty VPlan 9287 auto Plan = std::make_unique<VPlan>(); 9288 9289 // Build hierarchical CFG 9290 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9291 HCFGBuilder.buildHierarchicalCFG(); 9292 9293 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9294 VF *= 2) 9295 Plan->addVF(VF); 9296 9297 if (EnableVPlanPredication) { 9298 VPlanPredicator VPP(*Plan); 9299 VPP.predicate(); 9300 9301 // Avoid running transformation to recipes until masked code generation in 9302 // VPlan-native path is in place. 
9303     return Plan;
9304   }
9305
9306   SmallPtrSet<Instruction *, 1> DeadInstructions;
9307   VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9308                                              Legal->getInductionVars(),
9309                                              DeadInstructions, *PSE.getSE());
9310   return Plan;
9311 }
9312
9313 // Adjust the recipes for any inloop reductions. The chain of instructions
9314 // leading from the loop exit instr to the phi needs to be converted to
9315 // reductions, with one operand being vector and the other being the scalar
9316 // reduction chain.
9317 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
9318     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
9319   for (auto &Reduction : CM.getInLoopReductionChains()) {
9320     PHINode *Phi = Reduction.first;
9321     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9322     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9323
9324     // ReductionOperations are ordered top-down from the phi's use to the
9325     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9326     // which of the two operands will remain scalar and which will be reduced.
9327     // For minmax the chain will be the select instructions.
9328     Instruction *Chain = Phi;
9329     for (Instruction *R : ReductionOperations) {
9330       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9331       RecurKind Kind = RdxDesc.getRecurrenceKind();
9332
9333       VPValue *ChainOp = Plan->getVPValue(Chain);
9334       unsigned FirstOpId;
9335       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9336         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9337                "Expected to replace a VPWidenSelectSC");
9338         FirstOpId = 1;
9339       } else {
9340         assert(isa<VPWidenRecipe>(WidenRecipe) &&
9341                "Expected to replace a VPWidenSC");
9342         FirstOpId = 0;
9343       }
9344       unsigned VecOpId =
9345           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9346       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9347
9348       auto *CondOp = CM.foldTailByMasking()
9349                          ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9350 : nullptr; 9351 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9352 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9353 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9354 Plan->removeVPValueFor(R); 9355 Plan->addVPValue(R, RedRecipe); 9356 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9357 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9358 WidenRecipe->eraseFromParent(); 9359 9360 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9361 VPRecipeBase *CompareRecipe = 9362 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9363 assert(isa<VPWidenRecipe>(CompareRecipe) && 9364 "Expected to replace a VPWidenSC"); 9365 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9366 "Expected no remaining users"); 9367 CompareRecipe->eraseFromParent(); 9368 } 9369 Chain = R; 9370 } 9371 } 9372 } 9373 9374 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9375 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9376 VPSlotTracker &SlotTracker) const { 9377 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9378 IG->getInsertPos()->printAsOperand(O, false); 9379 O << ", "; 9380 getAddr()->printAsOperand(O, SlotTracker); 9381 VPValue *Mask = getMask(); 9382 if (Mask) { 9383 O << ", "; 9384 Mask->printAsOperand(O, SlotTracker); 9385 } 9386 for (unsigned i = 0; i < IG->getFactor(); ++i) 9387 if (Instruction *I = IG->getMember(i)) 9388 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9389 } 9390 #endif 9391 9392 void VPWidenCallRecipe::execute(VPTransformState &State) { 9393 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9394 *this, State); 9395 } 9396 9397 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9398 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9399 this, *this, InvariantCond, State); 9400 } 9401 9402 void VPWidenRecipe::execute(VPTransformState &State) { 9403 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9404 } 9405 9406 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9407 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9408 *this, State.UF, State.VF, IsPtrLoopInvariant, 9409 IsIndexLoopInvariant, State); 9410 } 9411 9412 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9413 assert(!State.Instance && "Int or FP induction being replicated."); 9414 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9415 getTruncInst(), getVPValue(0), 9416 getCastValue(), State); 9417 } 9418 9419 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9420 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9421 this, State); 9422 } 9423 9424 void VPBlendRecipe::execute(VPTransformState &State) { 9425 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9426 // We know that all PHIs in non-header blocks are converted into 9427 // selects, so we don't have to worry about the insertion order and we 9428 // can just use the builder. 9429 // At this point we generate the predication tree. There may be 9430 // duplications since this is a simple recursive scan, but future 9431 // optimizations will clean it up. 
9432 9433 unsigned NumIncoming = getNumIncomingValues(); 9434 9435 // Generate a sequence of selects of the form: 9436 // SELECT(Mask3, In3, 9437 // SELECT(Mask2, In2, 9438 // SELECT(Mask1, In1, 9439 // In0))) 9440 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9441 // are essentially undef are taken from In0. 9442 InnerLoopVectorizer::VectorParts Entry(State.UF); 9443 for (unsigned In = 0; In < NumIncoming; ++In) { 9444 for (unsigned Part = 0; Part < State.UF; ++Part) { 9445 // We might have single edge PHIs (blocks) - use an identity 9446 // 'select' for the first PHI operand. 9447 Value *In0 = State.get(getIncomingValue(In), Part); 9448 if (In == 0) 9449 Entry[Part] = In0; // Initialize with the first incoming value. 9450 else { 9451 // Select between the current value and the previous incoming edge 9452 // based on the incoming mask. 9453 Value *Cond = State.get(getMask(In), Part); 9454 Entry[Part] = 9455 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9456 } 9457 } 9458 } 9459 for (unsigned Part = 0; Part < State.UF; ++Part) 9460 State.set(this, Entry[Part], Part); 9461 } 9462 9463 void VPInterleaveRecipe::execute(VPTransformState &State) { 9464 assert(!State.Instance && "Interleave group being replicated."); 9465 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9466 getStoredValues(), getMask()); 9467 } 9468 9469 void VPReductionRecipe::execute(VPTransformState &State) { 9470 assert(!State.Instance && "Reduction being replicated."); 9471 Value *PrevInChain = State.get(getChainOp(), 0); 9472 for (unsigned Part = 0; Part < State.UF; ++Part) { 9473 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9474 bool IsOrdered = useOrderedReductions(*RdxDesc); 9475 Value *NewVecOp = State.get(getVecOp(), Part); 9476 if (VPValue *Cond = getCondOp()) { 9477 Value *NewCond = State.get(Cond, Part); 9478 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9479 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9480 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9481 Constant *IdenVec = 9482 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9483 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9484 NewVecOp = Select; 9485 } 9486 Value *NewRed; 9487 Value *NextInChain; 9488 if (IsOrdered) { 9489 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9490 PrevInChain); 9491 PrevInChain = NewRed; 9492 } else { 9493 PrevInChain = State.get(getChainOp(), Part); 9494 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9495 } 9496 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9497 NextInChain = 9498 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9499 NewRed, PrevInChain); 9500 } else if (IsOrdered) 9501 NextInChain = NewRed; 9502 else { 9503 NextInChain = State.Builder.CreateBinOp( 9504 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9505 PrevInChain); 9506 } 9507 State.set(this, NextInChain, Part); 9508 } 9509 } 9510 9511 void VPReplicateRecipe::execute(VPTransformState &State) { 9512 if (State.Instance) { // Generate a single instance. 9513 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9514 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9515 *State.Instance, IsPredicated, State); 9516 // Insert scalar instance packing it into a vector. 9517 if (AlsoPack && State.VF.isVector()) { 9518 // If we're constructing lane 0, initialize to start from poison. 
9519       if (State.Instance->Lane.isFirstLane()) {
9520         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9521         Value *Poison = PoisonValue::get(
9522             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9523         State.set(this, Poison, State.Instance->Part);
9524       }
9525       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9526     }
9527     return;
9528   }
9529
9530   // Generate scalar instances for all VF lanes of all UF parts, unless the
9531   // instruction is uniform, in which case generate only the first lane for
9532   // each of the UF parts.
9533   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9534   assert((!State.VF.isScalable() || IsUniform) &&
9535          "Can't scalarize a scalable vector");
9536   for (unsigned Part = 0; Part < State.UF; ++Part)
9537     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9538       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9539                                       VPIteration(Part, Lane), IsPredicated,
9540                                       State);
9541 }
9542
9543 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9544   assert(State.Instance && "Branch on Mask works only on single instance.");
9545
9546   unsigned Part = State.Instance->Part;
9547   unsigned Lane = State.Instance->Lane.getKnownLane();
9548
9549   Value *ConditionBit = nullptr;
9550   VPValue *BlockInMask = getMask();
9551   if (BlockInMask) {
9552     ConditionBit = State.get(BlockInMask, Part);
9553     if (ConditionBit->getType()->isVectorTy())
9554       ConditionBit = State.Builder.CreateExtractElement(
9555           ConditionBit, State.Builder.getInt32(Lane));
9556   } else // Block in mask is all-one.
9557     ConditionBit = State.Builder.getTrue();
9558
9559   // Replace the temporary unreachable terminator with a new conditional branch,
9560   // whose two destinations will be set later when they are created.
9561   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9562   assert(isa<UnreachableInst>(CurrentTerminator) &&
9563          "Expected to replace unreachable terminator with conditional branch.");
9564   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9565   CondBr->setSuccessor(0, nullptr);
9566   ReplaceInstWithInst(CurrentTerminator, CondBr);
9567 }
9568
9569 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9570   assert(State.Instance && "Predicated instruction PHI works per instance.");
9571   Instruction *ScalarPredInst =
9572       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9573   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9574   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9575   assert(PredicatingBB && "Predicated block has no single predecessor.");
9576   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9577          "operand must be VPReplicateRecipe");
9578
9579   // By current pack/unpack logic we need to generate only a single phi node: if
9580   // a vector value for the predicated instruction exists at this point it means
9581   // the instruction has vector users only, and a phi for the vector value is
9582   // needed. In this case the recipe of the predicated instruction is marked to
9583   // also do that packing, thereby "hoisting" the insert-element sequence.
9584   // Otherwise, a phi node for the scalar value is needed.
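  // For example (illustrative IR; block and value names are made up), the
  // packed case produces a vector phi such as
  //   %vphi = phi <4 x i32> [ %vec, %pred.entry ], [ %vec.insert, %pred.if ]
  // while the scalar-only case produces
  //   %sphi = phi i32 [ poison, %pred.entry ], [ %scalar.val, %pred.if ]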
9585 unsigned Part = State.Instance->Part; 9586 if (State.hasVectorValue(getOperand(0), Part)) { 9587 Value *VectorValue = State.get(getOperand(0), Part); 9588 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9589 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9590 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9591 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9592 if (State.hasVectorValue(this, Part)) 9593 State.reset(this, VPhi, Part); 9594 else 9595 State.set(this, VPhi, Part); 9596 // NOTE: Currently we need to update the value of the operand, so the next 9597 // predicated iteration inserts its generated value in the correct vector. 9598 State.reset(getOperand(0), VPhi, Part); 9599 } else { 9600 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9601 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9602 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9603 PredicatingBB); 9604 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9605 if (State.hasScalarValue(this, *State.Instance)) 9606 State.reset(this, Phi, *State.Instance); 9607 else 9608 State.set(this, Phi, *State.Instance); 9609 // NOTE: Currently we need to update the value of the operand, so the next 9610 // predicated iteration inserts its generated value in the correct vector. 9611 State.reset(getOperand(0), Phi, *State.Instance); 9612 } 9613 } 9614 9615 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9616 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9617 State.ILV->vectorizeMemoryInstruction( 9618 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9619 StoredValue, getMask()); 9620 } 9621 9622 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9623 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9624 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9625 // for predication. 9626 static ScalarEpilogueLowering getScalarEpilogueLowering( 9627 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9628 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9629 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9630 LoopVectorizationLegality &LVL) { 9631 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9632 // don't look at hints or options, and don't request a scalar epilogue. 9633 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9634 // LoopAccessInfo (due to code dependency and not being able to reliably get 9635 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9636 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9637 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9638 // back to the old way and vectorize with versioning when forced. See D81345.) 
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives.
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    };
  }

  // 3) If set, obey the hints.
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  };

  // 4) If the TTI hook indicates this is profitable, request predication.
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));

  // Set the insert point after the last scalarized instruction. This
  // ensures the insertelement sequence will directly follow the scalar
  // definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP = std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements, starting from poison.
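    // For example (illustrative IR, assuming VF=4 and an i32 value; the
    // register names are made up):
    //   %pack.0 = insertelement <4 x i32> poison,  i32 %s0, i32 0
    //   %pack.1 = insertelement <4 x i32> %pack.0, i32 %s1, i32 1
    //   %pack.2 = insertelement <4 x i32> %pack.1, i32 %s2, i32 2
    //   %pack.3 = insertelement <4 x i32> %pack.2, i32 %s3, i32 3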
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop contains such conversions there will be a
// performance penalty from the conversion overhead and the change in the
// vector width.
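// For example (illustrative IR, not taken from a real test case), a pattern
// such as
//   %e = fpext half %h to float
//   store float %e, float* %p
// inside the loop triggers the remark, because the widened loop would mix
// half and float vector elements (an up-cast that changes the effective
// vector width).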
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
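  // (Legality covers, for example, that memory dependences allow
  // vectorization via LoopAccessAnalysis, that reductions and inductions are
  // in a recognizable form, and that any internal control flow can be
  // if-converted; see LoopVectorizationLegality::canVectorize.)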
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny
  // trip count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!LVL.canVectorizeFPMath(EnableStrictReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being
    // explicitly requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to
    // not be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not beneficial to vectorize the loop, then
      // interleave it.
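      // (With -Rpass=loop-vectorize, the remark emitted below surfaces to the
      // user roughly as, e.g.:
      //   remark: ...: interleaved loop (interleaved count: 2)
      // the count shown here is only an example.)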
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar
        // epilogue to be vectorized by executing the plan (potentially with a
        // different factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that
        // is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
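    // (Concretely this attaches loop metadata along the lines of
    //  !{"llvm.loop.isvectorized", i32 1}; the exact spelling is shown here
    //  only for illustration -- see LoopVectorizeHints::setAlreadyVectorized.)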
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/DominatorTree analyses with outer
  // loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
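// For reference: under the new pass manager this entry point is reached via,
// e.g., 'opt -passes=loop-vectorize -S input.ll'.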