//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
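//
// For example (an illustration added here, not taken from the papers cited
// below): a scalar loop such as
//   for (i = 0; i < n; i++) a[i] += b[i];
// is conceptually turned, for VF = 4, into a loop whose single 'wide'
// iteration adds four consecutive elements of 'b' to four consecutive
// elements of 'a', with a scalar epilogue (or a predicated tail) handling the
// remaining iterations.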
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//   D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//   Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//   Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//   Data for SIMD.
//
// Other ideas/concepts are from:
//   A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//   S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//   Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
// If tail-folding fails, there are different fallback strategies depending on
// these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

cl::opt<bool> EnableStrictReductions(
    "enable-strict-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
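
// Illustrative note (added; not part of the original comments): under a
// typical data layout, an i1 has an allocation size of 8 bits but a type size
// of 1 bit, so hasIrregularType(i1, DL) returns true, whereas an i32 (32 bits
// for both sizes) is considered regular.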

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've had to
  /// make are correct.
  /// Returns the block containing the checks or nullptr if no checks have
  /// been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and
  /// return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                          << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
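
// Illustrative note (added; not part of the original comments): with Step == 2
// and VF == <vscale x 4>, createStepForVF returns "vscale * 8" (built via
// CreateVScale), while for a fixed VF of 4 it returns the constant 8. The same
// pattern is used by getRuntimeVF below.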

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To, Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way.
  /// The form of an instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
1356 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1357 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1358 !isProfitableToScalarize(I, VF) &&
1359 !isScalarAfterVectorization(I, VF);
1360 }
1361
1362 /// Decision that was taken during cost calculation for memory instruction.
1363 enum InstWidening {
1364 CM_Unknown,
1365 CM_Widen, // For consecutive accesses with stride +1.
1366 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1367 CM_Interleave,
1368 CM_GatherScatter,
1369 CM_Scalarize
1370 };
1371
1372 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1373 /// instruction \p I and vector width \p VF.
1374 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1375 InstructionCost Cost) {
1376 assert(VF.isVector() && "Expected VF >=2");
1377 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1378 }
1379
1380 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1381 /// interleaving group \p Grp and vector width \p VF.
1382 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1383 ElementCount VF, InstWidening W,
1384 InstructionCost Cost) {
1385 assert(VF.isVector() && "Expected VF >=2");
1386 // Broadcast this decision to all instructions inside the group.
1387 // But the cost will be assigned to one instruction only.
1388 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1389 if (auto *I = Grp->getMember(i)) {
1390 if (Grp->getInsertPos() == I)
1391 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1392 else
1393 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1394 }
1395 }
1396 }
1397
1398 /// Return the cost model decision for the given instruction \p I and vector
1399 /// width \p VF. Return CM_Unknown if this instruction did not pass
1400 /// through the cost modeling.
1401 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1402 assert(VF.isVector() && "Expected VF to be a vector VF");
1403 // Cost model is not run in the VPlan-native path - return conservative
1404 // result until this changes.
1405 if (EnableVPlanNativePath)
1406 return CM_GatherScatter;
1407
1408 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1409 auto Itr = WideningDecisions.find(InstOnVF);
1410 if (Itr == WideningDecisions.end())
1411 return CM_Unknown;
1412 return Itr->second.first;
1413 }
1414
1415 /// Return the vectorization cost for the given instruction \p I and vector
1416 /// width \p VF.
1417 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1418 assert(VF.isVector() && "Expected VF >=2");
1419 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1420 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1421 "The cost is not calculated");
1422 return WideningDecisions[InstOnVF].second;
1423 }
1424
1425 /// Return True if instruction \p I is an optimizable truncate whose operand
1426 /// is an induction variable. Such a truncate will be removed by adding a new
1427 /// induction variable with the destination type.
1428 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1429 // If the instruction is not a truncate, return false.
1430 auto *Trunc = dyn_cast<TruncInst>(I);
1431 if (!Trunc)
1432 return false;
1433
1434 // Get the source and destination types of the truncate.
1435 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1436 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1437 1438 // If the truncate is free for the given types, return false. Replacing a 1439 // free truncate with an induction variable would add an induction variable 1440 // update instruction to each iteration of the loop. We exclude from this 1441 // check the primary induction variable since it will need an update 1442 // instruction regardless. 1443 Value *Op = Trunc->getOperand(0); 1444 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1445 return false; 1446 1447 // If the truncated value is not an induction variable, return false. 1448 return Legal->isInductionPhi(Op); 1449 } 1450 1451 /// Collects the instructions to scalarize for each predicated instruction in 1452 /// the loop. 1453 void collectInstsToScalarize(ElementCount VF); 1454 1455 /// Collect Uniform and Scalar values for the given \p VF. 1456 /// The sets depend on CM decision for Load/Store instructions 1457 /// that may be vectorized as interleave, gather-scatter or scalarized. 1458 void collectUniformsAndScalars(ElementCount VF) { 1459 // Do the analysis once. 1460 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1461 return; 1462 setCostBasedWideningDecision(VF); 1463 collectLoopUniforms(VF); 1464 collectLoopScalars(VF); 1465 } 1466 1467 /// Returns true if the target machine supports masked store operation 1468 /// for the given \p DataType and kind of access to \p Ptr. 1469 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1470 return Legal->isConsecutivePtr(Ptr) && 1471 TTI.isLegalMaskedStore(DataType, Alignment); 1472 } 1473 1474 /// Returns true if the target machine supports masked load operation 1475 /// for the given \p DataType and kind of access to \p Ptr. 1476 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1477 return Legal->isConsecutivePtr(Ptr) && 1478 TTI.isLegalMaskedLoad(DataType, Alignment); 1479 } 1480 1481 /// Returns true if the target machine supports masked scatter operation 1482 /// for the given \p DataType. 1483 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { 1484 return TTI.isLegalMaskedScatter(DataType, Alignment); 1485 } 1486 1487 /// Returns true if the target machine supports masked gather operation 1488 /// for the given \p DataType. 1489 bool isLegalMaskedGather(Type *DataType, Align Alignment) const { 1490 return TTI.isLegalMaskedGather(DataType, Alignment); 1491 } 1492 1493 /// Returns true if the target machine can represent \p V as a masked gather 1494 /// or scatter operation. 1495 bool isLegalGatherOrScatter(Value *V) { 1496 bool LI = isa<LoadInst>(V); 1497 bool SI = isa<StoreInst>(V); 1498 if (!LI && !SI) 1499 return false; 1500 auto *Ty = getMemInstValueType(V); 1501 Align Align = getLoadStoreAlignment(V); 1502 return (LI && isLegalMaskedGather(Ty, Align)) || 1503 (SI && isLegalMaskedScatter(Ty, Align)); 1504 } 1505 1506 /// Returns true if the target machine supports all of the reduction 1507 /// variables found for the given VF. 1508 bool canVectorizeReductions(ElementCount VF) { 1509 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1510 RecurrenceDescriptor RdxDesc = Reduction.second; 1511 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1512 })); 1513 } 1514 1515 /// Returns true if \p I is an instruction that will be scalarized with 1516 /// predication. 
Such instructions include conditional stores and
1517 /// instructions that may divide by zero.
1518 /// If a non-zero VF has been calculated, we check if I will be scalarized with
1519 /// predication for that VF.
1520 bool isScalarWithPredication(Instruction *I) const;
1521
1522 // Returns true if \p I is an instruction that will be predicated either
1523 // through scalar predication or masked load/store or masked gather/scatter.
1524 // Superset of instructions that return true for isScalarWithPredication.
1525 bool isPredicatedInst(Instruction *I) {
1526 if (!blockNeedsPredication(I->getParent()))
1527 return false;
1528 // Loads and stores that need some form of masked operation are predicated
1529 // instructions.
1530 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1531 return Legal->isMaskRequired(I);
1532 return isScalarWithPredication(I);
1533 }
1534
1535 /// Returns true if \p I is a memory instruction with consecutive memory
1536 /// access that can be widened.
1537 bool
1538 memoryInstructionCanBeWidened(Instruction *I,
1539 ElementCount VF = ElementCount::getFixed(1));
1540
1541 /// Returns true if \p I is a memory instruction in an interleaved-group
1542 /// of memory accesses that can be vectorized with wide vector loads/stores
1543 /// and shuffles.
1544 bool
1545 interleavedAccessCanBeWidened(Instruction *I,
1546 ElementCount VF = ElementCount::getFixed(1));
1547
1548 /// Check if \p Instr belongs to any interleaved access group.
1549 bool isAccessInterleaved(Instruction *Instr) {
1550 return InterleaveInfo.isInterleaved(Instr);
1551 }
1552
1553 /// Get the interleaved access group that \p Instr belongs to.
1554 const InterleaveGroup<Instruction> *
1555 getInterleavedAccessGroup(Instruction *Instr) {
1556 return InterleaveInfo.getInterleaveGroup(Instr);
1557 }
1558
1559 /// Returns true if we're required to use a scalar epilogue for at least
1560 /// the final iteration of the original loop.
1561 bool requiresScalarEpilogue() const {
1562 if (!isScalarEpilogueAllowed())
1563 return false;
1564 // If we might exit from anywhere but the latch, we must run the exiting
1565 // iteration in scalar form.
1566 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1567 return true;
1568 return InterleaveInfo.requiresScalarEpilogue();
1569 }
1570
1571 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1572 /// disallowed due to optsize or a loop hint annotation.
1573 bool isScalarEpilogueAllowed() const {
1574 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1575 }
1576
1577 /// Returns true if all loop blocks should be masked to fold the tail loop.
1578 bool foldTailByMasking() const { return FoldTailByMasking; }
1579
1580 bool blockNeedsPredication(BasicBlock *BB) const {
1581 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1582 }
1583
1584 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1585 /// nodes to the chain of instructions representing the reductions. Uses a
1586 /// MapVector to ensure deterministic iteration order.
1587 using ReductionChainMap =
1588 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1589
1590 /// Return the chain of instructions representing an inloop reduction.
1591 const ReductionChainMap &getInLoopReductionChains() const {
1592 return InLoopReductionChains;
1593 }
1594
1595 /// Returns true if the Phi is part of an inloop reduction.
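/// For example (illustrative): for a loop computing sum += a[i], the reduction
/// PHI for sum is a key in InLoopReductionChains when the reduction is
/// performed inside the loop rather than outside of it.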
1596 bool isInLoopReduction(PHINode *Phi) const {
1597 return InLoopReductionChains.count(Phi);
1598 }
1599
1600 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1601 /// with factor VF. Return the cost of the instruction, including
1602 /// scalarization overhead if it's needed.
1603 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1604
1605 /// Estimate cost of a call instruction CI if it were vectorized with factor
1606 /// VF. Return the cost of the instruction, including scalarization overhead
1607 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1608 /// scalarized,
1609 /// i.e. either a vector version isn't available or it is too expensive.
1610 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1611 bool &NeedToScalarize) const;
1612
1613 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1614 /// that of B.
1615 bool isMoreProfitable(const VectorizationFactor &A,
1616 const VectorizationFactor &B) const;
1617
1618 /// Invalidates decisions already taken by the cost model.
1619 void invalidateCostModelingDecisions() {
1620 WideningDecisions.clear();
1621 Uniforms.clear();
1622 Scalars.clear();
1623 }
1624
1625 private:
1626 unsigned NumPredStores = 0;
1627
1628 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1629 /// than zero. One is returned if vectorization should best be avoided due
1630 /// to cost.
1631 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1632 ElementCount UserVF);
1633
1634 /// \return the maximized element count based on the target's vector
1635 /// registers and the loop trip-count, but limited to a maximum safe VF.
1636 /// This is a helper function of computeFeasibleMaxVF.
1637 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1638 /// issue that occurred on one of the buildbots which cannot be reproduced
1639 /// without having access to the proprietary compiler (see comments on
1640 /// D98509). The issue is currently under investigation and this workaround
1641 /// will be removed as soon as possible.
1642 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1643 unsigned SmallestType,
1644 unsigned WidestType,
1645 const ElementCount &MaxSafeVF);
1646
1647 /// \return the maximum legal scalable VF, based on the safe max number
1648 /// of elements.
1649 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1650
1651 /// The vectorization cost is a combination of the cost itself and a boolean
1652 /// indicating whether any of the contributing operations will actually
1653 /// operate on vector values after type legalization in the backend. If this
1654 /// latter value is false, then all operations will be scalarized (i.e. no
1655 /// vectorization has actually taken place).
1656
1657
1658 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1659
1660 /// Returns the expected execution cost. The unit of the cost does
1661 /// not matter because we use the 'cost' units to compare different
1662 /// vector widths. The cost that is returned is *not* normalized by
1663 /// the factor width.
1664 VectorizationCostTy expectedCost(ElementCount VF);
1665
1666 /// Returns the execution time cost of an instruction for a given vector
1667 /// width. Vector width of one means scalar.
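/// The boolean component of the returned VectorizationCostTy indicates whether
/// the instruction is expected to operate on vector values after type
/// legalization, as described for VectorizationCostTy above.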
1668 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1669
1670 /// The cost-computation logic from getInstructionCost which provides
1671 /// the vector type as an output parameter.
1672 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1673 Type *&VectorTy);
1674
1675 /// Return the cost of instructions in an inloop reduction pattern, if \p I is
1676 /// part of that pattern.
1677 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1678 Type *VectorTy,
1679 TTI::TargetCostKind CostKind);
1680
1681 /// Calculate vectorization cost of memory instruction \p I.
1682 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1683
1684 /// The cost computation for a scalarized memory instruction.
1685 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1686
1687 /// The cost computation for an interleaving group of memory instructions.
1688 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1689
1690 /// The cost computation for a Gather/Scatter instruction.
1691 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1692
1693 /// The cost computation for widening instruction \p I with consecutive
1694 /// memory access.
1695 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1696
1697 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1698 /// Load: scalar load + broadcast.
1699 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1700 /// element)
1701 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1702
1703 /// Estimate the overhead of scalarizing an instruction. This is a
1704 /// convenience wrapper for the type-based getScalarizationOverhead API.
1705 InstructionCost getScalarizationOverhead(Instruction *I,
1706 ElementCount VF) const;
1707
1708 /// Returns whether the instruction is a load or store and will be emitted
1709 /// as a vector operation.
1710 bool isConsecutiveLoadOrStore(Instruction *I);
1711
1712 /// Returns true if an artificially high cost for emulated masked memrefs
1713 /// should be used.
1714 bool useEmulatedMaskMemRefHack(Instruction *I);
1715
1716 /// Map of scalar integer values to the smallest bitwidth they can be legally
1717 /// represented as. The vector equivalents of these values should be truncated
1718 /// to this type.
1719 MapVector<Instruction *, uint64_t> MinBWs;
1720
1721 /// A type representing the costs for instructions if they were to be
1722 /// scalarized rather than vectorized. The entries are Instruction-Cost
1723 /// pairs.
1724 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1725
1726 /// A set containing all BasicBlocks that are known to be present after
1727 /// vectorization as predicated blocks.
1728 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1729
1730 /// Records whether it is allowed to have the original scalar loop execute at
1731 /// least once. This may be needed as a fallback loop in case runtime
1732 /// aliasing/dependence checks fail, or to handle the tail/remainder
1733 /// iterations when the trip count is unknown or doesn't divide by the VF,
1734 /// or as a peel-loop to handle gaps in interleave-groups.
1735 /// Under optsize and when the trip count is very small we don't allow any
1736 /// iterations to execute in the scalar loop.
1737 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1738 1739 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1740 bool FoldTailByMasking = false; 1741 1742 /// A map holding scalar costs for different vectorization factors. The 1743 /// presence of a cost for an instruction in the mapping indicates that the 1744 /// instruction will be scalarized when vectorizing with the associated 1745 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1746 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1747 1748 /// Holds the instructions known to be uniform after vectorization. 1749 /// The data is collected per VF. 1750 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1751 1752 /// Holds the instructions known to be scalar after vectorization. 1753 /// The data is collected per VF. 1754 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1755 1756 /// Holds the instructions (address computations) that are forced to be 1757 /// scalarized. 1758 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1759 1760 /// PHINodes of the reductions that should be expanded in-loop along with 1761 /// their associated chains of reduction operations, in program order from top 1762 /// (PHI) to bottom 1763 ReductionChainMap InLoopReductionChains; 1764 1765 /// A Map of inloop reduction operations and their immediate chain operand. 1766 /// FIXME: This can be removed once reductions can be costed correctly in 1767 /// vplan. This was added to allow quick lookup to the inloop operations, 1768 /// without having to loop through InLoopReductionChains. 1769 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1770 1771 /// Returns the expected difference in cost from scalarizing the expression 1772 /// feeding a predicated instruction \p PredInst. The instructions to 1773 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1774 /// non-negative return value implies the expression will be scalarized. 1775 /// Currently, only single-use chains are considered for scalarization. 1776 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1777 ElementCount VF); 1778 1779 /// Collect the instructions that are uniform after vectorization. An 1780 /// instruction is uniform if we represent it with a single scalar value in 1781 /// the vectorized loop corresponding to each vector iteration. Examples of 1782 /// uniform instructions include pointer operands of consecutive or 1783 /// interleaved memory accesses. Note that although uniformity implies an 1784 /// instruction will be scalar, the reverse is not true. In general, a 1785 /// scalarized instruction will be represented by VF scalar values in the 1786 /// vectorized loop, each corresponding to an iteration of the original 1787 /// scalar loop. 1788 void collectLoopUniforms(ElementCount VF); 1789 1790 /// Collect the instructions that are scalar after vectorization. An 1791 /// instruction is scalar if it is known to be uniform or will be scalarized 1792 /// during vectorization. Non-uniform scalarized instructions will be 1793 /// represented by VF values in the vectorized loop, each corresponding to an 1794 /// iteration of the original scalar loop. 1795 void collectLoopScalars(ElementCount VF); 1796 1797 /// Keeps cost model vectorization decision and cost for instructions. 1798 /// Right now it is used for memory instructions only. 
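/// Each entry is keyed by an (instruction, VF) pair, so the same memory
/// instruction can carry a different decision and cost for each candidate
/// vectorization factor.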
1799 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1800 std::pair<InstWidening, InstructionCost>>; 1801 1802 DecisionList WideningDecisions; 1803 1804 /// Returns true if \p V is expected to be vectorized and it needs to be 1805 /// extracted. 1806 bool needsExtract(Value *V, ElementCount VF) const { 1807 Instruction *I = dyn_cast<Instruction>(V); 1808 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1809 TheLoop->isLoopInvariant(I)) 1810 return false; 1811 1812 // Assume we can vectorize V (and hence we need extraction) if the 1813 // scalars are not computed yet. This can happen, because it is called 1814 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1815 // the scalars are collected. That should be a safe assumption in most 1816 // cases, because we check if the operands have vectorizable types 1817 // beforehand in LoopVectorizationLegality. 1818 return Scalars.find(VF) == Scalars.end() || 1819 !isScalarAfterVectorization(I, VF); 1820 }; 1821 1822 /// Returns a range containing only operands needing to be extracted. 1823 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1824 ElementCount VF) const { 1825 return SmallVector<Value *, 4>(make_filter_range( 1826 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1827 } 1828 1829 /// Determines if we have the infrastructure to vectorize loop \p L and its 1830 /// epilogue, assuming the main loop is vectorized by \p VF. 1831 bool isCandidateForEpilogueVectorization(const Loop &L, 1832 const ElementCount VF) const; 1833 1834 /// Returns true if epilogue vectorization is considered profitable, and 1835 /// false otherwise. 1836 /// \p VF is the vectorization factor chosen for the original loop. 1837 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1838 1839 public: 1840 /// The loop that we evaluate. 1841 Loop *TheLoop; 1842 1843 /// Predicated scalar evolution analysis. 1844 PredicatedScalarEvolution &PSE; 1845 1846 /// Loop Info analysis. 1847 LoopInfo *LI; 1848 1849 /// Vectorization legality. 1850 LoopVectorizationLegality *Legal; 1851 1852 /// Vector target information. 1853 const TargetTransformInfo &TTI; 1854 1855 /// Target Library Info. 1856 const TargetLibraryInfo *TLI; 1857 1858 /// Demanded bits analysis. 1859 DemandedBits *DB; 1860 1861 /// Assumption cache. 1862 AssumptionCache *AC; 1863 1864 /// Interface to emit optimization remarks. 1865 OptimizationRemarkEmitter *ORE; 1866 1867 const Function *TheFunction; 1868 1869 /// Loop Vectorize Hint. 1870 const LoopVectorizeHints *Hints; 1871 1872 /// The interleave access information contains groups of interleaved accesses 1873 /// with the same stride and close to each other. 1874 InterleavedAccessInfo &InterleaveInfo; 1875 1876 /// Values to ignore in the cost model. 1877 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1878 1879 /// Values to ignore in the cost model when VF > 1. 1880 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1881 1882 /// Profitable vector factors. 1883 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1884 }; 1885 } // end namespace llvm 1886 1887 /// Helper struct to manage generating runtime checks for vectorization. 1888 /// 1889 /// The runtime checks are created up-front in temporary blocks to allow better 1890 /// estimating the cost and un-linked from the existing IR. After deciding to 1891 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1892 /// temporary blocks are completely removed. 
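/// Typical usage (illustrative): call Create() to materialize the checks up
/// front, use the generated blocks to estimate their cost, and later call
/// emitSCEVChecks() / emitMemRuntimeChecks() if vectorization goes ahead;
/// checks that remain unused are erased by the destructor.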
1893 class GeneratedRTChecks {
1894 /// Basic block which contains the generated SCEV checks, if any.
1895 BasicBlock *SCEVCheckBlock = nullptr;
1896
1897 /// The value representing the result of the generated SCEV checks. If it is
1898 /// nullptr, either no SCEV checks have been generated or they have been used.
1899 Value *SCEVCheckCond = nullptr;
1900
1901 /// Basic block which contains the generated memory runtime checks, if any.
1902 BasicBlock *MemCheckBlock = nullptr;
1903
1904 /// The value representing the result of the generated memory runtime checks.
1905 /// If it is nullptr, either no memory runtime checks have been generated or
1906 /// they have been used.
1907 Instruction *MemRuntimeCheckCond = nullptr;
1908
1909 DominatorTree *DT;
1910 LoopInfo *LI;
1911
1912 SCEVExpander SCEVExp;
1913 SCEVExpander MemCheckExp;
1914
1915 public:
1916 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1917 const DataLayout &DL)
1918 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1919 MemCheckExp(SE, DL, "scev.check") {}
1920
1921 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1922 /// accurately estimate the cost of the runtime checks. The blocks are
1923 /// un-linked from the IR and are added back during vector code generation. If
1924 /// there is no vector code generation, the check blocks are removed
1925 /// completely.
1926 void Create(Loop *L, const LoopAccessInfo &LAI,
1927 const SCEVUnionPredicate &UnionPred) {
1928
1929 BasicBlock *LoopHeader = L->getHeader();
1930 BasicBlock *Preheader = L->getLoopPreheader();
1931
1932 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1933 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1934 // may be used by SCEVExpander. The blocks will be un-linked from their
1935 // predecessors and removed from LI & DT at the end of the function.
1936 if (!UnionPred.isAlwaysTrue()) {
1937 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1938 nullptr, "vector.scevcheck");
1939
1940 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1941 &UnionPred, SCEVCheckBlock->getTerminator());
1942 }
1943
1944 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1945 if (RtPtrChecking.Need) {
1946 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1947 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1948 "vector.memcheck");
1949
1950 std::tie(std::ignore, MemRuntimeCheckCond) =
1951 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1952 RtPtrChecking.getChecks(), MemCheckExp);
1953 assert(MemRuntimeCheckCond &&
1954 "no RT checks generated although RtPtrChecking "
1955 "claimed checks are required");
1956 }
1957
1958 if (!MemCheckBlock && !SCEVCheckBlock)
1959 return;
1960
1961 // Unhook the temporary blocks with the checks, update various places
1962 // accordingly.
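// Each created check block is given an unreachable terminator and detached
// from LoopInfo and the dominator tree below; emitSCEVChecks() and
// emitMemRuntimeChecks() hook the blocks back up if vectorization goes ahead.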
1963 if (SCEVCheckBlock) 1964 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1965 if (MemCheckBlock) 1966 MemCheckBlock->replaceAllUsesWith(Preheader); 1967 1968 if (SCEVCheckBlock) { 1969 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1970 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1971 Preheader->getTerminator()->eraseFromParent(); 1972 } 1973 if (MemCheckBlock) { 1974 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1975 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1976 Preheader->getTerminator()->eraseFromParent(); 1977 } 1978 1979 DT->changeImmediateDominator(LoopHeader, Preheader); 1980 if (MemCheckBlock) { 1981 DT->eraseNode(MemCheckBlock); 1982 LI->removeBlock(MemCheckBlock); 1983 } 1984 if (SCEVCheckBlock) { 1985 DT->eraseNode(SCEVCheckBlock); 1986 LI->removeBlock(SCEVCheckBlock); 1987 } 1988 } 1989 1990 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1991 /// unused. 1992 ~GeneratedRTChecks() { 1993 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1994 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1995 if (!SCEVCheckCond) 1996 SCEVCleaner.markResultUsed(); 1997 1998 if (!MemRuntimeCheckCond) 1999 MemCheckCleaner.markResultUsed(); 2000 2001 if (MemRuntimeCheckCond) { 2002 auto &SE = *MemCheckExp.getSE(); 2003 // Memory runtime check generation creates compares that use expanded 2004 // values. Remove them before running the SCEVExpanderCleaners. 2005 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2006 if (MemCheckExp.isInsertedInstruction(&I)) 2007 continue; 2008 SE.forgetValue(&I); 2009 SE.eraseValueFromMap(&I); 2010 I.eraseFromParent(); 2011 } 2012 } 2013 MemCheckCleaner.cleanup(); 2014 SCEVCleaner.cleanup(); 2015 2016 if (SCEVCheckCond) 2017 SCEVCheckBlock->eraseFromParent(); 2018 if (MemRuntimeCheckCond) 2019 MemCheckBlock->eraseFromParent(); 2020 } 2021 2022 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2023 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2024 /// depending on the generated condition. 2025 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2026 BasicBlock *LoopVectorPreHeader, 2027 BasicBlock *LoopExitBlock) { 2028 if (!SCEVCheckCond) 2029 return nullptr; 2030 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2031 if (C->isZero()) 2032 return nullptr; 2033 2034 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2035 2036 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2037 // Create new preheader for vector loop. 2038 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2039 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2040 2041 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2042 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2043 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2044 SCEVCheckBlock); 2045 2046 DT->addNewBlock(SCEVCheckBlock, Pred); 2047 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2048 2049 ReplaceInstWithInst( 2050 SCEVCheckBlock->getTerminator(), 2051 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2052 // Mark the check as used, to prevent it from being removed during cleanup. 2053 SCEVCheckCond = nullptr; 2054 return SCEVCheckBlock; 2055 } 2056 2057 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2058 /// the branches to branch to the vector preheader or \p Bypass, depending on 2059 /// the generated condition. 
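/// Also clears MemRuntimeCheckCond to mark the memory checks as used, so that
/// the destructor does not erase MemCheckBlock.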
2060 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2061 BasicBlock *LoopVectorPreHeader) { 2062 // Check if we generated code that checks in runtime if arrays overlap. 2063 if (!MemRuntimeCheckCond) 2064 return nullptr; 2065 2066 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2067 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2068 MemCheckBlock); 2069 2070 DT->addNewBlock(MemCheckBlock, Pred); 2071 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2072 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2073 2074 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2075 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2076 2077 ReplaceInstWithInst( 2078 MemCheckBlock->getTerminator(), 2079 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2080 MemCheckBlock->getTerminator()->setDebugLoc( 2081 Pred->getTerminator()->getDebugLoc()); 2082 2083 // Mark the check as used, to prevent it from being removed during cleanup. 2084 MemRuntimeCheckCond = nullptr; 2085 return MemCheckBlock; 2086 } 2087 }; 2088 2089 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2090 // vectorization. The loop needs to be annotated with #pragma omp simd 2091 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2092 // vector length information is not provided, vectorization is not considered 2093 // explicit. Interleave hints are not allowed either. These limitations will be 2094 // relaxed in the future. 2095 // Please, note that we are currently forced to abuse the pragma 'clang 2096 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2097 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2098 // provides *explicit vectorization hints* (LV can bypass legal checks and 2099 // assume that vectorization is legal). However, both hints are implemented 2100 // using the same metadata (llvm.loop.vectorize, processed by 2101 // LoopVectorizeHints). This will be fixed in the future when the native IR 2102 // representation for pragma 'omp simd' is introduced. 2103 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2104 OptimizationRemarkEmitter *ORE) { 2105 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2106 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2107 2108 // Only outer loops with an explicit vectorization hint are supported. 2109 // Unannotated outer loops are ignored. 2110 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2111 return false; 2112 2113 Function *Fn = OuterLp->getHeader()->getParent(); 2114 if (!Hints.allowVectorization(Fn, OuterLp, 2115 true /*VectorizeOnlyWhenForced*/)) { 2116 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2117 return false; 2118 } 2119 2120 if (Hints.getInterleave() > 1) { 2121 // TODO: Interleave support is future work. 2122 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2123 "outer loops.\n"); 2124 Hints.emitRemarkWithHints(); 2125 return false; 2126 } 2127 2128 return true; 2129 } 2130 2131 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2132 OptimizationRemarkEmitter *ORE, 2133 SmallVectorImpl<Loop *> &V) { 2134 // Collect inner loops and outer loops without irreducible control flow. For 2135 // now, only collect outer loops that have explicit vectorization hints. If we 2136 // are stress testing the VPlan H-CFG construction, we collect the outermost 2137 // loop of every loop nest. 
2138 if (L.isInnermost() || VPlanBuildStressTest || 2139 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2140 LoopBlocksRPO RPOT(&L); 2141 RPOT.perform(LI); 2142 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2143 V.push_back(&L); 2144 // TODO: Collect inner loops inside marked outer loops in case 2145 // vectorization fails for the outer loop. Do not invoke 2146 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2147 // already known to be reducible. We can use an inherited attribute for 2148 // that. 2149 return; 2150 } 2151 } 2152 for (Loop *InnerL : L) 2153 collectSupportedLoops(*InnerL, LI, ORE, V); 2154 } 2155 2156 namespace { 2157 2158 /// The LoopVectorize Pass. 2159 struct LoopVectorize : public FunctionPass { 2160 /// Pass identification, replacement for typeid 2161 static char ID; 2162 2163 LoopVectorizePass Impl; 2164 2165 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2166 bool VectorizeOnlyWhenForced = false) 2167 : FunctionPass(ID), 2168 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2169 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2170 } 2171 2172 bool runOnFunction(Function &F) override { 2173 if (skipFunction(F)) 2174 return false; 2175 2176 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2177 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2178 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2179 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2180 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2181 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2182 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2183 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2184 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2185 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2186 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2187 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2188 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2189 2190 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2191 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2192 2193 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2194 GetLAA, *ORE, PSI).MadeAnyChange; 2195 } 2196 2197 void getAnalysisUsage(AnalysisUsage &AU) const override { 2198 AU.addRequired<AssumptionCacheTracker>(); 2199 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2200 AU.addRequired<DominatorTreeWrapperPass>(); 2201 AU.addRequired<LoopInfoWrapperPass>(); 2202 AU.addRequired<ScalarEvolutionWrapperPass>(); 2203 AU.addRequired<TargetTransformInfoWrapperPass>(); 2204 AU.addRequired<AAResultsWrapperPass>(); 2205 AU.addRequired<LoopAccessLegacyAnalysis>(); 2206 AU.addRequired<DemandedBitsWrapperPass>(); 2207 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2208 AU.addRequired<InjectTLIMappingsLegacy>(); 2209 2210 // We currently do not preserve loopinfo/dominator analyses with outer loop 2211 // vectorization. Until this is addressed, mark these analyses as preserved 2212 // only for non-VPlan-native path. 2213 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2214 if (!EnableVPlanNativePath) { 2215 AU.addPreserved<LoopInfoWrapperPass>(); 2216 AU.addPreserved<DominatorTreeWrapperPass>(); 2217 } 2218 2219 AU.addPreserved<BasicAAWrapperPass>(); 2220 AU.addPreserved<GlobalsAAWrapperPass>(); 2221 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2222 } 2223 }; 2224 2225 } // end anonymous namespace 2226 2227 //===----------------------------------------------------------------------===// 2228 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2229 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2230 //===----------------------------------------------------------------------===// 2231 2232 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2233 // We need to place the broadcast of invariant variables outside the loop, 2234 // but only if it's proven safe to do so. Else, broadcast will be inside 2235 // vector loop body. 2236 Instruction *Instr = dyn_cast<Instruction>(V); 2237 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2238 (!Instr || 2239 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2240 // Place the code for broadcasting invariant variables in the new preheader. 2241 IRBuilder<>::InsertPointGuard Guard(Builder); 2242 if (SafeToHoist) 2243 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2244 2245 // Broadcast the scalar into all locations in the vector. 2246 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2247 2248 return Shuf; 2249 } 2250 2251 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2252 const InductionDescriptor &II, Value *Step, Value *Start, 2253 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2254 VPTransformState &State) { 2255 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2256 "Expected either an induction phi-node or a truncate of it!"); 2257 2258 // Construct the initial value of the vector IV in the vector loop preheader 2259 auto CurrIP = Builder.saveIP(); 2260 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2261 if (isa<TruncInst>(EntryVal)) { 2262 assert(Start->getType()->isIntegerTy() && 2263 "Truncation requires an integer type"); 2264 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2265 Step = Builder.CreateTrunc(Step, TruncType); 2266 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2267 } 2268 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2269 Value *SteppedStart = 2270 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2271 2272 // We create vector phi nodes for both integer and floating-point induction 2273 // variables. Here, we determine the kind of arithmetic we will perform. 2274 Instruction::BinaryOps AddOp; 2275 Instruction::BinaryOps MulOp; 2276 if (Step->getType()->isIntegerTy()) { 2277 AddOp = Instruction::Add; 2278 MulOp = Instruction::Mul; 2279 } else { 2280 AddOp = II.getInductionOpcode(); 2281 MulOp = Instruction::FMul; 2282 } 2283 2284 // Multiply the vectorization factor by the step using integer or 2285 // floating-point arithmetic as appropriate. 2286 Type *StepType = Step->getType(); 2287 if (Step->getType()->isFloatingPointTy()) 2288 StepType = IntegerType::get(StepType->getContext(), 2289 StepType->getScalarSizeInBits()); 2290 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2291 if (Step->getType()->isFloatingPointTy()) 2292 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2293 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2294 2295 // Create a vector splat to use in the induction update. 
2296 // 2297 // FIXME: If the step is non-constant, we create the vector splat with 2298 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2299 // handle a constant vector splat. 2300 Value *SplatVF = isa<Constant>(Mul) 2301 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2302 : Builder.CreateVectorSplat(VF, Mul); 2303 Builder.restoreIP(CurrIP); 2304 2305 // We may need to add the step a number of times, depending on the unroll 2306 // factor. The last of those goes into the PHI. 2307 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2308 &*LoopVectorBody->getFirstInsertionPt()); 2309 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2310 Instruction *LastInduction = VecInd; 2311 for (unsigned Part = 0; Part < UF; ++Part) { 2312 State.set(Def, LastInduction, Part); 2313 2314 if (isa<TruncInst>(EntryVal)) 2315 addMetadata(LastInduction, EntryVal); 2316 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2317 State, Part); 2318 2319 LastInduction = cast<Instruction>( 2320 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2321 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2322 } 2323 2324 // Move the last step to the end of the latch block. This ensures consistent 2325 // placement of all induction updates. 2326 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2327 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2328 auto *ICmp = cast<Instruction>(Br->getCondition()); 2329 LastInduction->moveBefore(ICmp); 2330 LastInduction->setName("vec.ind.next"); 2331 2332 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2333 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2334 } 2335 2336 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2337 return Cost->isScalarAfterVectorization(I, VF) || 2338 Cost->isProfitableToScalarize(I, VF); 2339 } 2340 2341 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2342 if (shouldScalarizeInstruction(IV)) 2343 return true; 2344 auto isScalarInst = [&](User *U) -> bool { 2345 auto *I = cast<Instruction>(U); 2346 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2347 }; 2348 return llvm::any_of(IV->users(), isScalarInst); 2349 } 2350 2351 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2352 const InductionDescriptor &ID, const Instruction *EntryVal, 2353 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2354 unsigned Part, unsigned Lane) { 2355 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2356 "Expected either an induction phi-node or a truncate of it!"); 2357 2358 // This induction variable is not the phi from the original loop but the 2359 // newly-created IV based on the proof that casted Phi is equal to the 2360 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2361 // re-uses the same InductionDescriptor that original IV uses but we don't 2362 // have to do any recording in this case - that is done when original IV is 2363 // processed. 2364 if (isa<TruncInst>(EntryVal)) 2365 return; 2366 2367 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2368 if (Casts.empty()) 2369 return; 2370 // Only the first Cast instruction in the Casts vector is of interest. 2371 // The rest of the Casts (if exist) have no uses outside the 2372 // induction update chain itself. 
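// Record the widened value for the cast's VPValue, so users of the cast are
// mapped to the same vector (or per-lane) value as the uncasted induction.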
2373 if (Lane < UINT_MAX) 2374 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2375 else 2376 State.set(CastDef, VectorLoopVal, Part); 2377 } 2378 2379 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2380 TruncInst *Trunc, VPValue *Def, 2381 VPValue *CastDef, 2382 VPTransformState &State) { 2383 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2384 "Primary induction variable must have an integer type"); 2385 2386 auto II = Legal->getInductionVars().find(IV); 2387 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2388 2389 auto ID = II->second; 2390 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2391 2392 // The value from the original loop to which we are mapping the new induction 2393 // variable. 2394 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2395 2396 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2397 2398 // Generate code for the induction step. Note that induction steps are 2399 // required to be loop-invariant 2400 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2401 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2402 "Induction step should be loop invariant"); 2403 if (PSE.getSE()->isSCEVable(IV->getType())) { 2404 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2405 return Exp.expandCodeFor(Step, Step->getType(), 2406 LoopVectorPreHeader->getTerminator()); 2407 } 2408 return cast<SCEVUnknown>(Step)->getValue(); 2409 }; 2410 2411 // The scalar value to broadcast. This is derived from the canonical 2412 // induction variable. If a truncation type is given, truncate the canonical 2413 // induction variable and step. Otherwise, derive these values from the 2414 // induction descriptor. 2415 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2416 Value *ScalarIV = Induction; 2417 if (IV != OldInduction) { 2418 ScalarIV = IV->getType()->isIntegerTy() 2419 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2420 : Builder.CreateCast(Instruction::SIToFP, Induction, 2421 IV->getType()); 2422 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2423 ScalarIV->setName("offset.idx"); 2424 } 2425 if (Trunc) { 2426 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2427 assert(Step->getType()->isIntegerTy() && 2428 "Truncation requires an integer step"); 2429 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2430 Step = Builder.CreateTrunc(Step, TruncType); 2431 } 2432 return ScalarIV; 2433 }; 2434 2435 // Create the vector values from the scalar IV, in the absence of creating a 2436 // vector IV. 2437 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2438 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2439 for (unsigned Part = 0; Part < UF; ++Part) { 2440 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2441 Value *EntryPart = 2442 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2443 ID.getInductionOpcode()); 2444 State.set(Def, EntryPart, Part); 2445 if (Trunc) 2446 addMetadata(EntryPart, Trunc); 2447 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2448 State, Part); 2449 } 2450 }; 2451 2452 // Fast-math-flags propagate from the original induction instruction. 
2453 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2454 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2455 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2456 2457 // Now do the actual transformations, and start with creating the step value. 2458 Value *Step = CreateStepValue(ID.getStep()); 2459 if (VF.isZero() || VF.isScalar()) { 2460 Value *ScalarIV = CreateScalarIV(Step); 2461 CreateSplatIV(ScalarIV, Step); 2462 return; 2463 } 2464 2465 // Determine if we want a scalar version of the induction variable. This is 2466 // true if the induction variable itself is not widened, or if it has at 2467 // least one user in the loop that is not widened. 2468 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2469 if (!NeedsScalarIV) { 2470 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2471 State); 2472 return; 2473 } 2474 2475 // Try to create a new independent vector induction variable. If we can't 2476 // create the phi node, we will splat the scalar induction variable in each 2477 // loop iteration. 2478 if (!shouldScalarizeInstruction(EntryVal)) { 2479 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2480 State); 2481 Value *ScalarIV = CreateScalarIV(Step); 2482 // Create scalar steps that can be used by instructions we will later 2483 // scalarize. Note that the addition of the scalar steps will not increase 2484 // the number of instructions in the loop in the common case prior to 2485 // InstCombine. We will be trading one vector extract for each scalar step. 2486 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2487 return; 2488 } 2489 2490 // All IV users are scalar instructions, so only emit a scalar IV, not a 2491 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2492 // predicate used by the masked loads/stores. 2493 Value *ScalarIV = CreateScalarIV(Step); 2494 if (!Cost->isScalarEpilogueAllowed()) 2495 CreateSplatIV(ScalarIV, Step); 2496 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2497 } 2498 2499 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2500 Instruction::BinaryOps BinOp) { 2501 // Create and check the types. 2502 auto *ValVTy = cast<VectorType>(Val->getType()); 2503 ElementCount VLen = ValVTy->getElementCount(); 2504 2505 Type *STy = Val->getType()->getScalarType(); 2506 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2507 "Induction Step must be an integer or FP"); 2508 assert(Step->getType() == STy && "Step has wrong type"); 2509 2510 SmallVector<Constant *, 8> Indices; 2511 2512 // Create a vector of consecutive numbers from zero to VF. 2513 VectorType *InitVecValVTy = ValVTy; 2514 Type *InitVecValSTy = STy; 2515 if (STy->isFloatingPointTy()) { 2516 InitVecValSTy = 2517 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2518 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2519 } 2520 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2521 2522 // Add on StartIdx 2523 Value *StartIdxSplat = Builder.CreateVectorSplat( 2524 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2525 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2526 2527 if (STy->isIntegerTy()) { 2528 Step = Builder.CreateVectorSplat(VLen, Step); 2529 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2530 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2531 // which can be found from the original scalar operations. 
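// E.g. (illustrative): for a fixed VF of 4, StartIdx == 0 and a scalar step
// of 2, the code below adds <0, 2, 4, 6> to the lanes of Val.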
2532 Step = Builder.CreateMul(InitVec, Step); 2533 return Builder.CreateAdd(Val, Step, "induction"); 2534 } 2535 2536 // Floating point induction. 2537 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2538 "Binary Opcode should be specified for FP induction"); 2539 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2540 Step = Builder.CreateVectorSplat(VLen, Step); 2541 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2542 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2543 } 2544 2545 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2546 Instruction *EntryVal, 2547 const InductionDescriptor &ID, 2548 VPValue *Def, VPValue *CastDef, 2549 VPTransformState &State) { 2550 // We shouldn't have to build scalar steps if we aren't vectorizing. 2551 assert(VF.isVector() && "VF should be greater than one"); 2552 // Get the value type and ensure it and the step have the same integer type. 2553 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2554 assert(ScalarIVTy == Step->getType() && 2555 "Val and Step should have the same type"); 2556 2557 // We build scalar steps for both integer and floating-point induction 2558 // variables. Here, we determine the kind of arithmetic we will perform. 2559 Instruction::BinaryOps AddOp; 2560 Instruction::BinaryOps MulOp; 2561 if (ScalarIVTy->isIntegerTy()) { 2562 AddOp = Instruction::Add; 2563 MulOp = Instruction::Mul; 2564 } else { 2565 AddOp = ID.getInductionOpcode(); 2566 MulOp = Instruction::FMul; 2567 } 2568 2569 // Determine the number of scalars we need to generate for each unroll 2570 // iteration. If EntryVal is uniform, we only need to generate the first 2571 // lane. Otherwise, we generate all VF values. 2572 bool IsUniform = 2573 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2574 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2575 // Compute the scalar steps and save the results in State. 2576 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2577 ScalarIVTy->getScalarSizeInBits()); 2578 Type *VecIVTy = nullptr; 2579 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2580 if (!IsUniform && VF.isScalable()) { 2581 VecIVTy = VectorType::get(ScalarIVTy, VF); 2582 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2583 SplatStep = Builder.CreateVectorSplat(VF, Step); 2584 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2585 } 2586 2587 for (unsigned Part = 0; Part < UF; ++Part) { 2588 Value *StartIdx0 = 2589 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2590 2591 if (!IsUniform && VF.isScalable()) { 2592 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2593 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2594 if (ScalarIVTy->isFloatingPointTy()) 2595 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2596 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2597 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2598 State.set(Def, Add, Part); 2599 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2600 Part); 2601 // It's useful to record the lane values too for the known minimum number 2602 // of elements so we do those below. This improves the code quality when 2603 // trying to extract the first element, for example. 
2604 } 2605 2606 if (ScalarIVTy->isFloatingPointTy()) 2607 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2608 2609 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2610 Value *StartIdx = Builder.CreateBinOp( 2611 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2612 // The step returned by `createStepForVF` is a runtime-evaluated value 2613 // when VF is scalable. Otherwise, it should be folded into a Constant. 2614 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2615 "Expected StartIdx to be folded to a constant when VF is not " 2616 "scalable"); 2617 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2618 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2619 State.set(Def, Add, VPIteration(Part, Lane)); 2620 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2621 Part, Lane); 2622 } 2623 } 2624 } 2625 2626 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2627 const VPIteration &Instance, 2628 VPTransformState &State) { 2629 Value *ScalarInst = State.get(Def, Instance); 2630 Value *VectorValue = State.get(Def, Instance.Part); 2631 VectorValue = Builder.CreateInsertElement( 2632 VectorValue, ScalarInst, 2633 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2634 State.set(Def, VectorValue, Instance.Part); 2635 } 2636 2637 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2638 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2639 return Builder.CreateVectorReverse(Vec, "reverse"); 2640 } 2641 2642 // Return whether we allow using masked interleave-groups (for dealing with 2643 // strided loads/stores that reside in predicated blocks, or for dealing 2644 // with gaps). 2645 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2646 // If an override option has been passed in for interleaved accesses, use it. 2647 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2648 return EnableMaskedInterleavedMemAccesses; 2649 2650 return TTI.enableMaskedInterleavedAccessVectorization(); 2651 } 2652 2653 // Try to vectorize the interleave group that \p Instr belongs to. 2654 // 2655 // E.g. Translate following interleaved load group (factor = 3): 2656 // for (i = 0; i < N; i+=3) { 2657 // R = Pic[i]; // Member of index 0 2658 // G = Pic[i+1]; // Member of index 1 2659 // B = Pic[i+2]; // Member of index 2 2660 // ... // do something to R, G, B 2661 // } 2662 // To: 2663 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2664 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2665 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2666 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2667 // 2668 // Or translate following interleaved store group (factor = 3): 2669 // for (i = 0; i < N; i+=3) { 2670 // ... 
do something to R, G, B 2671 // Pic[i] = R; // Member of index 0 2672 // Pic[i+1] = G; // Member of index 1 2673 // Pic[i+2] = B; // Member of index 2 2674 // } 2675 // To: 2676 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2677 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2678 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2679 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2680 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2681 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2682 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2683 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2684 VPValue *BlockInMask) { 2685 Instruction *Instr = Group->getInsertPos(); 2686 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2687 2688 // Prepare for the vector type of the interleaved load/store. 2689 Type *ScalarTy = getMemInstValueType(Instr); 2690 unsigned InterleaveFactor = Group->getFactor(); 2691 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2692 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2693 2694 // Prepare for the new pointers. 2695 SmallVector<Value *, 2> AddrParts; 2696 unsigned Index = Group->getIndex(Instr); 2697 2698 // TODO: extend the masked interleaved-group support to reversed access. 2699 assert((!BlockInMask || !Group->isReverse()) && 2700 "Reversed masked interleave-group not supported."); 2701 2702 // If the group is reverse, adjust the index to refer to the last vector lane 2703 // instead of the first. We adjust the index from the first vector lane, 2704 // rather than directly getting the pointer for lane VF - 1, because the 2705 // pointer operand of the interleaved access is supposed to be uniform. For 2706 // uniform instructions, we're only required to generate a value for the 2707 // first vector lane in each unroll iteration. 2708 if (Group->isReverse()) 2709 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2710 2711 for (unsigned Part = 0; Part < UF; Part++) { 2712 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2713 setDebugLocFromInst(Builder, AddrPart); 2714 2715 // Notice current instruction could be any index. Need to adjust the address 2716 // to the member of index 0. 2717 // 2718 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2719 // b = A[i]; // Member of index 0 2720 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2721 // 2722 // E.g. A[i+1] = a; // Member of index 1 2723 // A[i] = b; // Member of index 0 2724 // A[i+2] = c; // Member of index 2 (Current instruction) 2725 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2726 2727 bool InBounds = false; 2728 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2729 InBounds = gep->isInBounds(); 2730 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2731 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2732 2733 // Cast to the vector pointer type. 
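    // E.g., with an assumed VF = 4 and interleave factor 3 (illustrative
    // values only), this turns an i32* member pointer into a <12 x i32>* so
    // that a single wide access covers all three members of four consecutive
    // tuples.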
2734 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2735 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2736 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2737 } 2738 2739 setDebugLocFromInst(Builder, Instr); 2740 Value *PoisonVec = PoisonValue::get(VecTy); 2741 2742 Value *MaskForGaps = nullptr; 2743 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2744 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2745 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2746 } 2747 2748 // Vectorize the interleaved load group. 2749 if (isa<LoadInst>(Instr)) { 2750 // For each unroll part, create a wide load for the group. 2751 SmallVector<Value *, 2> NewLoads; 2752 for (unsigned Part = 0; Part < UF; Part++) { 2753 Instruction *NewLoad; 2754 if (BlockInMask || MaskForGaps) { 2755 assert(useMaskedInterleavedAccesses(*TTI) && 2756 "masked interleaved groups are not allowed."); 2757 Value *GroupMask = MaskForGaps; 2758 if (BlockInMask) { 2759 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2760 Value *ShuffledMask = Builder.CreateShuffleVector( 2761 BlockInMaskPart, 2762 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2763 "interleaved.mask"); 2764 GroupMask = MaskForGaps 2765 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2766 MaskForGaps) 2767 : ShuffledMask; 2768 } 2769 NewLoad = 2770 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2771 GroupMask, PoisonVec, "wide.masked.vec"); 2772 } 2773 else 2774 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2775 Group->getAlign(), "wide.vec"); 2776 Group->addMetadata(NewLoad); 2777 NewLoads.push_back(NewLoad); 2778 } 2779 2780 // For each member in the group, shuffle out the appropriate data from the 2781 // wide loads. 2782 unsigned J = 0; 2783 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2784 Instruction *Member = Group->getMember(I); 2785 2786 // Skip the gaps in the group. 2787 if (!Member) 2788 continue; 2789 2790 auto StrideMask = 2791 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2792 for (unsigned Part = 0; Part < UF; Part++) { 2793 Value *StridedVec = Builder.CreateShuffleVector( 2794 NewLoads[Part], StrideMask, "strided.vec"); 2795 2796 // If this member has different type, cast the result type. 2797 if (Member->getType() != ScalarTy) { 2798 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2799 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2800 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2801 } 2802 2803 if (Group->isReverse()) 2804 StridedVec = reverseVector(StridedVec); 2805 2806 State.set(VPDefs[J], StridedVec, Part); 2807 } 2808 ++J; 2809 } 2810 return; 2811 } 2812 2813 // The sub vector type for current instruction. 2814 auto *SubVT = VectorType::get(ScalarTy, VF); 2815 2816 // Vectorize the interleaved store group. 2817 for (unsigned Part = 0; Part < UF; Part++) { 2818 // Collect the stored vector from each member. 2819 SmallVector<Value *, 4> StoredVecs; 2820 for (unsigned i = 0; i < InterleaveFactor; i++) { 2821 // Interleaved store group doesn't allow a gap, so each index has a member 2822 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2823 2824 Value *StoredVec = State.get(StoredValues[i], Part); 2825 2826 if (Group->isReverse()) 2827 StoredVec = reverseVector(StoredVec); 2828 2829 // If this member has different type, cast it to a unified type. 
2830 2831 if (StoredVec->getType() != SubVT) 2832 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2833 2834 StoredVecs.push_back(StoredVec); 2835 } 2836 2837 // Concatenate all vectors into a wide vector. 2838 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2839 2840 // Interleave the elements in the wide vector. 2841 Value *IVec = Builder.CreateShuffleVector( 2842 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2843 "interleaved.vec"); 2844 2845 Instruction *NewStoreInstr; 2846 if (BlockInMask) { 2847 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2848 Value *ShuffledMask = Builder.CreateShuffleVector( 2849 BlockInMaskPart, 2850 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2851 "interleaved.mask"); 2852 NewStoreInstr = Builder.CreateMaskedStore( 2853 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2854 } 2855 else 2856 NewStoreInstr = 2857 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2858 2859 Group->addMetadata(NewStoreInstr); 2860 } 2861 } 2862 2863 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2864 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2865 VPValue *StoredValue, VPValue *BlockInMask) { 2866 // Attempt to issue a wide load. 2867 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2868 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2869 2870 assert((LI || SI) && "Invalid Load/Store instruction"); 2871 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2872 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2873 2874 LoopVectorizationCostModel::InstWidening Decision = 2875 Cost->getWideningDecision(Instr, VF); 2876 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2877 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2878 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2879 "CM decision is not to widen the memory instruction"); 2880 2881 Type *ScalarDataTy = getMemInstValueType(Instr); 2882 2883 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2884 const Align Alignment = getLoadStoreAlignment(Instr); 2885 2886 // Determine if the pointer operand of the access is either consecutive or 2887 // reverse consecutive. 2888 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2889 bool ConsecutiveStride = 2890 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2891 bool CreateGatherScatter = 2892 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2893 2894 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2895 // gather/scatter. Otherwise Decision should have been to Scalarize. 2896 assert((ConsecutiveStride || CreateGatherScatter) && 2897 "The instruction should be scalarized"); 2898 (void)ConsecutiveStride; 2899 2900 VectorParts BlockInMaskParts(UF); 2901 bool isMaskRequired = BlockInMask; 2902 if (isMaskRequired) 2903 for (unsigned Part = 0; Part < UF; ++Part) 2904 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2905 2906 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2907 // Calculate the pointer for the specific unroll-part. 2908 GetElementPtrInst *PartPtr = nullptr; 2909 2910 bool InBounds = false; 2911 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2912 InBounds = gep->isInBounds(); 2913 if (Reverse) { 2914 // If the address is consecutive but reversed, then the 2915 // wide store needs to start at the last vector element. 
2916 // RunTimeVF = VScale * VF.getKnownMinValue() 2917 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2918 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2919 // NumElt = -Part * RunTimeVF 2920 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2921 // LastLane = 1 - RunTimeVF 2922 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2923 PartPtr = 2924 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2925 PartPtr->setIsInBounds(InBounds); 2926 PartPtr = cast<GetElementPtrInst>( 2927 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2928 PartPtr->setIsInBounds(InBounds); 2929 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2930 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2931 } else { 2932 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2933 PartPtr = cast<GetElementPtrInst>( 2934 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2935 PartPtr->setIsInBounds(InBounds); 2936 } 2937 2938 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2939 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2940 }; 2941 2942 // Handle Stores: 2943 if (SI) { 2944 setDebugLocFromInst(Builder, SI); 2945 2946 for (unsigned Part = 0; Part < UF; ++Part) { 2947 Instruction *NewSI = nullptr; 2948 Value *StoredVal = State.get(StoredValue, Part); 2949 if (CreateGatherScatter) { 2950 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2951 Value *VectorGep = State.get(Addr, Part); 2952 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2953 MaskPart); 2954 } else { 2955 if (Reverse) { 2956 // If we store to reverse consecutive memory locations, then we need 2957 // to reverse the order of elements in the stored value. 2958 StoredVal = reverseVector(StoredVal); 2959 // We don't want to update the value in the map as it might be used in 2960 // another expression. So don't call resetVectorValue(StoredVal). 2961 } 2962 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2963 if (isMaskRequired) 2964 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2965 BlockInMaskParts[Part]); 2966 else 2967 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2968 } 2969 addMetadata(NewSI, SI); 2970 } 2971 return; 2972 } 2973 2974 // Handle loads. 2975 assert(LI && "Must have a load instruction"); 2976 setDebugLocFromInst(Builder, LI); 2977 for (unsigned Part = 0; Part < UF; ++Part) { 2978 Value *NewLI; 2979 if (CreateGatherScatter) { 2980 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2981 Value *VectorGep = State.get(Addr, Part); 2982 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2983 nullptr, "wide.masked.gather"); 2984 addMetadata(NewLI, LI); 2985 } else { 2986 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2987 if (isMaskRequired) 2988 NewLI = Builder.CreateMaskedLoad( 2989 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2990 "wide.masked.load"); 2991 else 2992 NewLI = 2993 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2994 2995 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
2996 addMetadata(NewLI, LI); 2997 if (Reverse) 2998 NewLI = reverseVector(NewLI); 2999 } 3000 3001 State.set(Def, NewLI, Part); 3002 } 3003 } 3004 3005 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3006 VPUser &User, 3007 const VPIteration &Instance, 3008 bool IfPredicateInstr, 3009 VPTransformState &State) { 3010 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3011 3012 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3013 // the first lane and part. 3014 if (isa<NoAliasScopeDeclInst>(Instr)) 3015 if (!Instance.isFirstIteration()) 3016 return; 3017 3018 setDebugLocFromInst(Builder, Instr); 3019 3020 // Does this instruction return a value ? 3021 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3022 3023 Instruction *Cloned = Instr->clone(); 3024 if (!IsVoidRetTy) 3025 Cloned->setName(Instr->getName() + ".cloned"); 3026 3027 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3028 Builder.GetInsertPoint()); 3029 // Replace the operands of the cloned instructions with their scalar 3030 // equivalents in the new loop. 3031 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3032 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3033 auto InputInstance = Instance; 3034 if (!Operand || !OrigLoop->contains(Operand) || 3035 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3036 InputInstance.Lane = VPLane::getFirstLane(); 3037 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3038 Cloned->setOperand(op, NewOp); 3039 } 3040 addNewMetadata(Cloned, Instr); 3041 3042 // Place the cloned scalar in the new loop. 3043 Builder.Insert(Cloned); 3044 3045 State.set(Def, Cloned, Instance); 3046 3047 // If we just cloned a new assumption, add it the assumption cache. 3048 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3049 AC->registerAssumption(II); 3050 3051 // End if-block. 3052 if (IfPredicateInstr) 3053 PredicatedInstructions.push_back(Cloned); 3054 } 3055 3056 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3057 Value *End, Value *Step, 3058 Instruction *DL) { 3059 BasicBlock *Header = L->getHeader(); 3060 BasicBlock *Latch = L->getLoopLatch(); 3061 // As we're just creating this loop, it's possible no latch exists 3062 // yet. If so, use the header as this will be a single block loop. 3063 if (!Latch) 3064 Latch = Header; 3065 3066 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3067 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3068 setDebugLocFromInst(Builder, OldInst); 3069 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3070 3071 Builder.SetInsertPoint(Latch->getTerminator()); 3072 setDebugLocFromInst(Builder, OldInst); 3073 3074 // Create i+1 and fill the PHINode. 3075 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3076 Induction->addIncoming(Start, L->getLoopPreheader()); 3077 Induction->addIncoming(Next, Latch); 3078 // Create the compare. 3079 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3080 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3081 3082 // Now we have two terminators. Remove the old one from the block. 
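  // For illustration (assumed names and values: Start = 0, Step = VF * UF = 8),
  // the loop control built here is roughly:
  //   %index      = phi i64 [ 0, %preheader ], [ %index.next, %latch ]
  //   %index.next = add i64 %index, 8
  //   %cmp        = icmp eq i64 %index.next, %end
  //   br i1 %cmp, label %exit, label %header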
3083 Latch->getTerminator()->eraseFromParent(); 3084 3085 return Induction; 3086 } 3087 3088 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3089 if (TripCount) 3090 return TripCount; 3091 3092 assert(L && "Create Trip Count for null loop."); 3093 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3094 // Find the loop boundaries. 3095 ScalarEvolution *SE = PSE.getSE(); 3096 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3097 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3098 "Invalid loop count"); 3099 3100 Type *IdxTy = Legal->getWidestInductionType(); 3101 assert(IdxTy && "No type for induction"); 3102 3103 // The exit count might have the type of i64 while the phi is i32. This can 3104 // happen if we have an induction variable that is sign extended before the 3105 // compare. The only way that we get a backedge taken count is that the 3106 // induction variable was signed and as such will not overflow. In such a case 3107 // truncation is legal. 3108 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3109 IdxTy->getPrimitiveSizeInBits()) 3110 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3111 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3112 3113 // Get the total trip count from the count by adding 1. 3114 const SCEV *ExitCount = SE->getAddExpr( 3115 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3116 3117 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3118 3119 // Expand the trip count and place the new instructions in the preheader. 3120 // Notice that the pre-header does not change, only the loop body. 3121 SCEVExpander Exp(*SE, DL, "induction"); 3122 3123 // Count holds the overall loop count (N). 3124 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3125 L->getLoopPreheader()->getTerminator()); 3126 3127 if (TripCount->getType()->isPointerTy()) 3128 TripCount = 3129 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3130 L->getLoopPreheader()->getTerminator()); 3131 3132 return TripCount; 3133 } 3134 3135 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3136 if (VectorTripCount) 3137 return VectorTripCount; 3138 3139 Value *TC = getOrCreateTripCount(L); 3140 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3141 3142 Type *Ty = TC->getType(); 3143 // This is where we can make the step a runtime constant. 3144 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3145 3146 // If the tail is to be folded by masking, round the number of iterations N 3147 // up to a multiple of Step instead of rounding down. This is done by first 3148 // adding Step-1 and then rounding down. Note that it's ok if this addition 3149 // overflows: the vector induction variable will eventually wrap to zero given 3150 // that it starts at zero and its Step is a power of two; the loop will then 3151 // exit, with the last early-exit vector comparison also producing all-true. 3152 if (Cost->foldTailByMasking()) { 3153 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3154 "VF*UF must be a power of 2 when folding tail by masking"); 3155 assert(!VF.isScalable() && 3156 "Tail folding not yet supported for scalable vectors"); 3157 TC = Builder.CreateAdd( 3158 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3159 } 3160 3161 // Now we need to generate the expression for the part of the loop that the 3162 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3163 // iterations are not required for correctness, or N - Step, otherwise. Step 3164 // is equal to the vectorization factor (number of SIMD elements) times the 3165 // unroll factor (number of SIMD instructions). 3166 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3167 3168 // There are two cases where we need to ensure (at least) the last iteration 3169 // runs in the scalar remainder loop. Thus, if the step evenly divides 3170 // the trip count, we set the remainder to be equal to the step. If the step 3171 // does not evenly divide the trip count, no adjustment is necessary since 3172 // there will already be scalar iterations. Note that the minimum iterations 3173 // check ensures that N >= Step. The cases are: 3174 // 1) If there is a non-reversed interleaved group that may speculatively 3175 // access memory out-of-bounds. 3176 // 2) If any instruction may follow a conditionally taken exit. That is, if 3177 // the loop contains multiple exiting blocks, or a single exiting block 3178 // which is not the latch. 3179 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3180 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3181 R = Builder.CreateSelect(IsZero, Step, R); 3182 } 3183 3184 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3185 3186 return VectorTripCount; 3187 } 3188 3189 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3190 const DataLayout &DL) { 3191 // Verify that V is a vector type with same number of elements as DstVTy. 3192 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3193 unsigned VF = DstFVTy->getNumElements(); 3194 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3195 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3196 Type *SrcElemTy = SrcVecTy->getElementType(); 3197 Type *DstElemTy = DstFVTy->getElementType(); 3198 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3199 "Vector elements must have same size"); 3200 3201 // Do a direct cast if element types are castable. 3202 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3203 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3204 } 3205 // V cannot be directly casted to desired vector type. 3206 // May happen when V is a floating point vector but DstVTy is a vector of 3207 // pointers or vice-versa. Handle this using a two-step bitcast using an 3208 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3209 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3210 "Only one type should be a pointer type"); 3211 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3212 "Only one type should be a floating point type"); 3213 Type *IntTy = 3214 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3215 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3216 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3217 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3218 } 3219 3220 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3221 BasicBlock *Bypass) { 3222 Value *Count = getOrCreateTripCount(L); 3223 // Reuse existing vector loop preheader for TC checks. 3224 // Note that new preheader block is generated for vector loop. 
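  // Worked example (illustrative values only: VF = 4, UF = 2, so Step = 8):
  // for a trip count of 17, getOrCreateVectorTripCount above yields
  // n.mod.vf = 1 and n.vec = 16, i.e. 16 iterations run vectorized and one
  // runs in the scalar remainder. The check emitted below branches straight
  // to the scalar loop when the trip count is < 8 (or <= 8 if a scalar
  // epilogue is required, since at least one scalar iteration must then run).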
3225 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3226 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3227 3228 // Generate code to check if the loop's trip count is less than VF * UF, or 3229 // equal to it in case a scalar epilogue is required; this implies that the 3230 // vector trip count is zero. This check also covers the case where adding one 3231 // to the backedge-taken count overflowed leading to an incorrect trip count 3232 // of zero. In this case we will also jump to the scalar loop. 3233 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3234 : ICmpInst::ICMP_ULT; 3235 3236 // If tail is to be folded, vector loop takes care of all iterations. 3237 Value *CheckMinIters = Builder.getFalse(); 3238 if (!Cost->foldTailByMasking()) { 3239 Value *Step = 3240 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3241 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3242 } 3243 // Create new preheader for vector loop. 3244 LoopVectorPreHeader = 3245 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3246 "vector.ph"); 3247 3248 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3249 DT->getNode(Bypass)->getIDom()) && 3250 "TC check is expected to dominate Bypass"); 3251 3252 // Update dominator for Bypass & LoopExit (if needed). 3253 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3254 if (!Cost->requiresScalarEpilogue()) 3255 // If there is an epilogue which must run, there's no edge from the 3256 // middle block to exit blocks and thus no need to update the immediate 3257 // dominator of the exit blocks. 3258 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3259 3260 ReplaceInstWithInst( 3261 TCCheckBlock->getTerminator(), 3262 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3263 LoopBypassBlocks.push_back(TCCheckBlock); 3264 } 3265 3266 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3267 3268 BasicBlock *const SCEVCheckBlock = 3269 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3270 if (!SCEVCheckBlock) 3271 return nullptr; 3272 3273 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3274 (OptForSizeBasedOnProfile && 3275 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3276 "Cannot SCEV check stride or overflow when optimizing for size"); 3277 3278 3279 // Update dominator only if this is first RT check. 3280 if (LoopBypassBlocks.empty()) { 3281 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3282 if (!Cost->requiresScalarEpilogue()) 3283 // If there is an epilogue which must run, there's no edge from the 3284 // middle block to exit blocks and thus no need to update the immediate 3285 // dominator of the exit blocks. 3286 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3287 } 3288 3289 LoopBypassBlocks.push_back(SCEVCheckBlock); 3290 AddedSafetyChecks = true; 3291 return SCEVCheckBlock; 3292 } 3293 3294 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3295 BasicBlock *Bypass) { 3296 // VPlan-native path does not do any analysis for runtime checks currently. 3297 if (EnableVPlanNativePath) 3298 return nullptr; 3299 3300 BasicBlock *const MemCheckBlock = 3301 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3302 3303 // Check if we generated code that checks in runtime if arrays overlap. We put 3304 // the checks into a separate block to make the more common case of few 3305 // elements faster. 
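  // The emitted checks typically compare the accessed ranges pairwise, e.g.
  // (an illustrative sketch, value names assumed):
  //   %bound0         = icmp ult i8* %a.start, %b.end
  //   %bound1         = icmp ult i8* %b.start, %a.end
  //   %found.conflict = and i1 %bound0, %bound1
  // and branch to the scalar loop if any pair of ranges may overlap.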
3306 if (!MemCheckBlock) 3307 return nullptr; 3308 3309 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3310 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3311 "Cannot emit memory checks when optimizing for size, unless forced " 3312 "to vectorize."); 3313 ORE->emit([&]() { 3314 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3315 L->getStartLoc(), L->getHeader()) 3316 << "Code-size may be reduced by not forcing " 3317 "vectorization, or by source-code modifications " 3318 "eliminating the need for runtime checks " 3319 "(e.g., adding 'restrict')."; 3320 }); 3321 } 3322 3323 LoopBypassBlocks.push_back(MemCheckBlock); 3324 3325 AddedSafetyChecks = true; 3326 3327 // We currently don't use LoopVersioning for the actual loop cloning but we 3328 // still use it to add the noalias metadata. 3329 LVer = std::make_unique<LoopVersioning>( 3330 *Legal->getLAI(), 3331 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3332 DT, PSE.getSE()); 3333 LVer->prepareNoAliasMetadata(); 3334 return MemCheckBlock; 3335 } 3336 3337 Value *InnerLoopVectorizer::emitTransformedIndex( 3338 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3339 const InductionDescriptor &ID) const { 3340 3341 SCEVExpander Exp(*SE, DL, "induction"); 3342 auto Step = ID.getStep(); 3343 auto StartValue = ID.getStartValue(); 3344 assert(Index->getType()->getScalarType() == Step->getType() && 3345 "Index scalar type does not match StepValue type"); 3346 3347 // Note: the IR at this point is broken. We cannot use SE to create any new 3348 // SCEV and then expand it, hoping that SCEV's simplification will give us 3349 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3350 // lead to various SCEV crashes. So all we can do is to use builder and rely 3351 // on InstCombine for future simplifications. Here we handle some trivial 3352 // cases only. 3353 auto CreateAdd = [&B](Value *X, Value *Y) { 3354 assert(X->getType() == Y->getType() && "Types don't match!"); 3355 if (auto *CX = dyn_cast<ConstantInt>(X)) 3356 if (CX->isZero()) 3357 return Y; 3358 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3359 if (CY->isZero()) 3360 return X; 3361 return B.CreateAdd(X, Y); 3362 }; 3363 3364 // We allow X to be a vector type, in which case Y will potentially be 3365 // splatted into a vector with the same element count. 3366 auto CreateMul = [&B](Value *X, Value *Y) { 3367 assert(X->getType()->getScalarType() == Y->getType() && 3368 "Types don't match!"); 3369 if (auto *CX = dyn_cast<ConstantInt>(X)) 3370 if (CX->isOne()) 3371 return Y; 3372 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3373 if (CY->isOne()) 3374 return X; 3375 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3376 if (XVTy && !isa<VectorType>(Y->getType())) 3377 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3378 return B.CreateMul(X, Y); 3379 }; 3380 3381 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3382 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3383 // the DomTree is not kept up-to-date for additional blocks generated in the 3384 // vector loop. By using the header as insertion point, we guarantee that the 3385 // expanded instructions dominate all their uses. 
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
                                    GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue()) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue() ?
      BranchInst::Create(LoopScalarPreHeader) :
      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                         Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  if (!Cost->requiresScalarEpilogue())
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}

void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = VectorTripCount;
    } else {
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
3530 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3531 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3532 3533 Type *StepType = II.getStep()->getType(); 3534 Instruction::CastOps CastOp = 3535 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3536 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3537 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3538 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3539 EndValue->setName("ind.end"); 3540 3541 // Compute the end value for the additional bypass (if applicable). 3542 if (AdditionalBypass.first) { 3543 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3544 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3545 StepType, true); 3546 CRD = 3547 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3548 EndValueFromAdditionalBypass = 3549 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3550 EndValueFromAdditionalBypass->setName("ind.end"); 3551 } 3552 } 3553 // The new PHI merges the original incoming value, in case of a bypass, 3554 // or the value at the end of the vectorized loop. 3555 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3556 3557 // Fix the scalar body counter (PHI node). 3558 // The old induction's phi node in the scalar body needs the truncated 3559 // value. 3560 for (BasicBlock *BB : LoopBypassBlocks) 3561 BCResumeVal->addIncoming(II.getStartValue(), BB); 3562 3563 if (AdditionalBypass.first) 3564 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3565 EndValueFromAdditionalBypass); 3566 3567 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3568 } 3569 } 3570 3571 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3572 MDNode *OrigLoopID) { 3573 assert(L && "Expected valid loop."); 3574 3575 // The trip counts should be cached by now. 3576 Value *Count = getOrCreateTripCount(L); 3577 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3578 3579 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3580 3581 // Add a check in the middle block to see if we have completed 3582 // all of the iterations in the first vector loop. Three cases: 3583 // 1) If we require a scalar epilogue, there is no conditional branch as 3584 // we unconditionally branch to the scalar preheader. Do nothing. 3585 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3586 // Thus if tail is to be folded, we know we don't need to run the 3587 // remainder and we can use the previous value for the condition (true). 3588 // 3) Otherwise, construct a runtime check. 3589 if (!Cost->requiresScalarEpilogue() && !Cost->foldTailByMasking()) { 3590 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3591 Count, VectorTripCount, "cmp.n", 3592 LoopMiddleBlock->getTerminator()); 3593 3594 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3595 // of the corresponding compare because they may have ended up with 3596 // different line numbers and we want to avoid awkward line stepping while 3597 // debugging. Eg. if the compare has got a line number inside the loop. 3598 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3599 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3600 } 3601 3602 // Get ready to start creating new instructions into the vectorized body. 
3603 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3604 "Inconsistent vector loop preheader"); 3605 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3606 3607 Optional<MDNode *> VectorizedLoopID = 3608 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3609 LLVMLoopVectorizeFollowupVectorized}); 3610 if (VectorizedLoopID.hasValue()) { 3611 L->setLoopID(VectorizedLoopID.getValue()); 3612 3613 // Do not setAlreadyVectorized if loop attributes have been defined 3614 // explicitly. 3615 return LoopVectorPreHeader; 3616 } 3617 3618 // Keep all loop hints from the original loop on the vector loop (we'll 3619 // replace the vectorizer-specific hints below). 3620 if (MDNode *LID = OrigLoop->getLoopID()) 3621 L->setLoopID(LID); 3622 3623 LoopVectorizeHints Hints(L, true, *ORE); 3624 Hints.setAlreadyVectorized(); 3625 3626 #ifdef EXPENSIVE_CHECKS 3627 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3628 LI->verify(*DT); 3629 #endif 3630 3631 return LoopVectorPreHeader; 3632 } 3633 3634 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3635 /* 3636 In this function we generate a new loop. The new loop will contain 3637 the vectorized instructions while the old loop will continue to run the 3638 scalar remainder. 3639 3640 [ ] <-- loop iteration number check. 3641 / | 3642 / v 3643 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3644 | / | 3645 | / v 3646 || [ ] <-- vector pre header. 3647 |/ | 3648 | v 3649 | [ ] \ 3650 | [ ]_| <-- vector loop. 3651 | | 3652 | v 3653 \ -[ ] <--- middle-block. 3654 \/ | 3655 /\ v 3656 | ->[ ] <--- new preheader. 3657 | | 3658 (opt) v <-- edge from middle to exit iff epilogue is not required. 3659 | [ ] \ 3660 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3661 \ | 3662 \ v 3663 >[ ] <-- exit block(s). 3664 ... 3665 */ 3666 3667 // Get the metadata of the original loop before it gets modified. 3668 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3669 3670 // Workaround! Compute the trip count of the original loop and cache it 3671 // before we start modifying the CFG. This code has a systemic problem 3672 // wherein it tries to run analysis over partially constructed IR; this is 3673 // wrong, and not simply for SCEV. The trip count of the original loop 3674 // simply happens to be prone to hitting this in practice. In theory, we 3675 // can hit the same issue for any SCEV, or ValueTracking query done during 3676 // mutation. See PR49900. 3677 getOrCreateTripCount(OrigLoop); 3678 3679 // Create an empty vector loop, and prepare basic blocks for the runtime 3680 // checks. 3681 Loop *Lp = createVectorLoopSkeleton(""); 3682 3683 // Now, compare the new count to zero. If it is zero skip the vector loop and 3684 // jump to the scalar loop. This check also covers the case where the 3685 // backedge-taken count is uint##_max: adding one to it will overflow leading 3686 // to an incorrect trip count of zero. In this (rare) case we will also jump 3687 // to the scalar loop. 3688 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3689 3690 // Generate the code to check any assumptions that we've made for SCEV 3691 // expressions. 3692 emitSCEVChecks(Lp, LoopScalarPreHeader); 3693 3694 // Generate the code that checks in runtime if arrays overlap. We put the 3695 // checks into a separate block to make the more common case of few elements 3696 // faster. 
3697 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3698 3699 // Some loops have a single integer induction variable, while other loops 3700 // don't. One example is c++ iterators that often have multiple pointer 3701 // induction variables. In the code below we also support a case where we 3702 // don't have a single induction variable. 3703 // 3704 // We try to obtain an induction variable from the original loop as hard 3705 // as possible. However if we don't find one that: 3706 // - is an integer 3707 // - counts from zero, stepping by one 3708 // - is the size of the widest induction variable type 3709 // then we create a new one. 3710 OldInduction = Legal->getPrimaryInduction(); 3711 Type *IdxTy = Legal->getWidestInductionType(); 3712 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3713 // The loop step is equal to the vectorization factor (num of SIMD elements) 3714 // times the unroll factor (num of SIMD instructions). 3715 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3716 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3717 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3718 Induction = 3719 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3720 getDebugLocFromInstOrOperands(OldInduction)); 3721 3722 // Emit phis for the new starting index of the scalar loop. 3723 createInductionResumeValues(Lp, CountRoundDown); 3724 3725 return completeLoopSkeleton(Lp, OrigLoopID); 3726 } 3727 3728 // Fix up external users of the induction variable. At this point, we are 3729 // in LCSSA form, with all external PHIs that use the IV having one input value, 3730 // coming from the remainder loop. We need those PHIs to also have a correct 3731 // value for the IV when arriving directly from the middle block. 3732 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3733 const InductionDescriptor &II, 3734 Value *CountRoundDown, Value *EndValue, 3735 BasicBlock *MiddleBlock) { 3736 // There are two kinds of external IV usages - those that use the value 3737 // computed in the last iteration (the PHI) and those that use the penultimate 3738 // value (the value that feeds into the phi from the loop latch). 3739 // We allow both, but they, obviously, have different values. 3740 3741 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3742 3743 DenseMap<Value *, Value *> MissingVals; 3744 3745 // An external user of the last iteration's value should see the value that 3746 // the remainder loop uses to initialize its own IV. 3747 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3748 for (User *U : PostInc->users()) { 3749 Instruction *UI = cast<Instruction>(U); 3750 if (!OrigLoop->contains(UI)) { 3751 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3752 MissingVals[UI] = EndValue; 3753 } 3754 } 3755 3756 // An external user of the penultimate value need to see EndValue - Step. 3757 // The simplest way to get this is to recompute it from the constituent SCEVs, 3758 // that is Start + (Step * (CRD - 1)). 3759 for (User *U : OrigPhi->users()) { 3760 auto *UI = cast<Instruction>(U); 3761 if (!OrigLoop->contains(UI)) { 3762 const DataLayout &DL = 3763 OrigLoop->getHeader()->getModule()->getDataLayout(); 3764 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3765 3766 IRBuilder<> B(MiddleBlock->getTerminator()); 3767 3768 // Fast-math-flags propagate from the original induction instruction. 
3769 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3770 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3771 3772 Value *CountMinusOne = B.CreateSub( 3773 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3774 Value *CMO = 3775 !II.getStep()->getType()->isIntegerTy() 3776 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3777 II.getStep()->getType()) 3778 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3779 CMO->setName("cast.cmo"); 3780 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3781 Escape->setName("ind.escape"); 3782 MissingVals[UI] = Escape; 3783 } 3784 } 3785 3786 for (auto &I : MissingVals) { 3787 PHINode *PHI = cast<PHINode>(I.first); 3788 // One corner case we have to handle is two IVs "chasing" each-other, 3789 // that is %IV2 = phi [...], [ %IV1, %latch ] 3790 // In this case, if IV1 has an external use, we need to avoid adding both 3791 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3792 // don't already have an incoming value for the middle block. 3793 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3794 PHI->addIncoming(I.second, MiddleBlock); 3795 } 3796 } 3797 3798 namespace { 3799 3800 struct CSEDenseMapInfo { 3801 static bool canHandle(const Instruction *I) { 3802 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3803 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3804 } 3805 3806 static inline Instruction *getEmptyKey() { 3807 return DenseMapInfo<Instruction *>::getEmptyKey(); 3808 } 3809 3810 static inline Instruction *getTombstoneKey() { 3811 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3812 } 3813 3814 static unsigned getHashValue(const Instruction *I) { 3815 assert(canHandle(I) && "Unknown instruction!"); 3816 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3817 I->value_op_end())); 3818 } 3819 3820 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3821 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3822 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3823 return LHS == RHS; 3824 return LHS->isIdenticalTo(RHS); 3825 } 3826 }; 3827 3828 } // end anonymous namespace 3829 3830 ///Perform cse of induction variable instructions. 3831 static void cse(BasicBlock *BB) { 3832 // Perform simple cse. 3833 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3834 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3835 Instruction *In = &*I++; 3836 3837 if (!CSEDenseMapInfo::canHandle(In)) 3838 continue; 3839 3840 // Check if we can replace this instruction with any of the 3841 // visited instructions. 3842 if (Instruction *V = CSEMap.lookup(In)) { 3843 In->replaceAllUsesWith(V); 3844 In->eraseFromParent(); 3845 continue; 3846 } 3847 3848 CSEMap[In] = In; 3849 } 3850 } 3851 3852 InstructionCost 3853 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3854 bool &NeedToScalarize) const { 3855 Function *F = CI->getCalledFunction(); 3856 Type *ScalarRetTy = CI->getType(); 3857 SmallVector<Type *, 4> Tys, ScalarTys; 3858 for (auto &ArgOp : CI->arg_operands()) 3859 ScalarTys.push_back(ArgOp->getType()); 3860 3861 // Estimate cost of scalarized vector call. The source operands are assumed 3862 // to be vectors, so we need to extract individual elements from there, 3863 // execute VF scalar calls, and then gather the result into the vector return 3864 // value. 
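  // Rough shape of the estimate computed below (illustrative, assuming VF = 4):
  //   Cost(scalarized) = 4 * Cost(scalar call)
  //                      + Cost(extracting arguments / inserting results)
  // This is then compared against the cost of calling a vectorized variant of
  // the function, if one is available (see the VFDatabase query below).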
3865 InstructionCost ScalarCallCost = 3866 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3867 if (VF.isScalar()) 3868 return ScalarCallCost; 3869 3870 // Compute corresponding vector type for return value and arguments. 3871 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3872 for (Type *ScalarTy : ScalarTys) 3873 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3874 3875 // Compute costs of unpacking argument values for the scalar calls and 3876 // packing the return values to a vector. 3877 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3878 3879 InstructionCost Cost = 3880 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3881 3882 // If we can't emit a vector call for this function, then the currently found 3883 // cost is the cost we need to return. 3884 NeedToScalarize = true; 3885 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3886 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3887 3888 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3889 return Cost; 3890 3891 // If the corresponding vector cost is cheaper, return its cost. 3892 InstructionCost VectorCallCost = 3893 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3894 if (VectorCallCost < Cost) { 3895 NeedToScalarize = false; 3896 Cost = VectorCallCost; 3897 } 3898 return Cost; 3899 } 3900 3901 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3902 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3903 return Elt; 3904 return VectorType::get(Elt, VF); 3905 } 3906 3907 InstructionCost 3908 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3909 ElementCount VF) const { 3910 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3911 assert(ID && "Expected intrinsic call!"); 3912 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3913 FastMathFlags FMF; 3914 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3915 FMF = FPMO->getFastMathFlags(); 3916 3917 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3918 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3919 SmallVector<Type *> ParamTys; 3920 std::transform(FTy->param_begin(), FTy->param_end(), 3921 std::back_inserter(ParamTys), 3922 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3923 3924 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3925 dyn_cast<IntrinsicInst>(CI)); 3926 return TTI.getIntrinsicInstrCost(CostAttrs, 3927 TargetTransformInfo::TCK_RecipThroughput); 3928 } 3929 3930 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3931 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3932 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3933 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3934 } 3935 3936 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3937 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3938 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3939 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3940 } 3941 3942 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3943 // For every instruction `I` in MinBWs, truncate the operands, create a 3944 // truncated version of `I` and reextend its result. InstCombine runs 3945 // later and will remove any ext/trunc pairs. 
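  // For example (a sketch with assumed types: VF = 4, minimal width 8 bits),
  // a widened i32 add whose result is known to need only 8 bits becomes:
  //   %a.tr = trunc <4 x i32> %a to <4 x i8>
  //   %b.tr = trunc <4 x i32> %b to <4 x i8>
  //   %s.tr = add <4 x i8> %a.tr, %b.tr
  //   %s    = zext <4 x i8> %s.tr to <4 x i32>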
3946 SmallPtrSet<Value *, 4> Erased; 3947 for (const auto &KV : Cost->getMinimalBitwidths()) { 3948 // If the value wasn't vectorized, we must maintain the original scalar 3949 // type. The absence of the value from State indicates that it 3950 // wasn't vectorized. 3951 VPValue *Def = State.Plan->getVPValue(KV.first); 3952 if (!State.hasAnyVectorValue(Def)) 3953 continue; 3954 for (unsigned Part = 0; Part < UF; ++Part) { 3955 Value *I = State.get(Def, Part); 3956 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3957 continue; 3958 Type *OriginalTy = I->getType(); 3959 Type *ScalarTruncatedTy = 3960 IntegerType::get(OriginalTy->getContext(), KV.second); 3961 auto *TruncatedTy = FixedVectorType::get( 3962 ScalarTruncatedTy, 3963 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3964 if (TruncatedTy == OriginalTy) 3965 continue; 3966 3967 IRBuilder<> B(cast<Instruction>(I)); 3968 auto ShrinkOperand = [&](Value *V) -> Value * { 3969 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3970 if (ZI->getSrcTy() == TruncatedTy) 3971 return ZI->getOperand(0); 3972 return B.CreateZExtOrTrunc(V, TruncatedTy); 3973 }; 3974 3975 // The actual instruction modification depends on the instruction type, 3976 // unfortunately. 3977 Value *NewI = nullptr; 3978 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3979 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3980 ShrinkOperand(BO->getOperand(1))); 3981 3982 // Any wrapping introduced by shrinking this operation shouldn't be 3983 // considered undefined behavior. So, we can't unconditionally copy 3984 // arithmetic wrapping flags to NewI. 3985 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3986 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3987 NewI = 3988 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3989 ShrinkOperand(CI->getOperand(1))); 3990 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3991 NewI = B.CreateSelect(SI->getCondition(), 3992 ShrinkOperand(SI->getTrueValue()), 3993 ShrinkOperand(SI->getFalseValue())); 3994 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3995 switch (CI->getOpcode()) { 3996 default: 3997 llvm_unreachable("Unhandled cast!"); 3998 case Instruction::Trunc: 3999 NewI = ShrinkOperand(CI->getOperand(0)); 4000 break; 4001 case Instruction::SExt: 4002 NewI = B.CreateSExtOrTrunc( 4003 CI->getOperand(0), 4004 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4005 break; 4006 case Instruction::ZExt: 4007 NewI = B.CreateZExtOrTrunc( 4008 CI->getOperand(0), 4009 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 4010 break; 4011 } 4012 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 4013 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 4014 ->getNumElements(); 4015 auto *O0 = B.CreateZExtOrTrunc( 4016 SI->getOperand(0), 4017 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 4018 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 4019 ->getNumElements(); 4020 auto *O1 = B.CreateZExtOrTrunc( 4021 SI->getOperand(1), 4022 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 4023 4024 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4025 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4026 // Don't do anything with the operands, just extend the result. 
4027 continue; 4028 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4029 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 4030 ->getNumElements(); 4031 auto *O0 = B.CreateZExtOrTrunc( 4032 IE->getOperand(0), 4033 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4034 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4035 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4036 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4037 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 4038 ->getNumElements(); 4039 auto *O0 = B.CreateZExtOrTrunc( 4040 EE->getOperand(0), 4041 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4042 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4043 } else { 4044 // If we don't know what to do, be conservative and don't do anything. 4045 continue; 4046 } 4047 4048 // Lastly, extend the result. 4049 NewI->takeName(cast<Instruction>(I)); 4050 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4051 I->replaceAllUsesWith(Res); 4052 cast<Instruction>(I)->eraseFromParent(); 4053 Erased.insert(I); 4054 State.reset(Def, Res, Part); 4055 } 4056 } 4057 4058 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4059 for (const auto &KV : Cost->getMinimalBitwidths()) { 4060 // If the value wasn't vectorized, we must maintain the original scalar 4061 // type. The absence of the value from State indicates that it 4062 // wasn't vectorized. 4063 VPValue *Def = State.Plan->getVPValue(KV.first); 4064 if (!State.hasAnyVectorValue(Def)) 4065 continue; 4066 for (unsigned Part = 0; Part < UF; ++Part) { 4067 Value *I = State.get(Def, Part); 4068 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4069 if (Inst && Inst->use_empty()) { 4070 Value *NewI = Inst->getOperand(0); 4071 Inst->eraseFromParent(); 4072 State.reset(Def, NewI, Part); 4073 } 4074 } 4075 } 4076 } 4077 4078 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4079 // Insert truncates and extends for any truncated instructions as hints to 4080 // InstCombine. 4081 if (VF.isVector()) 4082 truncateToMinimalBitwidths(State); 4083 4084 // Fix widened non-induction PHIs by setting up the PHI operands. 4085 if (OrigPHIsToFix.size()) { 4086 assert(EnableVPlanNativePath && 4087 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4088 fixNonInductionPHIs(State); 4089 } 4090 4091 // At this point every instruction in the original loop is widened to a 4092 // vector form. Now we need to fix the recurrences in the loop. These PHI 4093 // nodes are currently empty because we did not want to introduce cycles. 4094 // This is the second stage of vectorizing recurrences. 4095 fixCrossIterationPHIs(State); 4096 4097 // Forget the original basic block. 4098 PSE.getSE()->forgetLoop(OrigLoop); 4099 4100 // If we inserted an edge from the middle block to the unique exit block, 4101 // update uses outside the loop (phis) to account for the newly inserted 4102 // edge. 4103 if (!Cost->requiresScalarEpilogue()) { 4104 // Fix-up external users of the induction variables. 4105 for (auto &Entry : Legal->getInductionVars()) 4106 fixupIVUsers(Entry.first, Entry.second, 4107 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4108 IVEndValues[Entry.first], LoopMiddleBlock); 4109 4110 fixLCSSAPHIs(State); 4111 } 4112 4113 for (Instruction *PI : PredicatedInstructions) 4114 sinkScalarOperands(&*PI); 4115 4116 // Remove redundant induction instructions. 
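// (cse() below is a small local common-subexpression-elimination pass over the
// vector body; unrolling tends to leave several identical induction updates
// behind, which it folds into one.)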
4117 cse(LoopVectorBody); 4118 4119 // Set/update profile weights for the vector and remainder loops as original 4120 // loop iterations are now distributed among them. Note that the original loop 4121 // represented by LoopScalarBody becomes the remainder loop after vectorization. 4122 // 4123 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may 4124 // end up with a slightly roughened result, but that should be OK since the 4125 // profile is not inherently precise anyway. Note also that a possible bypass of 4126 // the vector code caused by legality checks is ignored, optimistically assigning 4127 // all the weight to the vector loop. 4128 // 4129 // For scalable vectorization we can't know at compile time how many iterations 4130 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4131 // vscale of '1'. 4132 setProfileInfoAfterUnrolling( 4133 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4134 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4135 } 4136 4137 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4138 // In order to support recurrences we need to be able to vectorize Phi nodes. 4139 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4140 // stage #2: We now need to fix the recurrences by adding incoming edges to 4141 // the currently empty PHI nodes. At this point every instruction in the 4142 // original loop is widened to a vector form so we can use them to construct 4143 // the incoming edges. 4144 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4145 for (VPRecipeBase &R : Header->phis()) { 4146 auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R); 4147 if (!PhiR) 4148 continue; 4149 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4150 if (PhiR->getRecurrenceDescriptor()) { 4151 fixReduction(PhiR, State); 4152 } else if (Legal->isFirstOrderRecurrence(OrigPhi)) 4153 fixFirstOrderRecurrence(OrigPhi, State); 4154 } 4155 } 4156 4157 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4158 VPTransformState &State) { 4159 // This is the second phase of vectorizing first-order recurrences. An 4160 // overview of the transformation is described below. Suppose we have the 4161 // following loop. 4162 // 4163 // for (int i = 0; i < n; ++i) 4164 // b[i] = a[i] - a[i - 1]; 4165 // 4166 // There is a first-order recurrence on "a". For this loop, the shorthand 4167 // scalar IR looks like: 4168 // 4169 // scalar.ph: 4170 // s_init = a[-1] 4171 // br scalar.body 4172 // 4173 // scalar.body: 4174 // i = phi [0, scalar.ph], [i+1, scalar.body] 4175 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4176 // s2 = a[i] 4177 // b[i] = s2 - s1 4178 // br cond, scalar.body, ... 4179 // 4180 // In this example, s1 is a recurrence because its value depends on the 4181 // previous iteration. In the first phase of vectorization, we created a 4182 // temporary value for s1. We now complete the vectorization and produce the 4183 // shorthand vector IR shown below (for VF = 4, UF = 1).
4184 // 4185 // vector.ph: 4186 // v_init = vector(..., ..., ..., a[-1]) 4187 // br vector.body 4188 // 4189 // vector.body 4190 // i = phi [0, vector.ph], [i+4, vector.body] 4191 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4192 // v2 = a[i, i+1, i+2, i+3]; 4193 // v3 = vector(v1(3), v2(0, 1, 2)) 4194 // b[i, i+1, i+2, i+3] = v2 - v3 4195 // br cond, vector.body, middle.block 4196 // 4197 // middle.block: 4198 // x = v2(3) 4199 // br scalar.ph 4200 // 4201 // scalar.ph: 4202 // s_init = phi [x, middle.block], [a[-1], otherwise] 4203 // br scalar.body 4204 // 4205 // After execution completes the vector loop, we extract the next value of 4206 // the recurrence (x) to use as the initial value in the scalar loop. 4207 4208 // Get the original loop preheader and single loop latch. 4209 auto *Preheader = OrigLoop->getLoopPreheader(); 4210 auto *Latch = OrigLoop->getLoopLatch(); 4211 4212 // Get the initial and previous values of the scalar recurrence. 4213 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4214 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4215 4216 auto *IdxTy = Builder.getInt32Ty(); 4217 auto *One = ConstantInt::get(IdxTy, 1); 4218 4219 // Create a vector from the initial value. 4220 auto *VectorInit = ScalarInit; 4221 if (VF.isVector()) { 4222 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4223 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4224 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4225 VectorInit = Builder.CreateInsertElement( 4226 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), 4227 VectorInit, LastIdx, "vector.recur.init"); 4228 } 4229 4230 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4231 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4232 // We constructed a temporary phi node in the first phase of vectorization. 4233 // This phi node will eventually be deleted. 4234 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4235 4236 // Create a phi node for the new recurrence. The current value will either be 4237 // the initial value inserted into a vector or loop-varying vector value. 4238 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4239 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4240 4241 // Get the vectorized previous value of the last part UF - 1. It appears last 4242 // among all unrolled iterations, due to the order of their construction. 4243 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4244 4245 // Find and set the insertion point after the previous value if it is an 4246 // instruction. 4247 BasicBlock::iterator InsertPt; 4248 // Note that the previous value may have been constant-folded so it is not 4249 // guaranteed to be an instruction in the vector loop. 4250 // FIXME: Loop invariant values do not form recurrences. We should deal with 4251 // them earlier. 4252 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4253 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4254 else { 4255 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4256 if (isa<PHINode>(PreviousLastPart)) 4257 // If the previous value is a phi node, we should insert after all the phi 4258 // nodes in the block containing the PHI to avoid breaking basic block 4259 // verification. Note that the basic block may be different to 4260 // LoopVectorBody, in case we predicate the loop. 
4261 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4262 else 4263 InsertPt = ++PreviousInst->getIterator(); 4264 } 4265 Builder.SetInsertPoint(&*InsertPt); 4266 4267 // The vector from which to take the initial value for the current iteration 4268 // (actual or unrolled). Initially, this is the vector phi node. 4269 Value *Incoming = VecPhi; 4270 4271 // Shuffle the current and previous vector and update the vector parts. 4272 for (unsigned Part = 0; Part < UF; ++Part) { 4273 Value *PreviousPart = State.get(PreviousDef, Part); 4274 Value *PhiPart = State.get(PhiDef, Part); 4275 auto *Shuffle = VF.isVector() 4276 ? Builder.CreateVectorSplice(Incoming, PreviousPart, -1) 4277 : Incoming; 4278 PhiPart->replaceAllUsesWith(Shuffle); 4279 cast<Instruction>(PhiPart)->eraseFromParent(); 4280 State.reset(PhiDef, Shuffle, Part); 4281 Incoming = PreviousPart; 4282 } 4283 4284 // Fix the latch value of the new recurrence in the vector loop. 4285 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4286 4287 // Extract the last vector element in the middle block. This will be the 4288 // initial value for the recurrence when jumping to the scalar loop. 4289 auto *ExtractForScalar = Incoming; 4290 if (VF.isVector()) { 4291 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4292 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4293 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4294 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4295 "vector.recur.extract"); 4296 } 4297 // Extract the second last element in the middle block if the 4298 // Phi is used outside the loop. We need to extract the phi itself 4299 // and not the last element (the phi update in the current iteration). This 4300 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4301 // when the scalar loop is not run at all. 4302 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4303 if (VF.isVector()) { 4304 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4305 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4306 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4307 Incoming, Idx, "vector.recur.extract.for.phi"); 4308 } else if (UF > 1) 4309 // When loop is unrolled without vectorizing, initialize 4310 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4311 // of `Incoming`. This is analogous to the vectorized case above: extracting 4312 // the second last element when VF > 1. 4313 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4314 4315 // Fix the initial value of the original recurrence in the scalar loop. 4316 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4317 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4318 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4319 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4320 Start->addIncoming(Incoming, BB); 4321 } 4322 4323 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4324 Phi->setName("scalar.recur"); 4325 4326 // Finally, fix users of the recurrence outside the loop. The users will need 4327 // either the last value of the scalar recurrence or the last value of the 4328 // vector recurrence we extracted in the middle block. Since the loop is in 4329 // LCSSA form, we just need to find all the phi nodes for the original scalar 4330 // recurrence in the exit block, and then add an edge for the middle block. 
4331 // Note that LCSSA does not imply single entry when the original scalar loop 4332 // had multiple exiting edges (as we always run the last iteration in the 4333 // scalar epilogue); in that case, there is no edge from the middle block to 4334 // the exit block, and thus no phis that need updating. 4335 if (!Cost->requiresScalarEpilogue()) 4336 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4337 if (any_of(LCSSAPhi.incoming_values(), 4338 [Phi](Value *V) { return V == Phi; })) 4339 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4340 } 4341 4342 static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4343 return EnableStrictReductions && RdxDesc.isOrdered(); 4344 } 4345 4346 void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR, 4347 VPTransformState &State) { 4348 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4349 // Get its reduction variable descriptor. 4350 assert(Legal->isReductionVariable(OrigPhi) && 4351 "Unable to find the reduction variable"); 4352 RecurrenceDescriptor RdxDesc = *PhiR->getRecurrenceDescriptor(); 4353 4354 RecurKind RK = RdxDesc.getRecurrenceKind(); 4355 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4356 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4357 setDebugLocFromInst(Builder, ReductionStartValue); 4358 bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi); 4359 4360 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); 4361 // This is the vector-clone of the value that leaves the loop. 4362 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4363 4364 // Wrap flags are in general invalid after vectorization; clear them. 4365 clearReductionWrapFlags(RdxDesc, State); 4366 4367 // Fix the vector-loop phi. 4368 4369 // Reductions do not have to start at zero. They can start with 4370 // any loop-invariant value. 4371 BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4372 4373 bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi && 4374 useOrderedReductions(RdxDesc); 4375 4376 for (unsigned Part = 0; Part < UF; ++Part) { 4377 if (IsOrdered && Part > 0) 4378 break; 4379 Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part); 4380 Value *Val = State.get(PhiR->getBackedgeValue(), Part); 4381 if (IsOrdered) 4382 Val = State.get(PhiR->getBackedgeValue(), UF - 1); 4383 4384 cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch); 4385 } 4386 4387 // Before each round, move the insertion point right between 4388 // the PHIs and the values we are going to write. 4389 // This allows us to write both PHINodes and the extractelement 4390 // instructions. 4391 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4392 4393 setDebugLocFromInst(Builder, LoopExitInst); 4394 4395 Type *PhiTy = OrigPhi->getType(); 4396 // If the tail is folded by masking, the vector value to leave the loop should 4397 // be a Select choosing between the vectorized LoopExitInst and the vectorized 4398 // Phi, instead of the former alone. For an inloop reduction the reduction will 4399 // already be predicated, and does not need to be handled here.
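// E.g. with tail folding the vector body contains something along the lines of
//   %s = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
// and it is %s, not %rdx.next, that has to feed the final reduction (the names
// here are illustrative only).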
4400 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4401 for (unsigned Part = 0; Part < UF; ++Part) { 4402 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4403 Value *Sel = nullptr; 4404 for (User *U : VecLoopExitInst->users()) { 4405 if (isa<SelectInst>(U)) { 4406 assert(!Sel && "Reduction exit feeding two selects"); 4407 Sel = U; 4408 } else 4409 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4410 } 4411 assert(Sel && "Reduction exit feeds no select"); 4412 State.reset(LoopExitInstDef, Sel, Part); 4413 4414 // If the target can create a predicated operator for the reduction at no 4415 // extra cost in the loop (for example a predicated vadd), it can be 4416 // cheaper for the select to remain in the loop than be sunk out of it, 4417 // and so use the select value for the phi instead of the old 4418 // LoopExitValue. 4419 if (PreferPredicatedReductionSelect || 4420 TTI->preferPredicatedReductionSelect( 4421 RdxDesc.getOpcode(), PhiTy, 4422 TargetTransformInfo::ReductionFlags())) { 4423 auto *VecRdxPhi = 4424 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4425 VecRdxPhi->setIncomingValueForBlock( 4426 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4427 } 4428 } 4429 } 4430 4431 // If the vector reduction can be performed in a smaller type, we truncate 4432 // then extend the loop exit value to enable InstCombine to evaluate the 4433 // entire expression in the smaller type. 4434 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4435 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4436 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4437 Builder.SetInsertPoint( 4438 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4439 VectorParts RdxParts(UF); 4440 for (unsigned Part = 0; Part < UF; ++Part) { 4441 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4442 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4443 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4444 : Builder.CreateZExt(Trunc, VecTy); 4445 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4446 UI != RdxParts[Part]->user_end();) 4447 if (*UI != Trunc) { 4448 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4449 RdxParts[Part] = Extnd; 4450 } else { 4451 ++UI; 4452 } 4453 } 4454 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4455 for (unsigned Part = 0; Part < UF; ++Part) { 4456 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4457 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4458 } 4459 } 4460 4461 // Reduce all of the unrolled parts into a single vector. 4462 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4463 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4464 4465 // The middle block terminator has already been assigned a DebugLoc here (the 4466 // OrigLoop's single latch terminator). We want the whole middle block to 4467 // appear to execute on this line because: (a) it is all compiler generated, 4468 // (b) these instructions are always executed after evaluating the latch 4469 // conditional branch, and (c) other passes may add new predecessors which 4470 // terminate on this line. This is the easiest way to ensure we don't 4471 // accidentally cause an extra step back into the loop while debugging. 
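// For example, with UF = 2 an integer add reduction first combines the two
// unrolled part values with a single `add` (named "bin.rdx" below) before the
// remaining vector is reduced horizontally after the loop.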
4472 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4473 if (IsOrdered) 4474 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4475 else { 4476 // Floating-point operations should have some FMF to enable the reduction. 4477 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4478 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4479 for (unsigned Part = 1; Part < UF; ++Part) { 4480 Value *RdxPart = State.get(LoopExitInstDef, Part); 4481 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4482 ReducedPartRdx = Builder.CreateBinOp( 4483 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4484 } else { 4485 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4486 } 4487 } 4488 } 4489 4490 // Create the reduction after the loop. Note that inloop reductions create the 4491 // target reduction in the loop using a Reduction recipe. 4492 if (VF.isVector() && !IsInLoopReductionPhi) { 4493 ReducedPartRdx = 4494 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4495 // If the reduction can be performed in a smaller type, we need to extend 4496 // the reduction to the wider type before we branch to the original loop. 4497 if (PhiTy != RdxDesc.getRecurrenceType()) 4498 ReducedPartRdx = RdxDesc.isSigned() 4499 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4500 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4501 } 4502 4503 // Create a phi node that merges control-flow from the backedge-taken check 4504 // block and the middle block. 4505 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4506 LoopScalarPreHeader->getTerminator()); 4507 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4508 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4509 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4510 4511 // Now, we need to fix the users of the reduction variable 4512 // inside and outside of the scalar remainder loop. 4513 4514 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4515 // in the exit blocks. See comment on analogous loop in 4516 // fixFirstOrderRecurrence for a more complete explanation of the logic. 4517 if (!Cost->requiresScalarEpilogue()) 4518 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4519 if (any_of(LCSSAPhi.incoming_values(), 4520 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4521 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4522 4523 // Fix the scalar loop reduction variable with the incoming reduction sum 4524 // from the vector body and from the backedge value. 4525 int IncomingEdgeBlockIdx = 4526 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4527 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4528 // Pick the other block. 4529 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 4530 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4531 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4532 } 4533 4534 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4535 VPTransformState &State) { 4536 RecurKind RK = RdxDesc.getRecurrenceKind(); 4537 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4538 return; 4539 4540 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4541 assert(LoopExitInstr && "null loop exit instruction"); 4542 SmallVector<Instruction *, 8> Worklist; 4543 SmallPtrSet<Instruction *, 8> Visited; 4544 Worklist.push_back(LoopExitInstr); 4545 Visited.insert(LoopExitInstr); 4546 4547 while (!Worklist.empty()) { 4548 Instruction *Cur = Worklist.pop_back_val(); 4549 if (isa<OverflowingBinaryOperator>(Cur)) 4550 for (unsigned Part = 0; Part < UF; ++Part) { 4551 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4552 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4553 } 4554 4555 for (User *U : Cur->users()) { 4556 Instruction *UI = cast<Instruction>(U); 4557 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4558 Visited.insert(UI).second) 4559 Worklist.push_back(UI); 4560 } 4561 } 4562 } 4563 4564 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4565 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4566 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4567 // Some phis were already hand updated by the reduction and recurrence 4568 // code above, leave them alone. 4569 continue; 4570 4571 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4572 // Non-instruction incoming values will have only one value. 4573 4574 VPLane Lane = VPLane::getFirstLane(); 4575 if (isa<Instruction>(IncomingValue) && 4576 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4577 VF)) 4578 Lane = VPLane::getLastLaneForVF(VF); 4579 4580 // Can be a loop invariant incoming value or the last scalar value to be 4581 // extracted from the vectorized loop. 4582 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4583 Value *lastIncomingValue = 4584 OrigLoop->isLoopInvariant(IncomingValue) 4585 ? IncomingValue 4586 : State.get(State.Plan->getVPValue(IncomingValue), 4587 VPIteration(UF - 1, Lane)); 4588 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4589 } 4590 } 4591 4592 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4593 // The basic block and loop containing the predicated instruction. 4594 auto *PredBB = PredInst->getParent(); 4595 auto *VectorLoop = LI->getLoopFor(PredBB); 4596 4597 // Initialize a worklist with the operands of the predicated instruction. 4598 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4599 4600 // Holds instructions that we need to analyze again. An instruction may be 4601 // reanalyzed if we don't yet know if we can sink it or not. 4602 SmallVector<Instruction *, 8> InstsToReanalyze; 4603 4604 // Returns true if a given use occurs in the predicated block. Phi nodes use 4605 // their operands in their corresponding predecessor blocks. 4606 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4607 auto *I = cast<Instruction>(U.getUser()); 4608 BasicBlock *BB = I->getParent(); 4609 if (auto *Phi = dyn_cast<PHINode>(I)) 4610 BB = Phi->getIncomingBlock( 4611 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4612 return BB == PredBB; 4613 }; 4614 4615 // Iteratively sink the scalarized operands of the predicated instruction 4616 // into the block we created for it. 
When an instruction is sunk, it's 4617 // operands are then added to the worklist. The algorithm ends after one pass 4618 // through the worklist doesn't sink a single instruction. 4619 bool Changed; 4620 do { 4621 // Add the instructions that need to be reanalyzed to the worklist, and 4622 // reset the changed indicator. 4623 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4624 InstsToReanalyze.clear(); 4625 Changed = false; 4626 4627 while (!Worklist.empty()) { 4628 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4629 4630 // We can't sink an instruction if it is a phi node, is already in the 4631 // predicated block, is not in the loop, or may have side effects. 4632 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4633 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4634 continue; 4635 4636 // It's legal to sink the instruction if all its uses occur in the 4637 // predicated block. Otherwise, there's nothing to do yet, and we may 4638 // need to reanalyze the instruction. 4639 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4640 InstsToReanalyze.push_back(I); 4641 continue; 4642 } 4643 4644 // Move the instruction to the beginning of the predicated block, and add 4645 // it's operands to the worklist. 4646 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4647 Worklist.insert(I->op_begin(), I->op_end()); 4648 4649 // The sinking may have enabled other instructions to be sunk, so we will 4650 // need to iterate. 4651 Changed = true; 4652 } 4653 } while (Changed); 4654 } 4655 4656 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4657 for (PHINode *OrigPhi : OrigPHIsToFix) { 4658 VPWidenPHIRecipe *VPPhi = 4659 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4660 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4661 // Make sure the builder has a valid insert point. 4662 Builder.SetInsertPoint(NewPhi); 4663 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4664 VPValue *Inc = VPPhi->getIncomingValue(i); 4665 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4666 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4667 } 4668 } 4669 } 4670 4671 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4672 VPUser &Operands, unsigned UF, 4673 ElementCount VF, bool IsPtrLoopInvariant, 4674 SmallBitVector &IsIndexLoopInvariant, 4675 VPTransformState &State) { 4676 // Construct a vector GEP by widening the operands of the scalar GEP as 4677 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4678 // results in a vector of pointers when at least one operand of the GEP 4679 // is vector-typed. Thus, to keep the representation compact, we only use 4680 // vector-typed operands for loop-varying values. 4681 4682 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4683 // If we are vectorizing, but the GEP has only loop-invariant operands, 4684 // the GEP we build (by only using vector-typed operands for 4685 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4686 // produce a vector of pointers, we need to either arbitrarily pick an 4687 // operand to broadcast, or broadcast a clone of the original GEP. 4688 // Here, we broadcast a clone of the original. 4689 // 4690 // TODO: If at some point we decide to scalarize instructions having 4691 // loop-invariant operands, this special case will no longer be 4692 // required. 
We would add the scalarization decision to 4693 // collectLoopScalars() and teach getVectorValue() to broadcast 4694 // the lane-zero scalar value. 4695 auto *Clone = Builder.Insert(GEP->clone()); 4696 for (unsigned Part = 0; Part < UF; ++Part) { 4697 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4698 State.set(VPDef, EntryPart, Part); 4699 addMetadata(EntryPart, GEP); 4700 } 4701 } else { 4702 // If the GEP has at least one loop-varying operand, we are sure to 4703 // produce a vector of pointers. But if we are only unrolling, we want 4704 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4705 // produce with the code below will be scalar (if VF == 1) or vector 4706 // (otherwise). Note that for the unroll-only case, we still maintain 4707 // values in the vector mapping with initVector, as we do for other 4708 // instructions. 4709 for (unsigned Part = 0; Part < UF; ++Part) { 4710 // The pointer operand of the new GEP. If it's loop-invariant, we 4711 // won't broadcast it. 4712 auto *Ptr = IsPtrLoopInvariant 4713 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4714 : State.get(Operands.getOperand(0), Part); 4715 4716 // Collect all the indices for the new GEP. If any index is 4717 // loop-invariant, we won't broadcast it. 4718 SmallVector<Value *, 4> Indices; 4719 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4720 VPValue *Operand = Operands.getOperand(I); 4721 if (IsIndexLoopInvariant[I - 1]) 4722 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4723 else 4724 Indices.push_back(State.get(Operand, Part)); 4725 } 4726 4727 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4728 // but it should be a vector, otherwise. 4729 auto *NewGEP = 4730 GEP->isInBounds() 4731 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4732 Indices) 4733 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4734 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4735 "NewGEP is not a pointer vector"); 4736 State.set(VPDef, NewGEP, Part); 4737 addMetadata(NewGEP, GEP); 4738 } 4739 } 4740 } 4741 4742 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4743 RecurrenceDescriptor *RdxDesc, 4744 VPWidenPHIRecipe *PhiR, 4745 VPTransformState &State) { 4746 PHINode *P = cast<PHINode>(PN); 4747 if (EnableVPlanNativePath) { 4748 // Currently we enter here in the VPlan-native path for non-induction 4749 // PHIs where all control flow is uniform. We simply widen these PHIs. 4750 // Create a vector phi with no operands - the vector phi operands will be 4751 // set at the end of vector code generation. 4752 Type *VecTy = (State.VF.isScalar()) 4753 ? PN->getType() 4754 : VectorType::get(PN->getType(), State.VF); 4755 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4756 State.set(PhiR, VecPhi, 0); 4757 OrigPHIsToFix.push_back(P); 4758 4759 return; 4760 } 4761 4762 assert(PN->getParent() == OrigLoop->getHeader() && 4763 "Non-header phis should have been handled elsewhere"); 4764 4765 VPValue *StartVPV = PhiR->getStartValue(); 4766 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr; 4767 // In order to support recurrences we need to be able to vectorize Phi nodes. 4768 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4769 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4770 // this value when we vectorize all of the instructions that use the PHI. 
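// For instance, a plain add reduction with VF = 4 gets a `phi <4 x i32>` whose
// preheader operand is the identity splat <0, 0, 0, 0> with the scalar start
// value inserted into lane 0; the backedge operand is only filled in later, by
// fixReduction().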
4771 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) { 4772 Value *Iden = nullptr; 4773 bool ScalarPHI = 4774 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN)); 4775 Type *VecTy = 4776 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF); 4777 4778 if (RdxDesc) { 4779 assert(Legal->isReductionVariable(P) && StartV && 4780 "RdxDesc should only be set for reduction variables; in that case " 4781 "a StartV is also required"); 4782 RecurKind RK = RdxDesc->getRecurrenceKind(); 4783 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) { 4784 // MinMax reductions have the start value as their identity. 4785 if (ScalarPHI) { 4786 Iden = StartV; 4787 } else { 4788 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4789 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4790 StartV = Iden = 4791 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident"); 4792 } 4793 } else { 4794 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4795 RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags()); 4796 Iden = IdenC; 4797 4798 if (!ScalarPHI) { 4799 Iden = ConstantVector::getSplat(State.VF, IdenC); 4800 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4801 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4802 Constant *Zero = Builder.getInt32(0); 4803 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4804 } 4805 } 4806 } 4807 4808 bool IsOrdered = State.VF.isVector() && 4809 Cost->isInLoopReduction(cast<PHINode>(PN)) && 4810 useOrderedReductions(*RdxDesc); 4811 4812 for (unsigned Part = 0; Part < State.UF; ++Part) { 4813 // This is phase one of vectorizing PHIs. 4814 if (Part > 0 && IsOrdered) 4815 return; 4816 Value *EntryPart = PHINode::Create( 4817 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4818 State.set(PhiR, EntryPart, Part); 4819 if (StartV) { 4820 // Make sure to add the reduction start value only to the 4821 // first unroll part. 4822 Value *StartVal = (Part == 0) ? StartV : Iden; 4823 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4824 } 4825 } 4826 return; 4827 } 4828 4829 assert(!Legal->isReductionVariable(P) && 4830 "reductions should be handled above"); 4831 4832 setDebugLocFromInst(Builder, P); 4833 4834 // This PHINode must be an induction variable. 4835 // Make sure that we know about it. 4836 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4837 4838 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4839 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4840 4841 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4842 // which can be found from the original scalar operations. 4843 switch (II.getKind()) { 4844 case InductionDescriptor::IK_NoInduction: 4845 llvm_unreachable("Unknown induction"); 4846 case InductionDescriptor::IK_IntInduction: 4847 case InductionDescriptor::IK_FpInduction: 4848 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4849 case InductionDescriptor::IK_PtrInduction: { 4850 // Handle the pointer induction variable case. 4851 assert(P->getType()->isPointerTy() && "Unexpected type."); 4852 4853 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4854 // This is the normalized GEP that starts counting at zero. 4855 Value *PtrInd = 4856 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4857 // Determine the number of scalars we need to generate for each unroll 4858 // iteration.
If the instruction is uniform, we only need to generate the 4859 // first lane. Otherwise, we generate all VF values. 4860 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4861 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4862 4863 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4864 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4865 if (NeedsVectorIndex) { 4866 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4867 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4868 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4869 } 4870 4871 for (unsigned Part = 0; Part < UF; ++Part) { 4872 Value *PartStart = createStepForVF( 4873 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4874 4875 if (NeedsVectorIndex) { 4876 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4877 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4878 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4879 Value *SclrGep = 4880 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4881 SclrGep->setName("next.gep"); 4882 State.set(PhiR, SclrGep, Part); 4883 // We've cached the whole vector, which means we can support the 4884 // extraction of any lane. 4885 continue; 4886 } 4887 4888 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4889 Value *Idx = Builder.CreateAdd( 4890 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4891 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4892 Value *SclrGep = 4893 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4894 SclrGep->setName("next.gep"); 4895 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4896 } 4897 } 4898 return; 4899 } 4900 assert(isa<SCEVConstant>(II.getStep()) && 4901 "Induction step not a SCEV constant!"); 4902 Type *PhiType = II.getStep()->getType(); 4903 4904 // Build a pointer phi 4905 Value *ScalarStartValue = II.getStartValue(); 4906 Type *ScStValueType = ScalarStartValue->getType(); 4907 PHINode *NewPointerPhi = 4908 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4909 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4910 4911 // A pointer induction, performed by using a gep 4912 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4913 Instruction *InductionLoc = LoopLatch->getTerminator(); 4914 const SCEV *ScalarStep = II.getStep(); 4915 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4916 Value *ScalarStepValue = 4917 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4918 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4919 Value *NumUnrolledElems = 4920 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4921 Value *InductionGEP = GetElementPtrInst::Create( 4922 ScStValueType->getPointerElementType(), NewPointerPhi, 4923 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4924 InductionLoc); 4925 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4926 4927 // Create UF many actual address geps that use the pointer 4928 // phi as base and a vectorized version of the step value 4929 // (<step*0, ..., step*N>) as offset. 4930 for (unsigned Part = 0; Part < State.UF; ++Part) { 4931 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4932 Value *StartOffsetScalar = 4933 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4934 Value *StartOffset = 4935 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4936 // Create a vector of consecutive numbers from zero to VF. 
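// E.g. for Part == 1 with VF = 4 this produces the offsets <4, 5, 6, 7>, and
// the GEP below then addresses pointer.phi + <4, 5, 6, 7> * step.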
4937 StartOffset = 4938 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4939 4940 Value *GEP = Builder.CreateGEP( 4941 ScStValueType->getPointerElementType(), NewPointerPhi, 4942 Builder.CreateMul( 4943 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4944 "vector.gep")); 4945 State.set(PhiR, GEP, Part); 4946 } 4947 } 4948 } 4949 } 4950 4951 /// A helper function for checking whether an integer division-related 4952 /// instruction may divide by zero (in which case it must be predicated if 4953 /// executed conditionally in the scalar code). 4954 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4955 /// Non-zero divisors that are non compile-time constants will not be 4956 /// converted into multiplication, so we will still end up scalarizing 4957 /// the division, but can do so w/o predication. 4958 static bool mayDivideByZero(Instruction &I) { 4959 assert((I.getOpcode() == Instruction::UDiv || 4960 I.getOpcode() == Instruction::SDiv || 4961 I.getOpcode() == Instruction::URem || 4962 I.getOpcode() == Instruction::SRem) && 4963 "Unexpected instruction"); 4964 Value *Divisor = I.getOperand(1); 4965 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4966 return !CInt || CInt->isZero(); 4967 } 4968 4969 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4970 VPUser &User, 4971 VPTransformState &State) { 4972 switch (I.getOpcode()) { 4973 case Instruction::Call: 4974 case Instruction::Br: 4975 case Instruction::PHI: 4976 case Instruction::GetElementPtr: 4977 case Instruction::Select: 4978 llvm_unreachable("This instruction is handled by a different recipe."); 4979 case Instruction::UDiv: 4980 case Instruction::SDiv: 4981 case Instruction::SRem: 4982 case Instruction::URem: 4983 case Instruction::Add: 4984 case Instruction::FAdd: 4985 case Instruction::Sub: 4986 case Instruction::FSub: 4987 case Instruction::FNeg: 4988 case Instruction::Mul: 4989 case Instruction::FMul: 4990 case Instruction::FDiv: 4991 case Instruction::FRem: 4992 case Instruction::Shl: 4993 case Instruction::LShr: 4994 case Instruction::AShr: 4995 case Instruction::And: 4996 case Instruction::Or: 4997 case Instruction::Xor: { 4998 // Just widen unops and binops. 4999 setDebugLocFromInst(Builder, &I); 5000 5001 for (unsigned Part = 0; Part < UF; ++Part) { 5002 SmallVector<Value *, 2> Ops; 5003 for (VPValue *VPOp : User.operands()) 5004 Ops.push_back(State.get(VPOp, Part)); 5005 5006 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 5007 5008 if (auto *VecOp = dyn_cast<Instruction>(V)) 5009 VecOp->copyIRFlags(&I); 5010 5011 // Use this vector value for all users of the original instruction. 5012 State.set(Def, V, Part); 5013 addMetadata(V, &I); 5014 } 5015 5016 break; 5017 } 5018 case Instruction::ICmp: 5019 case Instruction::FCmp: { 5020 // Widen compares. Generate vector compares. 5021 bool FCmp = (I.getOpcode() == Instruction::FCmp); 5022 auto *Cmp = cast<CmpInst>(&I); 5023 setDebugLocFromInst(Builder, Cmp); 5024 for (unsigned Part = 0; Part < UF; ++Part) { 5025 Value *A = State.get(User.getOperand(0), Part); 5026 Value *B = State.get(User.getOperand(1), Part); 5027 Value *C = nullptr; 5028 if (FCmp) { 5029 // Propagate fast math flags. 
5030 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 5031 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 5032 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 5033 } else { 5034 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 5035 } 5036 State.set(Def, C, Part); 5037 addMetadata(C, &I); 5038 } 5039 5040 break; 5041 } 5042 5043 case Instruction::ZExt: 5044 case Instruction::SExt: 5045 case Instruction::FPToUI: 5046 case Instruction::FPToSI: 5047 case Instruction::FPExt: 5048 case Instruction::PtrToInt: 5049 case Instruction::IntToPtr: 5050 case Instruction::SIToFP: 5051 case Instruction::UIToFP: 5052 case Instruction::Trunc: 5053 case Instruction::FPTrunc: 5054 case Instruction::BitCast: { 5055 auto *CI = cast<CastInst>(&I); 5056 setDebugLocFromInst(Builder, CI); 5057 5058 /// Vectorize casts. 5059 Type *DestTy = 5060 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 5061 5062 for (unsigned Part = 0; Part < UF; ++Part) { 5063 Value *A = State.get(User.getOperand(0), Part); 5064 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 5065 State.set(Def, Cast, Part); 5066 addMetadata(Cast, &I); 5067 } 5068 break; 5069 } 5070 default: 5071 // This instruction is not vectorized by simple widening. 5072 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 5073 llvm_unreachable("Unhandled instruction!"); 5074 } // end of switch. 5075 } 5076 5077 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 5078 VPUser &ArgOperands, 5079 VPTransformState &State) { 5080 assert(!isa<DbgInfoIntrinsic>(I) && 5081 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 5082 setDebugLocFromInst(Builder, &I); 5083 5084 Module *M = I.getParent()->getParent()->getParent(); 5085 auto *CI = cast<CallInst>(&I); 5086 5087 SmallVector<Type *, 4> Tys; 5088 for (Value *ArgOperand : CI->arg_operands()) 5089 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 5090 5091 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 5092 5093 // The flag shows whether we use Intrinsic or a usual Call for vectorized 5094 // version of the instruction. 5095 // Is it beneficial to perform intrinsic call compared to lib call? 5096 bool NeedToScalarize = false; 5097 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 5098 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 5099 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 5100 assert((UseVectorIntrinsic || !NeedToScalarize) && 5101 "Instruction should be scalarized elsewhere."); 5102 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 5103 "Either the intrinsic cost or vector call cost must be valid"); 5104 5105 for (unsigned Part = 0; Part < UF; ++Part) { 5106 SmallVector<Value *, 4> Args; 5107 for (auto &I : enumerate(ArgOperands.operands())) { 5108 // Some intrinsics have a scalar argument - don't replace it with a 5109 // vector. 5110 Value *Arg; 5111 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5112 Arg = State.get(I.value(), Part); 5113 else 5114 Arg = State.get(I.value(), VPIteration(0, 0)); 5115 Args.push_back(Arg); 5116 } 5117 5118 Function *VectorF; 5119 if (UseVectorIntrinsic) { 5120 // Use vector version of the intrinsic. 
5121 Type *TysForDecl[] = {CI->getType()}; 5122 if (VF.isVector()) 5123 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5124 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5125 assert(VectorF && "Can't retrieve vector intrinsic."); 5126 } else { 5127 // Use vector version of the function call. 5128 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5129 #ifndef NDEBUG 5130 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5131 "Can't create vector function."); 5132 #endif 5133 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5134 } 5135 SmallVector<OperandBundleDef, 1> OpBundles; 5136 CI->getOperandBundlesAsDefs(OpBundles); 5137 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5138 5139 if (isa<FPMathOperator>(V)) 5140 V->copyFastMathFlags(CI); 5141 5142 State.set(Def, V, Part); 5143 addMetadata(V, &I); 5144 } 5145 } 5146 5147 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5148 VPUser &Operands, 5149 bool InvariantCond, 5150 VPTransformState &State) { 5151 setDebugLocFromInst(Builder, &I); 5152 5153 // The condition can be loop invariant but still defined inside the 5154 // loop. This means that we can't just use the original 'cond' value. 5155 // We have to take the 'vectorized' value and pick the first lane. 5156 // Instcombine will make this a no-op. 5157 auto *InvarCond = InvariantCond 5158 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5159 : nullptr; 5160 5161 for (unsigned Part = 0; Part < UF; ++Part) { 5162 Value *Cond = 5163 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5164 Value *Op0 = State.get(Operands.getOperand(1), Part); 5165 Value *Op1 = State.get(Operands.getOperand(2), Part); 5166 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5167 State.set(VPDef, Sel, Part); 5168 addMetadata(Sel, &I); 5169 } 5170 } 5171 5172 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5173 // We should not collect Scalars more than once per VF. Right now, this 5174 // function is called from collectUniformsAndScalars(), which already does 5175 // this check. Collecting Scalars for VF=1 does not make any sense. 5176 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5177 "This function should not be visited twice for the same VF"); 5178 5179 SmallSetVector<Instruction *, 8> Worklist; 5180 5181 // These sets are used to seed the analysis with pointers used by memory 5182 // accesses that will remain scalar. 5183 SmallSetVector<Instruction *, 8> ScalarPtrs; 5184 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5185 auto *Latch = TheLoop->getLoopLatch(); 5186 5187 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5188 // The pointer operands of loads and stores will be scalar as long as the 5189 // memory access is not a gather or scatter operation. The value operand of a 5190 // store will remain scalar if the store is scalarized. 
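// For example, the address feeding a widened, interleaved or scalarized access
// can stay scalar, whereas a gather/scatter consumes a vector of pointers and
// therefore keeps its pointer operand vector.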
5191 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5192 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5193 assert(WideningDecision != CM_Unknown && 5194 "Widening decision should be ready at this moment"); 5195 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5196 if (Ptr == Store->getValueOperand()) 5197 return WideningDecision == CM_Scalarize; 5198 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5199 "Ptr is neither a value nor a pointer operand"); 5200 return WideningDecision != CM_GatherScatter; 5201 }; 5202 5203 // A helper that returns true if the given value is a bitcast or 5204 // getelementptr instruction contained in the loop. 5205 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5206 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5207 isa<GetElementPtrInst>(V)) && 5208 !TheLoop->isLoopInvariant(V); 5209 }; 5210 5211 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5212 if (!isa<PHINode>(Ptr) || 5213 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5214 return false; 5215 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5216 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5217 return false; 5218 return isScalarUse(MemAccess, Ptr); 5219 }; 5220 5221 // A helper that evaluates a memory access's use of a pointer. If the 5222 // pointer is actually the pointer induction of a loop, it is inserted 5223 // into the Worklist. If the use will be a scalar use, and the 5224 // pointer is only used by memory accesses, we place the pointer in 5225 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5226 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5227 if (isScalarPtrInduction(MemAccess, Ptr)) { 5228 Worklist.insert(cast<Instruction>(Ptr)); 5229 Instruction *Update = cast<Instruction>( 5230 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5231 Worklist.insert(Update); 5232 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5233 << "\n"); 5234 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5235 << "\n"); 5236 return; 5237 } 5238 // We only care about bitcast and getelementptr instructions contained in 5239 // the loop. 5240 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5241 return; 5242 5243 // If the pointer has already been identified as scalar (e.g., if it was 5244 // also identified as uniform), there's nothing to do. 5245 auto *I = cast<Instruction>(Ptr); 5246 if (Worklist.count(I)) 5247 return; 5248 5249 // If the use of the pointer will be a scalar use, and all users of the 5250 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5251 // place the pointer in PossibleNonScalarPtrs. 5252 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5253 return isa<LoadInst>(U) || isa<StoreInst>(U); 5254 })) 5255 ScalarPtrs.insert(I); 5256 else 5257 PossibleNonScalarPtrs.insert(I); 5258 }; 5259 5260 // We seed the scalars analysis with two classes of instructions: (1) 5261 // instructions marked uniform-after-vectorization and (2) bitcast, 5262 // getelementptr and (pointer) phi instructions used by memory accesses 5263 // requiring a scalar use. 5264 // 5265 // (1) Add to the worklist all instructions that have been identified as 5266 // uniform-after-vectorization.
5267 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5268 5269 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5270 // memory accesses requiring a scalar use. The pointer operands of loads and 5271 // stores will be scalar as long as the memory accesses is not a gather or 5272 // scatter operation. The value operand of a store will remain scalar if the 5273 // store is scalarized. 5274 for (auto *BB : TheLoop->blocks()) 5275 for (auto &I : *BB) { 5276 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5277 evaluatePtrUse(Load, Load->getPointerOperand()); 5278 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5279 evaluatePtrUse(Store, Store->getPointerOperand()); 5280 evaluatePtrUse(Store, Store->getValueOperand()); 5281 } 5282 } 5283 for (auto *I : ScalarPtrs) 5284 if (!PossibleNonScalarPtrs.count(I)) { 5285 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5286 Worklist.insert(I); 5287 } 5288 5289 // Insert the forced scalars. 5290 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5291 // induction variable when the PHI user is scalarized. 5292 auto ForcedScalar = ForcedScalars.find(VF); 5293 if (ForcedScalar != ForcedScalars.end()) 5294 for (auto *I : ForcedScalar->second) 5295 Worklist.insert(I); 5296 5297 // Expand the worklist by looking through any bitcasts and getelementptr 5298 // instructions we've already identified as scalar. This is similar to the 5299 // expansion step in collectLoopUniforms(); however, here we're only 5300 // expanding to include additional bitcasts and getelementptr instructions. 5301 unsigned Idx = 0; 5302 while (Idx != Worklist.size()) { 5303 Instruction *Dst = Worklist[Idx++]; 5304 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5305 continue; 5306 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5307 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5308 auto *J = cast<Instruction>(U); 5309 return !TheLoop->contains(J) || Worklist.count(J) || 5310 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5311 isScalarUse(J, Src)); 5312 })) { 5313 Worklist.insert(Src); 5314 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5315 } 5316 } 5317 5318 // An induction variable will remain scalar if all users of the induction 5319 // variable and induction variable update remain scalar. 5320 for (auto &Induction : Legal->getInductionVars()) { 5321 auto *Ind = Induction.first; 5322 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5323 5324 // If tail-folding is applied, the primary induction variable will be used 5325 // to feed a vector compare. 5326 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5327 continue; 5328 5329 // Determine if all users of the induction variable are scalar after 5330 // vectorization. 5331 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5332 auto *I = cast<Instruction>(U); 5333 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5334 }); 5335 if (!ScalarInd) 5336 continue; 5337 5338 // Determine if all users of the induction variable update instruction are 5339 // scalar after vectorization. 5340 auto ScalarIndUpdate = 5341 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5342 auto *I = cast<Instruction>(U); 5343 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5344 }); 5345 if (!ScalarIndUpdate) 5346 continue; 5347 5348 // The induction variable and its update instruction will remain scalar. 
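// For example, an IV `i` whose only in-loop users are its own update `i.next`
// and address computations already found to be scalar stays scalar; if any
// user is widened, the vector IV must be kept instead.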
5349 Worklist.insert(Ind); 5350 Worklist.insert(IndUpdate); 5351 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5352 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5353 << "\n"); 5354 } 5355 5356 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5357 } 5358 5359 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5360 if (!blockNeedsPredication(I->getParent())) 5361 return false; 5362 switch(I->getOpcode()) { 5363 default: 5364 break; 5365 case Instruction::Load: 5366 case Instruction::Store: { 5367 if (!Legal->isMaskRequired(I)) 5368 return false; 5369 auto *Ptr = getLoadStorePointerOperand(I); 5370 auto *Ty = getMemInstValueType(I); 5371 const Align Alignment = getLoadStoreAlignment(I); 5372 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5373 isLegalMaskedGather(Ty, Alignment)) 5374 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5375 isLegalMaskedScatter(Ty, Alignment)); 5376 } 5377 case Instruction::UDiv: 5378 case Instruction::SDiv: 5379 case Instruction::SRem: 5380 case Instruction::URem: 5381 return mayDivideByZero(*I); 5382 } 5383 return false; 5384 } 5385 5386 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5387 Instruction *I, ElementCount VF) { 5388 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5389 assert(getWideningDecision(I, VF) == CM_Unknown && 5390 "Decision should not be set yet."); 5391 auto *Group = getInterleavedAccessGroup(I); 5392 assert(Group && "Must have a group."); 5393 5394 // If the instruction's allocated size doesn't equal it's type size, it 5395 // requires padding and will be scalarized. 5396 auto &DL = I->getModule()->getDataLayout(); 5397 auto *ScalarTy = getMemInstValueType(I); 5398 if (hasIrregularType(ScalarTy, DL)) 5399 return false; 5400 5401 // Check if masking is required. 5402 // A Group may need masking for one of two reasons: it resides in a block that 5403 // needs predication, or it was decided to use masking to deal with gaps. 5404 bool PredicatedAccessRequiresMasking = 5405 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5406 bool AccessWithGapsRequiresMasking = 5407 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5408 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5409 return true; 5410 5411 // If masked interleaving is required, we expect that the user/target had 5412 // enabled it, because otherwise it either wouldn't have been created or 5413 // it should have been invalidated by the CostModel. 5414 assert(useMaskedInterleavedAccesses(TTI) && 5415 "Masked interleave-groups for predicated accesses are not enabled."); 5416 5417 auto *Ty = getMemInstValueType(I); 5418 const Align Alignment = getLoadStoreAlignment(I); 5419 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5420 : TTI.isLegalMaskedStore(Ty, Alignment); 5421 } 5422 5423 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5424 Instruction *I, ElementCount VF) { 5425 // Get and ensure we have a valid memory instruction. 5426 LoadInst *LI = dyn_cast<LoadInst>(I); 5427 StoreInst *SI = dyn_cast<StoreInst>(I); 5428 assert((LI || SI) && "Invalid memory instruction"); 5429 5430 auto *Ptr = getLoadStorePointerOperand(I); 5431 5432 // In order to be widened, the pointer should be consecutive, first of all. 
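// E.g. a unit-stride access such as a[i] can become a single wide load or
// store, while a[2 * i] is not consecutive and would have to be handled as a
// gather/scatter or be scalarized instead.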
5433 if (!Legal->isConsecutivePtr(Ptr))
5434 return false;
5435
5436 // If the instruction is a store located in a predicated block, it will be
5437 // scalarized.
5438 if (isScalarWithPredication(I))
5439 return false;
5440
5441 // If the instruction's allocated size doesn't equal its type size, it
5442 // requires padding and will be scalarized.
5443 auto &DL = I->getModule()->getDataLayout();
5444 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5445 if (hasIrregularType(ScalarTy, DL))
5446 return false;
5447
5448 return true;
5449 }
5450
5451 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5452 // We should not collect Uniforms more than once per VF. Right now,
5453 // this function is called from collectUniformsAndScalars(), which
5454 // already does this check. Collecting Uniforms for VF=1 does not make any
5455 // sense.
5456
5457 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5458 "This function should not be visited twice for the same VF");
5459
5460 // Visit the list of Uniforms. Even if we do not find any uniform value, the
5461 // (empty) entry ensures we will not analyze again; Uniforms.count(VF) will return 1.
5462 Uniforms[VF].clear();
5463
5464 // We now know that the loop is vectorizable!
5465 // Collect instructions inside the loop that will remain uniform after
5466 // vectorization.
5467
5468 // Global values, params and instructions outside of the current loop are out
5469 // of scope.
5470 auto isOutOfScope = [&](Value *V) -> bool {
5471 Instruction *I = dyn_cast<Instruction>(V);
5472 return (!I || !TheLoop->contains(I));
5473 };
5474
5475 SetVector<Instruction *> Worklist;
5476 BasicBlock *Latch = TheLoop->getLoopLatch();
5477
5478 // Instructions that are scalar with predication must not be considered
5479 // uniform after vectorization, because that would create an erroneous
5480 // replicating region where only a single instance out of VF should be formed.
5481 // TODO: optimize such seldom cases if found important, see PR40816.
5482 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5483 if (isOutOfScope(I)) {
5484 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5485 << *I << "\n");
5486 return;
5487 }
5488 if (isScalarWithPredication(I)) {
5489 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5490 << *I << "\n");
5491 return;
5492 }
5493 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5494 Worklist.insert(I);
5495 };
5496
5497 // Start with the conditional branch. If the branch condition is an
5498 // instruction contained in the loop that is only used by the branch, it is
5499 // uniform.
5500 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5501 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5502 addToWorklistIfAllowed(Cmp);
5503
5504 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5505 InstWidening WideningDecision = getWideningDecision(I, VF);
5506 assert(WideningDecision != CM_Unknown &&
5507 "Widening decision should be ready at this moment");
5508
5509 // A uniform memory op is itself uniform. We exclude uniform stores
5510 // here as they demand the last lane, not the first one.
5511 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5512 assert(WideningDecision == CM_Scalarize); 5513 return true; 5514 } 5515 5516 return (WideningDecision == CM_Widen || 5517 WideningDecision == CM_Widen_Reverse || 5518 WideningDecision == CM_Interleave); 5519 }; 5520 5521 5522 // Returns true if Ptr is the pointer operand of a memory access instruction 5523 // I, and I is known to not require scalarization. 5524 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5525 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5526 }; 5527 5528 // Holds a list of values which are known to have at least one uniform use. 5529 // Note that there may be other uses which aren't uniform. A "uniform use" 5530 // here is something which only demands lane 0 of the unrolled iterations; 5531 // it does not imply that all lanes produce the same value (e.g. this is not 5532 // the usual meaning of uniform) 5533 SetVector<Value *> HasUniformUse; 5534 5535 // Scan the loop for instructions which are either a) known to have only 5536 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5537 for (auto *BB : TheLoop->blocks()) 5538 for (auto &I : *BB) { 5539 // If there's no pointer operand, there's nothing to do. 5540 auto *Ptr = getLoadStorePointerOperand(&I); 5541 if (!Ptr) 5542 continue; 5543 5544 // A uniform memory op is itself uniform. We exclude uniform stores 5545 // here as they demand the last lane, not the first one. 5546 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5547 addToWorklistIfAllowed(&I); 5548 5549 if (isUniformDecision(&I, VF)) { 5550 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5551 HasUniformUse.insert(Ptr); 5552 } 5553 } 5554 5555 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5556 // demanding) users. Since loops are assumed to be in LCSSA form, this 5557 // disallows uses outside the loop as well. 5558 for (auto *V : HasUniformUse) { 5559 if (isOutOfScope(V)) 5560 continue; 5561 auto *I = cast<Instruction>(V); 5562 auto UsersAreMemAccesses = 5563 llvm::all_of(I->users(), [&](User *U) -> bool { 5564 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5565 }); 5566 if (UsersAreMemAccesses) 5567 addToWorklistIfAllowed(I); 5568 } 5569 5570 // Expand Worklist in topological order: whenever a new instruction 5571 // is added , its users should be already inside Worklist. It ensures 5572 // a uniform instruction will only be used by uniform instructions. 5573 unsigned idx = 0; 5574 while (idx != Worklist.size()) { 5575 Instruction *I = Worklist[idx++]; 5576 5577 for (auto OV : I->operand_values()) { 5578 // isOutOfScope operands cannot be uniform instructions. 5579 if (isOutOfScope(OV)) 5580 continue; 5581 // First order recurrence Phi's should typically be considered 5582 // non-uniform. 5583 auto *OP = dyn_cast<PHINode>(OV); 5584 if (OP && Legal->isFirstOrderRecurrence(OP)) 5585 continue; 5586 // If all the users of the operand are uniform, then add the 5587 // operand into the uniform worklist. 5588 auto *OI = cast<Instruction>(OV); 5589 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5590 auto *J = cast<Instruction>(U); 5591 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5592 })) 5593 addToWorklistIfAllowed(OI); 5594 } 5595 } 5596 5597 // For an instruction to be added into Worklist above, all its users inside 5598 // the loop should also be in Worklist. 
However, this condition cannot be 5599 // true for phi nodes that form a cyclic dependence. We must process phi 5600 // nodes separately. An induction variable will remain uniform if all users 5601 // of the induction variable and induction variable update remain uniform. 5602 // The code below handles both pointer and non-pointer induction variables. 5603 for (auto &Induction : Legal->getInductionVars()) { 5604 auto *Ind = Induction.first; 5605 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5606 5607 // Determine if all users of the induction variable are uniform after 5608 // vectorization. 5609 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5610 auto *I = cast<Instruction>(U); 5611 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5612 isVectorizedMemAccessUse(I, Ind); 5613 }); 5614 if (!UniformInd) 5615 continue; 5616 5617 // Determine if all users of the induction variable update instruction are 5618 // uniform after vectorization. 5619 auto UniformIndUpdate = 5620 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5621 auto *I = cast<Instruction>(U); 5622 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5623 isVectorizedMemAccessUse(I, IndUpdate); 5624 }); 5625 if (!UniformIndUpdate) 5626 continue; 5627 5628 // The induction variable and its update instruction will remain uniform. 5629 addToWorklistIfAllowed(Ind); 5630 addToWorklistIfAllowed(IndUpdate); 5631 } 5632 5633 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5634 } 5635 5636 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5637 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5638 5639 if (Legal->getRuntimePointerChecking()->Need) { 5640 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5641 "runtime pointer checks needed. Enable vectorization of this " 5642 "loop with '#pragma clang loop vectorize(enable)' when " 5643 "compiling with -Os/-Oz", 5644 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5645 return true; 5646 } 5647 5648 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5649 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5650 "runtime SCEV checks needed. Enable vectorization of this " 5651 "loop with '#pragma clang loop vectorize(enable)' when " 5652 "compiling with -Os/-Oz", 5653 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5654 return true; 5655 } 5656 5657 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5658 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5659 reportVectorizationFailure("Runtime stride check for small trip count", 5660 "runtime stride == 1 checks needed. Enable vectorization of " 5661 "this loop without such check by compiling with -Os/-Oz", 5662 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5663 return true; 5664 } 5665 5666 return false; 5667 } 5668 5669 ElementCount 5670 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5671 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5672 reportVectorizationInfo( 5673 "Disabling scalable vectorization, because target does not " 5674 "support scalable vectors.", 5675 "ScalableVectorsUnsupported", ORE, TheLoop); 5676 return ElementCount::getScalable(0); 5677 } 5678 5679 auto MaxScalableVF = ElementCount::getScalable( 5680 std::numeric_limits<ElementCount::ScalarTy>::max()); 5681 5682 // Disable scalable vectorization if the loop contains unsupported reductions. 
5683 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5684 // FIXME: While for scalable vectors this is currently sufficient, this should 5685 // be replaced by a more detailed mechanism that filters out specific VFs, 5686 // instead of invalidating vectorization for a whole set of VFs based on the 5687 // MaxVF. 5688 if (!canVectorizeReductions(MaxScalableVF)) { 5689 reportVectorizationInfo( 5690 "Scalable vectorization not supported for the reduction " 5691 "operations found in this loop.", 5692 "ScalableVFUnfeasible", ORE, TheLoop); 5693 return ElementCount::getScalable(0); 5694 } 5695 5696 if (Legal->isSafeForAnyVectorWidth()) 5697 return MaxScalableVF; 5698 5699 // Limit MaxScalableVF by the maximum safe dependence distance. 5700 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5701 MaxScalableVF = ElementCount::getScalable( 5702 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5703 if (!MaxScalableVF) 5704 reportVectorizationInfo( 5705 "Max legal vector width too small, scalable vectorization " 5706 "unfeasible.", 5707 "ScalableVFUnfeasible", ORE, TheLoop); 5708 5709 return MaxScalableVF; 5710 } 5711 5712 ElementCount 5713 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5714 ElementCount UserVF) { 5715 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5716 unsigned SmallestType, WidestType; 5717 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5718 5719 // Get the maximum safe dependence distance in bits computed by LAA. 5720 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5721 // the memory accesses that is most restrictive (involved in the smallest 5722 // dependence distance). 5723 unsigned MaxSafeElements = 5724 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5725 5726 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5727 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5728 5729 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5730 << ".\n"); 5731 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5732 << ".\n"); 5733 5734 // First analyze the UserVF, fall back if the UserVF should be ignored. 5735 if (UserVF) { 5736 auto MaxSafeUserVF = 5737 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5738 5739 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) 5740 return UserVF; 5741 5742 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5743 5744 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5745 // is better to ignore the hint and let the compiler choose a suitable VF. 5746 if (!UserVF.isScalable()) { 5747 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5748 << " is unsafe, clamping to max safe VF=" 5749 << MaxSafeFixedVF << ".\n"); 5750 ORE->emit([&]() { 5751 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5752 TheLoop->getStartLoc(), 5753 TheLoop->getHeader()) 5754 << "User-specified vectorization factor " 5755 << ore::NV("UserVectorizationFactor", UserVF) 5756 << " is unsafe, clamping to maximum safe vectorization factor " 5757 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5758 }); 5759 return MaxSafeFixedVF; 5760 } 5761 5762 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5763 << " is unsafe. 
Ignoring scalable UserVF.\n"); 5764 ORE->emit([&]() { 5765 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5766 TheLoop->getStartLoc(), 5767 TheLoop->getHeader()) 5768 << "User-specified vectorization factor " 5769 << ore::NV("UserVectorizationFactor", UserVF) 5770 << " is unsafe. Ignoring the hint to let the compiler pick a " 5771 "suitable VF."; 5772 }); 5773 } 5774 5775 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5776 << " / " << WidestType << " bits.\n"); 5777 5778 ElementCount MaxFixedVF = ElementCount::getFixed(1); 5779 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5780 WidestType, MaxSafeFixedVF)) 5781 MaxFixedVF = MaxVF; 5782 5783 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5784 WidestType, MaxSafeScalableVF)) 5785 // FIXME: Return scalable VF as well (to be added in future patch). 5786 if (MaxVF.isScalable()) 5787 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5788 << "\n"); 5789 5790 return MaxFixedVF; 5791 } 5792 5793 Optional<ElementCount> 5794 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5795 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5796 // TODO: It may by useful to do since it's still likely to be dynamically 5797 // uniform if the target can skip. 5798 reportVectorizationFailure( 5799 "Not inserting runtime ptr check for divergent target", 5800 "runtime pointer checks needed. Not enabled for divergent target", 5801 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5802 return None; 5803 } 5804 5805 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5806 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5807 if (TC == 1) { 5808 reportVectorizationFailure("Single iteration (non) loop", 5809 "loop trip count is one, irrelevant for vectorization", 5810 "SingleIterationLoop", ORE, TheLoop); 5811 return None; 5812 } 5813 5814 switch (ScalarEpilogueStatus) { 5815 case CM_ScalarEpilogueAllowed: 5816 return computeFeasibleMaxVF(TC, UserVF); 5817 case CM_ScalarEpilogueNotAllowedUsePredicate: 5818 LLVM_FALLTHROUGH; 5819 case CM_ScalarEpilogueNotNeededUsePredicate: 5820 LLVM_DEBUG( 5821 dbgs() << "LV: vector predicate hint/switch found.\n" 5822 << "LV: Not allowing scalar epilogue, creating predicated " 5823 << "vector loop.\n"); 5824 break; 5825 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5826 // fallthrough as a special case of OptForSize 5827 case CM_ScalarEpilogueNotAllowedOptSize: 5828 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5829 LLVM_DEBUG( 5830 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5831 else 5832 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5833 << "count.\n"); 5834 5835 // Bail if runtime checks are required, which are not good when optimising 5836 // for size. 5837 if (runtimeChecksRequired()) 5838 return None; 5839 5840 break; 5841 } 5842 5843 // The only loops we can vectorize without a scalar epilogue, are loops with 5844 // a bottom-test and a single exiting block. We'd have to handle the fact 5845 // that not every instruction executes on the last iteration. This will 5846 // require a lane mask which varies through the vector loop body. (TODO) 5847 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5848 // If there was a tail-folding hint/switch, but we can't fold the tail by 5849 // masking, fallback to a vectorization with a scalar epilogue. 
5850 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5851 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5852 "scalar epilogue instead.\n");
5853 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5854 return computeFeasibleMaxVF(TC, UserVF);
5855 }
5856 return None;
5857 }
5858
5859 // Now try to fold the tail by masking.
5860
5861 // Invalidate interleave groups that require an epilogue if we can't mask
5862 // the interleave-group.
5863 if (!useMaskedInterleavedAccesses(TTI)) {
5864 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5865 "No decisions should have been taken at this point");
5866 // Note: There is no need to invalidate any cost modeling decisions here, as
5867 // none were taken so far.
5868 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5869 }
5870
5871 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5872 assert(!MaxVF.isScalable() &&
5873 "Scalable vectors do not yet support tail folding");
5874 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5875 "MaxVF must be a power of 2");
5876 unsigned MaxVFtimesIC =
5877 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5878 // Avoid tail folding if the trip count is known to be a multiple of any VF we
5879 // choose.
5880 ScalarEvolution *SE = PSE.getSE();
5881 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5882 const SCEV *ExitCount = SE->getAddExpr(
5883 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5884 const SCEV *Rem = SE->getURemExpr(
5885 SE->applyLoopGuards(ExitCount, TheLoop),
5886 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5887 if (Rem->isZero()) {
5888 // Accept MaxVF if we do not have a tail.
5889 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5890 return MaxVF;
5891 }
5892
5893 // If we don't know the precise trip count, or if the trip count that we
5894 // found modulo the vectorization factor is not zero, try to fold the tail
5895 // by masking.
5896 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5897 if (Legal->prepareToFoldTailByMasking()) {
5898 FoldTailByMasking = true;
5899 return MaxVF;
5900 }
5901
5902 // If there was a tail-folding hint/switch, but we can't fold the tail by
5903 // masking, fall back to a vectorization with a scalar epilogue.
5904 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5905 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5906 "scalar epilogue instead.\n");
5907 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5908 return MaxVF;
5909 }
5910
5911 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5912 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5913 return None;
5914 }
5915
5916 if (TC == 0) {
5917 reportVectorizationFailure(
5918 "Unable to calculate the loop count due to complex control flow",
5919 "unable to calculate the loop count due to complex control flow",
5920 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5921 return None;
5922 }
5923
5924 reportVectorizationFailure(
5925 "Cannot optimize for size and vectorize at the same time.",
5926 "cannot optimize for size and vectorize at the same time.
" 5927 "Enable vectorization of this loop with '#pragma clang loop " 5928 "vectorize(enable)' when compiling with -Os/-Oz", 5929 "NoTailLoopWithOptForSize", ORE, TheLoop); 5930 return None; 5931 } 5932 5933 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5934 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5935 const ElementCount &MaxSafeVF) { 5936 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5937 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5938 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5939 : TargetTransformInfo::RGK_FixedWidthVector); 5940 5941 // Convenience function to return the minimum of two ElementCounts. 5942 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5943 assert((LHS.isScalable() == RHS.isScalable()) && 5944 "Scalable flags must match"); 5945 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5946 }; 5947 5948 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5949 // Note that both WidestRegister and WidestType may not be a powers of 2. 5950 auto MaxVectorElementCount = ElementCount::get( 5951 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5952 ComputeScalableMaxVF); 5953 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5954 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5955 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5956 5957 if (!MaxVectorElementCount) { 5958 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5959 return ElementCount::getFixed(1); 5960 } 5961 5962 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5963 if (ConstTripCount && 5964 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5965 isPowerOf2_32(ConstTripCount)) { 5966 // We need to clamp the VF to be the ConstTripCount. There is no point in 5967 // choosing a higher viable VF as done in the loop below. If 5968 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5969 // the TC is less than or equal to the known number of lanes. 5970 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5971 << ConstTripCount << "\n"); 5972 return TripCountEC; 5973 } 5974 5975 ElementCount MaxVF = MaxVectorElementCount; 5976 if (TTI.shouldMaximizeVectorBandwidth() || 5977 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5978 auto MaxVectorElementCountMaxBW = ElementCount::get( 5979 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5980 ComputeScalableMaxVF); 5981 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5982 5983 // Collect all viable vectorization factors larger than the default MaxVF 5984 // (i.e. MaxVectorElementCount). 5985 SmallVector<ElementCount, 8> VFs; 5986 for (ElementCount VS = MaxVectorElementCount * 2; 5987 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5988 VFs.push_back(VS); 5989 5990 // For each VF calculate its register usage. 5991 auto RUs = calculateRegisterUsage(VFs); 5992 5993 // Select the largest VF which doesn't require more registers than existing 5994 // ones. 
5995 for (int i = RUs.size() - 1; i >= 0; --i) { 5996 bool Selected = true; 5997 for (auto &pair : RUs[i].MaxLocalUsers) { 5998 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5999 if (pair.second > TargetNumRegisters) 6000 Selected = false; 6001 } 6002 if (Selected) { 6003 MaxVF = VFs[i]; 6004 break; 6005 } 6006 } 6007 if (ElementCount MinVF = 6008 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 6009 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 6010 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 6011 << ") with target's minimum: " << MinVF << '\n'); 6012 MaxVF = MinVF; 6013 } 6014 } 6015 } 6016 return MaxVF; 6017 } 6018 6019 bool LoopVectorizationCostModel::isMoreProfitable( 6020 const VectorizationFactor &A, const VectorizationFactor &B) const { 6021 InstructionCost::CostType CostA = *A.Cost.getValue(); 6022 InstructionCost::CostType CostB = *B.Cost.getValue(); 6023 6024 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 6025 6026 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 6027 MaxTripCount) { 6028 // If we are folding the tail and the trip count is a known (possibly small) 6029 // constant, the trip count will be rounded up to an integer number of 6030 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 6031 // which we compare directly. When not folding the tail, the total cost will 6032 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 6033 // approximated with the per-lane cost below instead of using the tripcount 6034 // as here. 6035 int64_t RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6036 int64_t RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6037 return RTCostA < RTCostB; 6038 } 6039 6040 // To avoid the need for FP division: 6041 // (CostA / A.Width) < (CostB / B.Width) 6042 // <=> (CostA * B.Width) < (CostB * A.Width) 6043 return (CostA * B.Width.getKnownMinValue()) < 6044 (CostB * A.Width.getKnownMinValue()); 6045 } 6046 6047 VectorizationFactor 6048 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 6049 // FIXME: This can be fixed for scalable vectors later, because at this stage 6050 // the LoopVectorizer will only consider vectorizing a loop with scalable 6051 // vectors when the loop has a hint to enable vectorization for a given VF. 6052 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 6053 6054 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6055 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6056 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6057 6058 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6059 VectorizationFactor ChosenFactor = ScalarCost; 6060 6061 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6062 if (ForceVectorization && MaxVF.isVector()) { 6063 // Ignore scalar width, because the user explicitly wants vectorization. 6064 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6065 // evaluation. 6066 ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max(); 6067 } 6068 6069 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 6070 i *= 2) { 6071 // Notice that the vector loop needs to be executed less times, so 6072 // we need to divide the cost of the vector loops by the width of 6073 // the vector elements. 
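// E.g. a per-iteration cost of 8 at VF 4 corresponds to a per-lane cost of 2;
// that per-lane quantity is what isMoreProfitable() effectively compares.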
6074 VectorizationCostTy C = expectedCost(i); 6075 6076 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 6077 VectorizationFactor Candidate(i, C.first); 6078 LLVM_DEBUG( 6079 dbgs() << "LV: Vector loop of width " << i << " costs: " 6080 << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue()) 6081 << ".\n"); 6082 6083 if (!C.second && !ForceVectorization) { 6084 LLVM_DEBUG( 6085 dbgs() << "LV: Not considering vector loop of width " << i 6086 << " because it will not generate any vector instructions.\n"); 6087 continue; 6088 } 6089 6090 // If profitable add it to ProfitableVF list. 6091 if (isMoreProfitable(Candidate, ScalarCost)) 6092 ProfitableVFs.push_back(Candidate); 6093 6094 if (isMoreProfitable(Candidate, ChosenFactor)) 6095 ChosenFactor = Candidate; 6096 } 6097 6098 if (!EnableCondStoresVectorization && NumPredStores) { 6099 reportVectorizationFailure("There are conditional stores.", 6100 "store that is conditionally executed prevents vectorization", 6101 "ConditionalStore", ORE, TheLoop); 6102 ChosenFactor = ScalarCost; 6103 } 6104 6105 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6106 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue()) 6107 dbgs() 6108 << "LV: Vectorization seems to be not beneficial, " 6109 << "but was forced by a user.\n"); 6110 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6111 return ChosenFactor; 6112 } 6113 6114 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6115 const Loop &L, ElementCount VF) const { 6116 // Cross iteration phis such as reductions need special handling and are 6117 // currently unsupported. 6118 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6119 return Legal->isFirstOrderRecurrence(&Phi) || 6120 Legal->isReductionVariable(&Phi); 6121 })) 6122 return false; 6123 6124 // Phis with uses outside of the loop require special handling and are 6125 // currently unsupported. 6126 for (auto &Entry : Legal->getInductionVars()) { 6127 // Look for uses of the value of the induction at the last iteration. 6128 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6129 for (User *U : PostInc->users()) 6130 if (!L.contains(cast<Instruction>(U))) 6131 return false; 6132 // Look for uses of penultimate value of the induction. 6133 for (User *U : Entry.first->users()) 6134 if (!L.contains(cast<Instruction>(U))) 6135 return false; 6136 } 6137 6138 // Induction variables that are widened require special handling that is 6139 // currently not supported. 6140 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6141 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6142 this->isProfitableToScalarize(Entry.first, VF)); 6143 })) 6144 return false; 6145 6146 return true; 6147 } 6148 6149 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6150 const ElementCount VF) const { 6151 // FIXME: We need a much better cost-model to take different parameters such 6152 // as register pressure, code size increase and cost of extra branches into 6153 // account. For now we apply a very crude heuristic and only consider loops 6154 // with vectorization factors larger than a certain value. 6155 // We also consider epilogue vectorization unprofitable for targets that don't 6156 // consider interleaving beneficial (eg. MVE). 
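// A maximum interleave factor of 1 is used below as a proxy for such targets.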
6157 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6158 return false; 6159 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6160 return true; 6161 return false; 6162 } 6163 6164 VectorizationFactor 6165 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6166 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6167 VectorizationFactor Result = VectorizationFactor::Disabled(); 6168 if (!EnableEpilogueVectorization) { 6169 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6170 return Result; 6171 } 6172 6173 if (!isScalarEpilogueAllowed()) { 6174 LLVM_DEBUG( 6175 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6176 "allowed.\n";); 6177 return Result; 6178 } 6179 6180 // FIXME: This can be fixed for scalable vectors later, because at this stage 6181 // the LoopVectorizer will only consider vectorizing a loop with scalable 6182 // vectors when the loop has a hint to enable vectorization for a given VF. 6183 if (MainLoopVF.isScalable()) { 6184 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6185 "yet supported.\n"); 6186 return Result; 6187 } 6188 6189 // Not really a cost consideration, but check for unsupported cases here to 6190 // simplify the logic. 6191 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6192 LLVM_DEBUG( 6193 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6194 "not a supported candidate.\n";); 6195 return Result; 6196 } 6197 6198 if (EpilogueVectorizationForceVF > 1) { 6199 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6200 if (LVP.hasPlanWithVFs( 6201 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6202 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6203 else { 6204 LLVM_DEBUG( 6205 dbgs() 6206 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6207 return Result; 6208 } 6209 } 6210 6211 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6212 TheLoop->getHeader()->getParent()->hasMinSize()) { 6213 LLVM_DEBUG( 6214 dbgs() 6215 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6216 return Result; 6217 } 6218 6219 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6220 return Result; 6221 6222 for (auto &NextVF : ProfitableVFs) 6223 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6224 (Result.Width.getFixedValue() == 1 || 6225 isMoreProfitable(NextVF, Result)) && 6226 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6227 Result = NextVF; 6228 6229 if (Result != VectorizationFactor::Disabled()) 6230 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6231 << Result.Width.getFixedValue() << "\n";); 6232 return Result; 6233 } 6234 6235 std::pair<unsigned, unsigned> 6236 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6237 unsigned MinWidth = -1U; 6238 unsigned MaxWidth = 8; 6239 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6240 6241 // For each block. 6242 for (BasicBlock *BB : TheLoop->blocks()) { 6243 // For each instruction in the loop. 6244 for (Instruction &I : BB->instructionsWithoutDebug()) { 6245 Type *T = I.getType(); 6246 6247 // Skip ignored values. 6248 if (ValuesToIgnore.count(&I)) 6249 continue; 6250 6251 // Only examine Loads, Stores and PHINodes. 6252 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6253 continue; 6254 6255 // Examine PHI nodes that are reduction variables. Update the type to 6256 // account for the recurrence type. 
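// Reduction PHIs that will be kept in-loop are skipped below and therefore do
// not contribute to the smallest/widest type computation.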
6257 if (auto *PN = dyn_cast<PHINode>(&I)) { 6258 if (!Legal->isReductionVariable(PN)) 6259 continue; 6260 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6261 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6262 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6263 RdxDesc.getRecurrenceType(), 6264 TargetTransformInfo::ReductionFlags())) 6265 continue; 6266 T = RdxDesc.getRecurrenceType(); 6267 } 6268 6269 // Examine the stored values. 6270 if (auto *ST = dyn_cast<StoreInst>(&I)) 6271 T = ST->getValueOperand()->getType(); 6272 6273 // Ignore loaded pointer types and stored pointer types that are not 6274 // vectorizable. 6275 // 6276 // FIXME: The check here attempts to predict whether a load or store will 6277 // be vectorized. We only know this for certain after a VF has 6278 // been selected. Here, we assume that if an access can be 6279 // vectorized, it will be. We should also look at extending this 6280 // optimization to non-pointer types. 6281 // 6282 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6283 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6284 continue; 6285 6286 MinWidth = std::min(MinWidth, 6287 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6288 MaxWidth = std::max(MaxWidth, 6289 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6290 } 6291 } 6292 6293 return {MinWidth, MaxWidth}; 6294 } 6295 6296 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6297 unsigned LoopCost) { 6298 // -- The interleave heuristics -- 6299 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6300 // There are many micro-architectural considerations that we can't predict 6301 // at this level. For example, frontend pressure (on decode or fetch) due to 6302 // code size, or the number and capabilities of the execution ports. 6303 // 6304 // We use the following heuristics to select the interleave count: 6305 // 1. If the code has reductions, then we interleave to break the cross 6306 // iteration dependency. 6307 // 2. If the loop is really small, then we interleave to reduce the loop 6308 // overhead. 6309 // 3. We don't interleave if we think that we will spill registers to memory 6310 // due to the increased register pressure. 6311 6312 if (!isScalarEpilogueAllowed()) 6313 return 1; 6314 6315 // We used the distance for the interleave count. 6316 if (Legal->getMaxSafeDepDistBytes() != -1U) 6317 return 1; 6318 6319 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6320 const bool HasReductions = !Legal->getReductionVars().empty(); 6321 // Do not interleave loops with a relatively small known or estimated trip 6322 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6323 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6324 // because with the above conditions interleaving can expose ILP and break 6325 // cross iteration dependences for reductions. 6326 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6327 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6328 return 1; 6329 6330 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6331 // We divide by these constants so assume that we have at least one 6332 // instruction that uses at least one register. 6333 for (auto& pair : R.MaxLocalUsers) { 6334 pair.second = std::max(pair.second, 1U); 6335 } 6336 6337 // We calculate the interleave count using the following formula. 
6338 // Subtract the number of loop invariants from the number of available 6339 // registers. These registers are used by all of the interleaved instances. 6340 // Next, divide the remaining registers by the number of registers that is 6341 // required by the loop, in order to estimate how many parallel instances 6342 // fit without causing spills. All of this is rounded down if necessary to be 6343 // a power of two. We want power of two interleave count to simplify any 6344 // addressing operations or alignment considerations. 6345 // We also want power of two interleave counts to ensure that the induction 6346 // variable of the vector loop wraps to zero, when tail is folded by masking; 6347 // this currently happens when OptForSize, in which case IC is set to 1 above. 6348 unsigned IC = UINT_MAX; 6349 6350 for (auto& pair : R.MaxLocalUsers) { 6351 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6352 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6353 << " registers of " 6354 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6355 if (VF.isScalar()) { 6356 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6357 TargetNumRegisters = ForceTargetNumScalarRegs; 6358 } else { 6359 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6360 TargetNumRegisters = ForceTargetNumVectorRegs; 6361 } 6362 unsigned MaxLocalUsers = pair.second; 6363 unsigned LoopInvariantRegs = 0; 6364 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6365 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6366 6367 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6368 // Don't count the induction variable as interleaved. 6369 if (EnableIndVarRegisterHeur) { 6370 TmpIC = 6371 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6372 std::max(1U, (MaxLocalUsers - 1))); 6373 } 6374 6375 IC = std::min(IC, TmpIC); 6376 } 6377 6378 // Clamp the interleave ranges to reasonable counts. 6379 unsigned MaxInterleaveCount = 6380 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6381 6382 // Check if the user has overridden the max. 6383 if (VF.isScalar()) { 6384 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6385 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6386 } else { 6387 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6388 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6389 } 6390 6391 // If trip count is known or estimated compile time constant, limit the 6392 // interleave count to be less than the trip count divided by VF, provided it 6393 // is at least 1. 6394 // 6395 // For scalable vectors we can't know if interleaving is beneficial. It may 6396 // not be beneficial for small loops if none of the lanes in the second vector 6397 // iterations is enabled. However, for larger loops, there is likely to be a 6398 // similar benefit as for fixed-width vectors. For now, we choose to leave 6399 // the InterleaveCount as if vscale is '1', although if some information about 6400 // the vector is known (e.g. min vector size), we can make a better decision. 6401 if (BestKnownTC) { 6402 MaxInterleaveCount = 6403 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6404 // Make sure MaxInterleaveCount is greater than 0. 
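// E.g. a best known trip count of 4 with VF 8 makes the division above zero,
// so the interleave count is clamped back up to 1 here.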
6405 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6406 } 6407 6408 assert(MaxInterleaveCount > 0 && 6409 "Maximum interleave count must be greater than 0"); 6410 6411 // Clamp the calculated IC to be between the 1 and the max interleave count 6412 // that the target and trip count allows. 6413 if (IC > MaxInterleaveCount) 6414 IC = MaxInterleaveCount; 6415 else 6416 // Make sure IC is greater than 0. 6417 IC = std::max(1u, IC); 6418 6419 assert(IC > 0 && "Interleave count must be greater than 0."); 6420 6421 // If we did not calculate the cost for VF (because the user selected the VF) 6422 // then we calculate the cost of VF here. 6423 if (LoopCost == 0) { 6424 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6425 LoopCost = *expectedCost(VF).first.getValue(); 6426 } 6427 6428 assert(LoopCost && "Non-zero loop cost expected"); 6429 6430 // Interleave if we vectorized this loop and there is a reduction that could 6431 // benefit from interleaving. 6432 if (VF.isVector() && HasReductions) { 6433 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6434 return IC; 6435 } 6436 6437 // Note that if we've already vectorized the loop we will have done the 6438 // runtime check and so interleaving won't require further checks. 6439 bool InterleavingRequiresRuntimePointerCheck = 6440 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6441 6442 // We want to interleave small loops in order to reduce the loop overhead and 6443 // potentially expose ILP opportunities. 6444 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6445 << "LV: IC is " << IC << '\n' 6446 << "LV: VF is " << VF << '\n'); 6447 const bool AggressivelyInterleaveReductions = 6448 TTI.enableAggressiveInterleaving(HasReductions); 6449 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6450 // We assume that the cost overhead is 1 and we use the cost model 6451 // to estimate the cost of the loop and interleave until the cost of the 6452 // loop overhead is about 5% of the cost of the loop. 6453 unsigned SmallIC = 6454 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6455 6456 // Interleave until store/load ports (estimated by max interleave count) are 6457 // saturated. 6458 unsigned NumStores = Legal->getNumStores(); 6459 unsigned NumLoads = Legal->getNumLoads(); 6460 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6461 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6462 6463 // If we have a scalar reduction (vector reductions are already dealt with 6464 // by this point), we can increase the critical path length if the loop 6465 // we're interleaving is inside another loop. Limit, by default to 2, so the 6466 // critical path only gets increased by one reduction operation. 6467 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6468 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6469 SmallIC = std::min(SmallIC, F); 6470 StoresIC = std::min(StoresIC, F); 6471 LoadsIC = std::min(LoadsIC, F); 6472 } 6473 6474 if (EnableLoadStoreRuntimeInterleave && 6475 std::max(StoresIC, LoadsIC) > SmallIC) { 6476 LLVM_DEBUG( 6477 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6478 return std::max(StoresIC, LoadsIC); 6479 } 6480 6481 // If there are scalar reductions and TTI has enabled aggressive 6482 // interleaving for reductions, we will interleave to expose ILP. 
6483 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6484 AggressivelyInterleaveReductions) {
6485 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6486 // Interleave no less than SmallIC but not as aggressively as the normal IC
6487 // to satisfy the rare situation when resources are too limited.
6488 return std::max(IC / 2, SmallIC);
6489 } else {
6490 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6491 return SmallIC;
6492 }
6493 }
6494
6495 // Interleave if this is a large loop (small loops are already dealt with by
6496 // this point) that could benefit from interleaving.
6497 if (AggressivelyInterleaveReductions) {
6498 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6499 return IC;
6500 }
6501
6502 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6503 return 1;
6504 }
6505
6506 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6507 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6508 // This function calculates the register usage by measuring the highest number
6509 // of values that are alive at a single location. Obviously, this is a very
6510 // rough estimation. We scan the loop in topological order and assign a
6511 // number to each instruction. We use RPO to ensure that defs are
6512 // met before their users. We assume that each instruction that has in-loop
6513 // users starts an interval. We record every time that an in-loop value is
6514 // used, so we have a list of the first and last occurrences of each
6515 // instruction. Next, we transpose this data structure into a multi-map that
6516 // holds the list of intervals that *end* at a specific location. This multi-map
6517 // allows us to perform a linear search. We scan the instructions linearly
6518 // and record each time that a new interval starts, by placing it in a set.
6519 // If we find this value in the multi-map then we remove it from the set.
6520 // The max register usage is the maximum size of the set.
6521 // We also search for instructions that are defined outside the loop, but are
6522 // used inside the loop. We need this number separately from the max-interval
6523 // usage number because, when we unroll, loop-invariant values do not take
6524 // more registers.
6525 LoopBlocksDFS DFS(TheLoop);
6526 DFS.perform(LI);
6527
6528 RegisterUsage RU;
6529
6530 // Each 'key' in the map opens a new interval. The values
6531 // of the map are the index of the 'last seen' usage of the
6532 // instruction that is the key.
6533 using IntervalMap = DenseMap<Instruction *, unsigned>;
6534
6535 // Maps instruction to its index.
6536 SmallVector<Instruction *, 64> IdxToInstr;
6537 // Marks the end of each interval.
6538 IntervalMap EndPoint;
6539 // Saves the set of instructions that are used in the loop.
6540 SmallPtrSet<Instruction *, 8> Ends;
6541 // Saves the list of values that are used in the loop but are
6542 // defined outside the loop, such as arguments and constants.
6543 SmallPtrSet<Value *, 8> LoopInvariants;
6544
6545 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6546 for (Instruction &I : BB->instructionsWithoutDebug()) {
6547 IdxToInstr.push_back(&I);
6548
6549 // Save the end location of each USE.
6550 for (Value *U : I.operands()) {
6551 auto *Instr = dyn_cast<Instruction>(U);
6552
6553 // Ignore non-instruction values such as arguments, constants, etc.
6554 if (!Instr)
6555 continue;
6556
6557 // If this instruction is outside the loop then record it and continue.
6558 if (!TheLoop->contains(Instr)) { 6559 LoopInvariants.insert(Instr); 6560 continue; 6561 } 6562 6563 // Overwrite previous end points. 6564 EndPoint[Instr] = IdxToInstr.size(); 6565 Ends.insert(Instr); 6566 } 6567 } 6568 } 6569 6570 // Saves the list of intervals that end with the index in 'key'. 6571 using InstrList = SmallVector<Instruction *, 2>; 6572 DenseMap<unsigned, InstrList> TransposeEnds; 6573 6574 // Transpose the EndPoints to a list of values that end at each index. 6575 for (auto &Interval : EndPoint) 6576 TransposeEnds[Interval.second].push_back(Interval.first); 6577 6578 SmallPtrSet<Instruction *, 8> OpenIntervals; 6579 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6580 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6581 6582 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6583 6584 // A lambda that gets the register usage for the given type and VF. 6585 const auto &TTICapture = TTI; 6586 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6587 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6588 return 0U; 6589 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6590 }; 6591 6592 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6593 Instruction *I = IdxToInstr[i]; 6594 6595 // Remove all of the instructions that end at this location. 6596 InstrList &List = TransposeEnds[i]; 6597 for (Instruction *ToRemove : List) 6598 OpenIntervals.erase(ToRemove); 6599 6600 // Ignore instructions that are never used within the loop. 6601 if (!Ends.count(I)) 6602 continue; 6603 6604 // Skip ignored values. 6605 if (ValuesToIgnore.count(I)) 6606 continue; 6607 6608 // For each VF find the maximum usage of registers. 6609 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6610 // Count the number of live intervals. 6611 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6612 6613 if (VFs[j].isScalar()) { 6614 for (auto Inst : OpenIntervals) { 6615 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6616 if (RegUsage.find(ClassID) == RegUsage.end()) 6617 RegUsage[ClassID] = 1; 6618 else 6619 RegUsage[ClassID] += 1; 6620 } 6621 } else { 6622 collectUniformsAndScalars(VFs[j]); 6623 for (auto Inst : OpenIntervals) { 6624 // Skip ignored values for VF > 1. 6625 if (VecValuesToIgnore.count(Inst)) 6626 continue; 6627 if (isScalarAfterVectorization(Inst, VFs[j])) { 6628 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6629 if (RegUsage.find(ClassID) == RegUsage.end()) 6630 RegUsage[ClassID] = 1; 6631 else 6632 RegUsage[ClassID] += 1; 6633 } else { 6634 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6635 if (RegUsage.find(ClassID) == RegUsage.end()) 6636 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6637 else 6638 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6639 } 6640 } 6641 } 6642 6643 for (auto& pair : RegUsage) { 6644 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6645 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6646 else 6647 MaxUsages[j][pair.first] = pair.second; 6648 } 6649 } 6650 6651 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6652 << OpenIntervals.size() << '\n'); 6653 6654 // Add the current instruction to the list of open intervals. 
6655 OpenIntervals.insert(I);
6656 }
6657
6658 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6659 SmallMapVector<unsigned, unsigned, 4> Invariant;
6660
6661 for (auto Inst : LoopInvariants) {
6662 unsigned Usage =
6663 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6664 unsigned ClassID =
6665 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6666 if (Invariant.find(ClassID) == Invariant.end())
6667 Invariant[ClassID] = Usage;
6668 else
6669 Invariant[ClassID] += Usage;
6670 }
6671
6672 LLVM_DEBUG({
6673 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6674 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6675 << " item\n";
6676 for (const auto &pair : MaxUsages[i]) {
6677 dbgs() << "LV(REG): RegisterClass: "
6678 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6679 << " registers\n";
6680 }
6681 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6682 << " item\n";
6683 for (const auto &pair : Invariant) {
6684 dbgs() << "LV(REG): RegisterClass: "
6685 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6686 << " registers\n";
6687 }
6688 });
6689
6690 RU.LoopInvariantRegs = Invariant;
6691 RU.MaxLocalUsers = MaxUsages[i];
6692 RUs[i] = RU;
6693 }
6694
6695 return RUs;
6696 }
6697
6698 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6699 // TODO: Cost model for emulated masked load/store is completely
6700 // broken. This hack guides the cost model to use an artificially
6701 // high enough value to practically disable vectorization with such
6702 // operations, except where the previously deployed legality hack allowed
6703 // using very low cost values. This is to avoid regressions coming simply
6704 // from moving the "masked load/store" check from legality to the cost model.
6705 // Masked Load/Gather emulation was previously never allowed.
6706 // A limited amount of Masked Store/Scatter emulation was allowed.
6707 assert(isPredicatedInst(I) &&
6708 "Expecting a scalar emulated instruction");
6709 return isa<LoadInst>(I) ||
6710 (isa<StoreInst>(I) &&
6711 NumPredStores > NumberOfStoresToPredicate);
6712 }
6713
6714 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6715 // If we aren't vectorizing the loop, or if we've already collected the
6716 // instructions to scalarize, there's nothing to do. Collection may already
6717 // have occurred if we have a user-selected VF and are now computing the
6718 // expected cost for interleaving.
6719 if (VF.isScalar() || VF.isZero() ||
6720 InstsToScalarize.find(VF) != InstsToScalarize.end())
6721 return;
6722
6723 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6724 // not profitable to scalarize any instructions, the presence of VF in the
6725 // map will indicate that we've analyzed it already.
6726 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6727
6728 // Find all the instructions that are scalar with predication in the loop and
6729 // determine if it would be better to not if-convert the blocks they are in.
6730 // If so, we also record the instructions to scalarize.
6731 for (BasicBlock *BB : TheLoop->blocks()) {
6732 if (!blockNeedsPredication(BB))
6733 continue;
6734 for (Instruction &I : *BB)
6735 if (isScalarWithPredication(&I)) {
6736 ScalarCostsTy ScalarCosts;
6737 // Do not apply discount logic if hacked cost is needed
6738 // for emulated masked memrefs.
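// A non-negative discount from computePredInstDiscount() means the scalarized
// chain is expected to be no more expensive than its vectorized form.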
6739 if (!useEmulatedMaskMemRefHack(&I) && 6740 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6741 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6742 // Remember that BB will remain after vectorization. 6743 PredicatedBBsAfterVectorization.insert(BB); 6744 } 6745 } 6746 } 6747 6748 int LoopVectorizationCostModel::computePredInstDiscount( 6749 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6750 assert(!isUniformAfterVectorization(PredInst, VF) && 6751 "Instruction marked uniform-after-vectorization will be predicated"); 6752 6753 // Initialize the discount to zero, meaning that the scalar version and the 6754 // vector version cost the same. 6755 InstructionCost Discount = 0; 6756 6757 // Holds instructions to analyze. The instructions we visit are mapped in 6758 // ScalarCosts. Those instructions are the ones that would be scalarized if 6759 // we find that the scalar version costs less. 6760 SmallVector<Instruction *, 8> Worklist; 6761 6762 // Returns true if the given instruction can be scalarized. 6763 auto canBeScalarized = [&](Instruction *I) -> bool { 6764 // We only attempt to scalarize instructions forming a single-use chain 6765 // from the original predicated block that would otherwise be vectorized. 6766 // Although not strictly necessary, we give up on instructions we know will 6767 // already be scalar to avoid traversing chains that are unlikely to be 6768 // beneficial. 6769 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6770 isScalarAfterVectorization(I, VF)) 6771 return false; 6772 6773 // If the instruction is scalar with predication, it will be analyzed 6774 // separately. We ignore it within the context of PredInst. 6775 if (isScalarWithPredication(I)) 6776 return false; 6777 6778 // If any of the instruction's operands are uniform after vectorization, 6779 // the instruction cannot be scalarized. This prevents, for example, a 6780 // masked load from being scalarized. 6781 // 6782 // We assume we will only emit a value for lane zero of an instruction 6783 // marked uniform after vectorization, rather than VF identical values. 6784 // Thus, if we scalarize an instruction that uses a uniform, we would 6785 // create uses of values corresponding to the lanes we aren't emitting code 6786 // for. This behavior can be changed by allowing getScalarValue to clone 6787 // the lane zero values for uniforms rather than asserting. 6788 for (Use &U : I->operands()) 6789 if (auto *J = dyn_cast<Instruction>(U.get())) 6790 if (isUniformAfterVectorization(J, VF)) 6791 return false; 6792 6793 // Otherwise, we can scalarize the instruction. 6794 return true; 6795 }; 6796 6797 // Compute the expected cost discount from scalarizing the entire expression 6798 // feeding the predicated instruction. We currently only consider expressions 6799 // that are single-use instruction chains. 6800 Worklist.push_back(PredInst); 6801 while (!Worklist.empty()) { 6802 Instruction *I = Worklist.pop_back_val(); 6803 6804 // If we've already analyzed the instruction, there's nothing to do. 6805 if (ScalarCosts.find(I) != ScalarCosts.end()) 6806 continue; 6807 6808 // Compute the cost of the vector instruction. Note that this cost already 6809 // includes the scalarization overhead of the predicated instruction. 6810 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6811 6812 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6813 // the instruction as if it wasn't if-converted and instead remained in the 6814 // predicated block. We will scale this cost by block probability after 6815 // computing the scalarization overhead. 6816 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6817 InstructionCost ScalarCost = 6818 VF.getKnownMinValue() * 6819 getInstructionCost(I, ElementCount::getFixed(1)).first; 6820 6821 // Compute the scalarization overhead of needed insertelement instructions 6822 // and phi nodes. 6823 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6824 ScalarCost += TTI.getScalarizationOverhead( 6825 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6826 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6827 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6828 ScalarCost += 6829 VF.getKnownMinValue() * 6830 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6831 } 6832 6833 // Compute the scalarization overhead of needed extractelement 6834 // instructions. For each of the instruction's operands, if the operand can 6835 // be scalarized, add it to the worklist; otherwise, account for the 6836 // overhead. 6837 for (Use &U : I->operands()) 6838 if (auto *J = dyn_cast<Instruction>(U.get())) { 6839 assert(VectorType::isValidElementType(J->getType()) && 6840 "Instruction has non-scalar type"); 6841 if (canBeScalarized(J)) 6842 Worklist.push_back(J); 6843 else if (needsExtract(J, VF)) { 6844 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6845 ScalarCost += TTI.getScalarizationOverhead( 6846 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6847 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6848 } 6849 } 6850 6851 // Scale the total scalar cost by block probability. 6852 ScalarCost /= getReciprocalPredBlockProb(); 6853 6854 // Compute the discount. A non-negative discount means the vector version 6855 // of the instruction costs more, and scalarizing would be beneficial. 6856 Discount += VectorCost - ScalarCost; 6857 ScalarCosts[I] = ScalarCost; 6858 } 6859 6860 return *Discount.getValue(); 6861 } 6862 6863 LoopVectorizationCostModel::VectorizationCostTy 6864 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6865 VectorizationCostTy Cost; 6866 6867 // For each block. 6868 for (BasicBlock *BB : TheLoop->blocks()) { 6869 VectorizationCostTy BlockCost; 6870 6871 // For each instruction in the old loop. 6872 for (Instruction &I : BB->instructionsWithoutDebug()) { 6873 // Skip ignored values. 6874 if (ValuesToIgnore.count(&I) || 6875 (VF.isVector() && VecValuesToIgnore.count(&I))) 6876 continue; 6877 6878 VectorizationCostTy C = getInstructionCost(&I, VF); 6879 6880 // Check if we should override the cost. 6881 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6882 C.first = InstructionCost(ForceTargetInstructionCost); 6883 6884 BlockCost.first += C.first; 6885 BlockCost.second |= C.second; 6886 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6887 << " for VF " << VF << " For instruction: " << I 6888 << '\n'); 6889 } 6890 6891 // If we are vectorizing a predicated block, it will have been 6892 // if-converted. This means that the block's instructions (aside from 6893 // stores and instructions that may divide by zero) will now be 6894 // unconditionally executed. For the scalar case, we may not always execute 6895 // the predicated block, if it is an if-else block. Thus, scale the block's 6896 // cost by the probability of executing it. 
blockNeedsPredication from 6897 // Legal is used so as to not include all blocks in tail folded loops. 6898 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6899 BlockCost.first /= getReciprocalPredBlockProb(); 6900 6901 Cost.first += BlockCost.first; 6902 Cost.second |= BlockCost.second; 6903 } 6904 6905 return Cost; 6906 } 6907 6908 /// Gets Address Access SCEV after verifying that the access pattern 6909 /// is loop invariant except the induction variable dependence. 6910 /// 6911 /// This SCEV can be sent to the Target in order to estimate the address 6912 /// calculation cost. 6913 static const SCEV *getAddressAccessSCEV( 6914 Value *Ptr, 6915 LoopVectorizationLegality *Legal, 6916 PredicatedScalarEvolution &PSE, 6917 const Loop *TheLoop) { 6918 6919 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6920 if (!Gep) 6921 return nullptr; 6922 6923 // We are looking for a gep with all loop invariant indices except for one 6924 // which should be an induction variable. 6925 auto SE = PSE.getSE(); 6926 unsigned NumOperands = Gep->getNumOperands(); 6927 for (unsigned i = 1; i < NumOperands; ++i) { 6928 Value *Opd = Gep->getOperand(i); 6929 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6930 !Legal->isInductionVariable(Opd)) 6931 return nullptr; 6932 } 6933 6934 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6935 return PSE.getSCEV(Ptr); 6936 } 6937 6938 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6939 return Legal->hasStride(I->getOperand(0)) || 6940 Legal->hasStride(I->getOperand(1)); 6941 } 6942 6943 InstructionCost 6944 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6945 ElementCount VF) { 6946 assert(VF.isVector() && 6947 "Scalarization cost of instruction implies vectorization."); 6948 if (VF.isScalable()) 6949 return InstructionCost::getInvalid(); 6950 6951 Type *ValTy = getMemInstValueType(I); 6952 auto SE = PSE.getSE(); 6953 6954 unsigned AS = getLoadStoreAddressSpace(I); 6955 Value *Ptr = getLoadStorePointerOperand(I); 6956 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6957 6958 // Figure out whether the access is strided and get the stride value 6959 // if it's known in compile time 6960 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6961 6962 // Get the cost of the scalar memory instruction and address computation. 6963 InstructionCost Cost = 6964 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6965 6966 // Don't pass *I here, since it is scalar but will actually be part of a 6967 // vectorized loop where the user of it is a vectorized instruction. 6968 const Align Alignment = getLoadStoreAlignment(I); 6969 Cost += VF.getKnownMinValue() * 6970 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6971 AS, TTI::TCK_RecipThroughput); 6972 6973 // Get the overhead of the extractelement and insertelement instructions 6974 // we might create due to scalarization. 6975 Cost += getScalarizationOverhead(I, VF); 6976 6977 // If we have a predicated load/store, it will need extra i1 extracts and 6978 // conditional branches, but may not be executed for each vector lane. Scale 6979 // the cost by the probability of executing the predicated block. 
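  // Worked example (illustrative numbers only, assuming the usual reciprocal
  // block probability of 2): a scalarized predicated store whose cost so far
  // is 8 is first scaled down to 4, and the per-lane i1 extract and branch
  // costs below are then added on top of that.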
6980 if (isPredicatedInst(I)) { 6981 Cost /= getReciprocalPredBlockProb(); 6982 6983 // Add the cost of an i1 extract and a branch 6984 auto *Vec_i1Ty = 6985 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6986 Cost += TTI.getScalarizationOverhead( 6987 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6988 /*Insert=*/false, /*Extract=*/true); 6989 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6990 6991 if (useEmulatedMaskMemRefHack(I)) 6992 // Artificially setting to a high enough value to practically disable 6993 // vectorization with such operations. 6994 Cost = 3000000; 6995 } 6996 6997 return Cost; 6998 } 6999 7000 InstructionCost 7001 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 7002 ElementCount VF) { 7003 Type *ValTy = getMemInstValueType(I); 7004 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7005 Value *Ptr = getLoadStorePointerOperand(I); 7006 unsigned AS = getLoadStoreAddressSpace(I); 7007 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 7008 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7009 7010 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7011 "Stride should be 1 or -1 for consecutive memory access"); 7012 const Align Alignment = getLoadStoreAlignment(I); 7013 InstructionCost Cost = 0; 7014 if (Legal->isMaskRequired(I)) 7015 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7016 CostKind); 7017 else 7018 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 7019 CostKind, I); 7020 7021 bool Reverse = ConsecutiveStride < 0; 7022 if (Reverse) 7023 Cost += 7024 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7025 return Cost; 7026 } 7027 7028 InstructionCost 7029 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7030 ElementCount VF) { 7031 assert(Legal->isUniformMemOp(*I)); 7032 7033 Type *ValTy = getMemInstValueType(I); 7034 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7035 const Align Alignment = getLoadStoreAlignment(I); 7036 unsigned AS = getLoadStoreAddressSpace(I); 7037 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7038 if (isa<LoadInst>(I)) { 7039 return TTI.getAddressComputationCost(ValTy) + 7040 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7041 CostKind) + 7042 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7043 } 7044 StoreInst *SI = cast<StoreInst>(I); 7045 7046 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7047 return TTI.getAddressComputationCost(ValTy) + 7048 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7049 CostKind) + 7050 (isLoopInvariantStoreValue 7051 ? 
                 0
                 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                          VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}

InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  // Early exit if there are no in-loop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return InstructionCost::getInvalid();
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //   reduce(mul(ext(A), ext(B))) or
  //   reduce(mul(A, B)) or
  //   reduce(ext(A)) or
  //   reduce(A).
  // The basic idea is that we walk down the tree, finding the root reduction
  // instruction in InLoopReductionImmediateChains. From there we find the
  // pattern of mul/ext and test the cost of the entire pattern against the
  // cost of its components. If the reduction cost is lower, we return it for
  // the reduction instruction and 0 for the other instructions in the
  // pattern. If it is not, we return an invalid cost specifying that the
  // original cost method should be used.
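  // For illustration (the value names below are invented), an in-loop chain
  // such as:
  //   %a.ext = sext <16 x i8> %a to <16 x i32>
  //   %b.ext = sext <16 x i8> %b to <16 x i32>
  //   %mul   = mul nsw <16 x i32> %a.ext, %b.ext
  //   ... add-reduction of %mul into the reduction phi ...
  // matches reduce(mul(ext(A), ext(B))) and may be costed as a single extended
  // multiply-accumulate reduction when the target reports a cheaper cost via
  // getExtendedAddReductionCost.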
7134 Instruction *RetI = I; 7135 if ((RetI->getOpcode() == Instruction::SExt || 7136 RetI->getOpcode() == Instruction::ZExt)) { 7137 if (!RetI->hasOneUser()) 7138 return InstructionCost::getInvalid(); 7139 RetI = RetI->user_back(); 7140 } 7141 if (RetI->getOpcode() == Instruction::Mul && 7142 RetI->user_back()->getOpcode() == Instruction::Add) { 7143 if (!RetI->hasOneUser()) 7144 return InstructionCost::getInvalid(); 7145 RetI = RetI->user_back(); 7146 } 7147 7148 // Test if the found instruction is a reduction, and if not return an invalid 7149 // cost specifying the parent to use the original cost modelling. 7150 if (!InLoopReductionImmediateChains.count(RetI)) 7151 return InstructionCost::getInvalid(); 7152 7153 // Find the reduction this chain is a part of and calculate the basic cost of 7154 // the reduction on its own. 7155 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7156 Instruction *ReductionPhi = LastChain; 7157 while (!isa<PHINode>(ReductionPhi)) 7158 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7159 7160 RecurrenceDescriptor RdxDesc = 7161 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7162 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7163 RdxDesc.getOpcode(), VectorTy, false, CostKind); 7164 7165 // Get the operand that was not the reduction chain and match it to one of the 7166 // patterns, returning the better cost if it is found. 7167 Instruction *RedOp = RetI->getOperand(1) == LastChain 7168 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7169 : dyn_cast<Instruction>(RetI->getOperand(1)); 7170 7171 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7172 7173 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 7174 !TheLoop->isLoopInvariant(RedOp)) { 7175 bool IsUnsigned = isa<ZExtInst>(RedOp); 7176 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7177 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7178 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7179 CostKind); 7180 7181 InstructionCost ExtCost = 7182 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7183 TTI::CastContextHint::None, CostKind, RedOp); 7184 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7185 return I == RetI ? *RedCost.getValue() : 0; 7186 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7187 Instruction *Mul = RedOp; 7188 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7189 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7190 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7191 Op0->getOpcode() == Op1->getOpcode() && 7192 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7193 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7194 bool IsUnsigned = isa<ZExtInst>(Op0); 7195 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7196 // reduce(mul(ext, ext)) 7197 InstructionCost ExtCost = 7198 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7199 TTI::CastContextHint::None, CostKind, Op0); 7200 InstructionCost MulCost = 7201 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7202 7203 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7204 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7205 CostKind); 7206 7207 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7208 return I == RetI ? 
*RedCost.getValue() : 0; 7209 } else { 7210 InstructionCost MulCost = 7211 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7212 7213 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7214 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7215 CostKind); 7216 7217 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7218 return I == RetI ? *RedCost.getValue() : 0; 7219 } 7220 } 7221 7222 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7223 } 7224 7225 InstructionCost 7226 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7227 ElementCount VF) { 7228 // Calculate scalar cost only. Vectorization cost should be ready at this 7229 // moment. 7230 if (VF.isScalar()) { 7231 Type *ValTy = getMemInstValueType(I); 7232 const Align Alignment = getLoadStoreAlignment(I); 7233 unsigned AS = getLoadStoreAddressSpace(I); 7234 7235 return TTI.getAddressComputationCost(ValTy) + 7236 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7237 TTI::TCK_RecipThroughput, I); 7238 } 7239 return getWideningCost(I, VF); 7240 } 7241 7242 LoopVectorizationCostModel::VectorizationCostTy 7243 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7244 ElementCount VF) { 7245 // If we know that this instruction will remain uniform, check the cost of 7246 // the scalar version. 7247 if (isUniformAfterVectorization(I, VF)) 7248 VF = ElementCount::getFixed(1); 7249 7250 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7251 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7252 7253 // Forced scalars do not have any scalarization overhead. 7254 auto ForcedScalar = ForcedScalars.find(VF); 7255 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7256 auto InstSet = ForcedScalar->second; 7257 if (InstSet.count(I)) 7258 return VectorizationCostTy( 7259 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7260 VF.getKnownMinValue()), 7261 false); 7262 } 7263 7264 Type *VectorTy; 7265 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7266 7267 bool TypeNotScalarized = 7268 VF.isVector() && VectorTy->isVectorTy() && 7269 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7270 return VectorizationCostTy(C, TypeNotScalarized); 7271 } 7272 7273 InstructionCost 7274 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7275 ElementCount VF) const { 7276 7277 if (VF.isScalable()) 7278 return InstructionCost::getInvalid(); 7279 7280 if (VF.isScalar()) 7281 return 0; 7282 7283 InstructionCost Cost = 0; 7284 Type *RetTy = ToVectorTy(I->getType(), VF); 7285 if (!RetTy->isVoidTy() && 7286 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7287 Cost += TTI.getScalarizationOverhead( 7288 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7289 true, false); 7290 7291 // Some targets keep addresses scalar. 7292 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7293 return Cost; 7294 7295 // Some targets support efficient element stores. 7296 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7297 return Cost; 7298 7299 // Collect operands to consider. 7300 CallInst *CI = dyn_cast<CallInst>(I); 7301 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7302 7303 // Skip operands that do not require extraction/scalarization and do not incur 7304 // any overhead. 
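  // For example (a sketch of the common cases, not an exhaustive rule):
  // operands defined outside the loop or that will themselves remain scalar
  // after vectorization need no extractelement, so they are filtered out
  // before the overhead query below.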
7305 SmallVector<Type *> Tys; 7306 for (auto *V : filterExtractingOperands(Ops, VF)) 7307 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7308 return Cost + TTI.getOperandsScalarizationOverhead( 7309 filterExtractingOperands(Ops, VF), Tys); 7310 } 7311 7312 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7313 if (VF.isScalar()) 7314 return; 7315 NumPredStores = 0; 7316 for (BasicBlock *BB : TheLoop->blocks()) { 7317 // For each instruction in the old loop. 7318 for (Instruction &I : *BB) { 7319 Value *Ptr = getLoadStorePointerOperand(&I); 7320 if (!Ptr) 7321 continue; 7322 7323 // TODO: We should generate better code and update the cost model for 7324 // predicated uniform stores. Today they are treated as any other 7325 // predicated store (see added test cases in 7326 // invariant-store-vectorization.ll). 7327 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7328 NumPredStores++; 7329 7330 if (Legal->isUniformMemOp(I)) { 7331 // TODO: Avoid replicating loads and stores instead of 7332 // relying on instcombine to remove them. 7333 // Load: Scalar load + broadcast 7334 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7335 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7336 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7337 continue; 7338 } 7339 7340 // We assume that widening is the best solution when possible. 7341 if (memoryInstructionCanBeWidened(&I, VF)) { 7342 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7343 int ConsecutiveStride = 7344 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7345 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7346 "Expected consecutive stride."); 7347 InstWidening Decision = 7348 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7349 setWideningDecision(&I, VF, Decision, Cost); 7350 continue; 7351 } 7352 7353 // Choose between Interleaving, Gather/Scatter or Scalarization. 7354 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7355 unsigned NumAccesses = 1; 7356 if (isAccessInterleaved(&I)) { 7357 auto Group = getInterleavedAccessGroup(&I); 7358 assert(Group && "Fail to get an interleaved access group."); 7359 7360 // Make one decision for the whole group. 7361 if (getWideningDecision(&I, VF) != CM_Unknown) 7362 continue; 7363 7364 NumAccesses = Group->getNumMembers(); 7365 if (interleavedAccessCanBeWidened(&I, VF)) 7366 InterleaveCost = getInterleaveGroupCost(&I, VF); 7367 } 7368 7369 InstructionCost GatherScatterCost = 7370 isLegalGatherOrScatter(&I) 7371 ? getGatherScatterCost(&I, VF) * NumAccesses 7372 : InstructionCost::getInvalid(); 7373 7374 InstructionCost ScalarizationCost = 7375 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7376 7377 // Choose better solution for the current VF, 7378 // write down this decision and use it during vectorization. 7379 InstructionCost Cost; 7380 InstWidening Decision; 7381 if (InterleaveCost <= GatherScatterCost && 7382 InterleaveCost < ScalarizationCost) { 7383 Decision = CM_Interleave; 7384 Cost = InterleaveCost; 7385 } else if (GatherScatterCost < ScalarizationCost) { 7386 Decision = CM_GatherScatter; 7387 Cost = GatherScatterCost; 7388 } else { 7389 assert(!VF.isScalable() && 7390 "We cannot yet scalarise for scalable vectors"); 7391 Decision = CM_Scalarize; 7392 Cost = ScalarizationCost; 7393 } 7394 // If the instructions belongs to an interleave group, the whole group 7395 // receives the same decision. 
The whole group receives the cost, but 7396 // the cost will actually be assigned to one instruction. 7397 if (auto Group = getInterleavedAccessGroup(&I)) 7398 setWideningDecision(Group, VF, Decision, Cost); 7399 else 7400 setWideningDecision(&I, VF, Decision, Cost); 7401 } 7402 } 7403 7404 // Make sure that any load of address and any other address computation 7405 // remains scalar unless there is gather/scatter support. This avoids 7406 // inevitable extracts into address registers, and also has the benefit of 7407 // activating LSR more, since that pass can't optimize vectorized 7408 // addresses. 7409 if (TTI.prefersVectorizedAddressing()) 7410 return; 7411 7412 // Start with all scalar pointer uses. 7413 SmallPtrSet<Instruction *, 8> AddrDefs; 7414 for (BasicBlock *BB : TheLoop->blocks()) 7415 for (Instruction &I : *BB) { 7416 Instruction *PtrDef = 7417 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7418 if (PtrDef && TheLoop->contains(PtrDef) && 7419 getWideningDecision(&I, VF) != CM_GatherScatter) 7420 AddrDefs.insert(PtrDef); 7421 } 7422 7423 // Add all instructions used to generate the addresses. 7424 SmallVector<Instruction *, 4> Worklist; 7425 append_range(Worklist, AddrDefs); 7426 while (!Worklist.empty()) { 7427 Instruction *I = Worklist.pop_back_val(); 7428 for (auto &Op : I->operands()) 7429 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7430 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7431 AddrDefs.insert(InstOp).second) 7432 Worklist.push_back(InstOp); 7433 } 7434 7435 for (auto *I : AddrDefs) { 7436 if (isa<LoadInst>(I)) { 7437 // Setting the desired widening decision should ideally be handled in 7438 // by cost functions, but since this involves the task of finding out 7439 // if the loaded register is involved in an address computation, it is 7440 // instead changed here when we know this is the case. 7441 InstWidening Decision = getWideningDecision(I, VF); 7442 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7443 // Scalarize a widened load of address. 7444 setWideningDecision( 7445 I, VF, CM_Scalarize, 7446 (VF.getKnownMinValue() * 7447 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7448 else if (auto Group = getInterleavedAccessGroup(I)) { 7449 // Scalarize an interleave group of address loads. 7450 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7451 if (Instruction *Member = Group->getMember(I)) 7452 setWideningDecision( 7453 Member, VF, CM_Scalarize, 7454 (VF.getKnownMinValue() * 7455 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7456 } 7457 } 7458 } else 7459 // Make sure I gets scalarized and a cost estimate without 7460 // scalarization overhead. 
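      // Recording I in ForcedScalars means the cost query for this VF will
      // later multiply I's scalar cost by VF without charging any
      // insert/extract overhead (see the ForcedScalars handling in
      // getInstructionCost).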
7461 ForcedScalars[VF].insert(I); 7462 } 7463 } 7464 7465 InstructionCost 7466 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7467 Type *&VectorTy) { 7468 Type *RetTy = I->getType(); 7469 if (canTruncateToMinimalBitwidth(I, VF)) 7470 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7471 auto SE = PSE.getSE(); 7472 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7473 7474 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7475 ElementCount VF) -> bool { 7476 if (VF.isScalar()) 7477 return true; 7478 7479 auto Scalarized = InstsToScalarize.find(VF); 7480 assert(Scalarized != InstsToScalarize.end() && 7481 "VF not yet analyzed for scalarization profitability"); 7482 return !Scalarized->second.count(I) && 7483 llvm::all_of(I->users(), [&](User *U) { 7484 auto *UI = cast<Instruction>(U); 7485 return !Scalarized->second.count(UI); 7486 }); 7487 }; 7488 (void) hasSingleCopyAfterVectorization; 7489 7490 if (isScalarAfterVectorization(I, VF)) { 7491 // With the exception of GEPs and PHIs, after scalarization there should 7492 // only be one copy of the instruction generated in the loop. This is 7493 // because the VF is either 1, or any instructions that need scalarizing 7494 // have already been dealt with by the the time we get here. As a result, 7495 // it means we don't have to multiply the instruction cost by VF. 7496 assert(I->getOpcode() == Instruction::GetElementPtr || 7497 I->getOpcode() == Instruction::PHI || 7498 (I->getOpcode() == Instruction::BitCast && 7499 I->getType()->isPointerTy()) || 7500 hasSingleCopyAfterVectorization(I, VF)); 7501 VectorTy = RetTy; 7502 } else 7503 VectorTy = ToVectorTy(RetTy, VF); 7504 7505 // TODO: We need to estimate the cost of intrinsic calls. 7506 switch (I->getOpcode()) { 7507 case Instruction::GetElementPtr: 7508 // We mark this instruction as zero-cost because the cost of GEPs in 7509 // vectorized code depends on whether the corresponding memory instruction 7510 // is scalarized or not. Therefore, we handle GEPs with the memory 7511 // instruction cost. 7512 return 0; 7513 case Instruction::Br: { 7514 // In cases of scalarized and predicated instructions, there will be VF 7515 // predicated blocks in the vectorized loop. Each branch around these 7516 // blocks requires also an extract of its vector compare i1 element. 7517 bool ScalarPredicatedBB = false; 7518 BranchInst *BI = cast<BranchInst>(I); 7519 if (VF.isVector() && BI->isConditional() && 7520 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7521 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7522 ScalarPredicatedBB = true; 7523 7524 if (ScalarPredicatedBB) { 7525 // Return cost for branches around scalarized and predicated blocks. 7526 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7527 auto *Vec_i1Ty = 7528 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7529 return (TTI.getScalarizationOverhead( 7530 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7531 false, true) + 7532 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7533 VF.getKnownMinValue())); 7534 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7535 // The back-edge branch will remain, as will all scalar branches. 7536 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7537 else 7538 // This branch will be eliminated by if-conversion. 
7539 return 0; 7540 // Note: We currently assume zero cost for an unconditional branch inside 7541 // a predicated block since it will become a fall-through, although we 7542 // may decide in the future to call TTI for all branches. 7543 } 7544 case Instruction::PHI: { 7545 auto *Phi = cast<PHINode>(I); 7546 7547 // First-order recurrences are replaced by vector shuffles inside the loop. 7548 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7549 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7550 return TTI.getShuffleCost( 7551 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7552 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7553 7554 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7555 // converted into select instructions. We require N - 1 selects per phi 7556 // node, where N is the number of incoming values. 7557 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7558 return (Phi->getNumIncomingValues() - 1) * 7559 TTI.getCmpSelInstrCost( 7560 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7561 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7562 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7563 7564 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7565 } 7566 case Instruction::UDiv: 7567 case Instruction::SDiv: 7568 case Instruction::URem: 7569 case Instruction::SRem: 7570 // If we have a predicated instruction, it may not be executed for each 7571 // vector lane. Get the scalarization cost and scale this amount by the 7572 // probability of executing the predicated block. If the instruction is not 7573 // predicated, we fall through to the next case. 7574 if (VF.isVector() && isScalarWithPredication(I)) { 7575 InstructionCost Cost = 0; 7576 7577 // These instructions have a non-void type, so account for the phi nodes 7578 // that we will create. This cost is likely to be zero. The phi node 7579 // cost, if any, should be scaled by the block probability because it 7580 // models a copy at the end of each predicated block. 7581 Cost += VF.getKnownMinValue() * 7582 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7583 7584 // The cost of the non-predicated instruction. 7585 Cost += VF.getKnownMinValue() * 7586 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7587 7588 // The cost of insertelement and extractelement instructions needed for 7589 // scalarization. 7590 Cost += getScalarizationOverhead(I, VF); 7591 7592 // Scale the cost by the probability of executing the predicated blocks. 7593 // This assumes the predicated block for each vector lane is equally 7594 // likely. 7595 return Cost / getReciprocalPredBlockProb(); 7596 } 7597 LLVM_FALLTHROUGH; 7598 case Instruction::Add: 7599 case Instruction::FAdd: 7600 case Instruction::Sub: 7601 case Instruction::FSub: 7602 case Instruction::Mul: 7603 case Instruction::FMul: 7604 case Instruction::FDiv: 7605 case Instruction::FRem: 7606 case Instruction::Shl: 7607 case Instruction::LShr: 7608 case Instruction::AShr: 7609 case Instruction::And: 7610 case Instruction::Or: 7611 case Instruction::Xor: { 7612 // Since we will replace the stride by 1 the multiplication should go away. 
7613 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7614 return 0; 7615 7616 // Detect reduction patterns 7617 InstructionCost RedCost; 7618 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7619 .isValid()) 7620 return RedCost; 7621 7622 // Certain instructions can be cheaper to vectorize if they have a constant 7623 // second vector operand. One example of this are shifts on x86. 7624 Value *Op2 = I->getOperand(1); 7625 TargetTransformInfo::OperandValueProperties Op2VP; 7626 TargetTransformInfo::OperandValueKind Op2VK = 7627 TTI.getOperandInfo(Op2, Op2VP); 7628 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7629 Op2VK = TargetTransformInfo::OK_UniformValue; 7630 7631 SmallVector<const Value *, 4> Operands(I->operand_values()); 7632 return TTI.getArithmeticInstrCost( 7633 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7634 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7635 } 7636 case Instruction::FNeg: { 7637 return TTI.getArithmeticInstrCost( 7638 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7639 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7640 TargetTransformInfo::OP_None, I->getOperand(0), I); 7641 } 7642 case Instruction::Select: { 7643 SelectInst *SI = cast<SelectInst>(I); 7644 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7645 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7646 7647 const Value *Op0, *Op1; 7648 using namespace llvm::PatternMatch; 7649 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7650 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7651 // select x, y, false --> x & y 7652 // select x, true, y --> x | y 7653 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7654 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7655 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7656 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7657 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7658 Op1->getType()->getScalarSizeInBits() == 1); 7659 7660 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7661 return TTI.getArithmeticInstrCost( 7662 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7663 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7664 } 7665 7666 Type *CondTy = SI->getCondition()->getType(); 7667 if (!ScalarCond) 7668 CondTy = VectorType::get(CondTy, VF); 7669 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7670 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7671 } 7672 case Instruction::ICmp: 7673 case Instruction::FCmp: { 7674 Type *ValTy = I->getOperand(0)->getType(); 7675 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7676 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7677 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7678 VectorTy = ToVectorTy(ValTy, VF); 7679 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7680 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7681 } 7682 case Instruction::Store: 7683 case Instruction::Load: { 7684 ElementCount Width = VF; 7685 if (Width.isVector()) { 7686 InstWidening Decision = getWideningDecision(I, Width); 7687 assert(Decision != CM_Unknown && 7688 "CM decision should be taken at this point"); 7689 if (Decision == CM_Scalarize) 7690 Width = ElementCount::getFixed(1); 7691 } 7692 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7693 return getMemoryInstructionCost(I, VF); 7694 } 7695 case Instruction::BitCast: 7696 if (I->getType()->isPointerTy()) 7697 return 0; 7698 LLVM_FALLTHROUGH; 7699 case Instruction::ZExt: 7700 case Instruction::SExt: 7701 case Instruction::FPToUI: 7702 case Instruction::FPToSI: 7703 case Instruction::FPExt: 7704 case Instruction::PtrToInt: 7705 case Instruction::IntToPtr: 7706 case Instruction::SIToFP: 7707 case Instruction::UIToFP: 7708 case Instruction::Trunc: 7709 case Instruction::FPTrunc: { 7710 // Computes the CastContextHint from a Load/Store instruction. 7711 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7712 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7713 "Expected a load or a store!"); 7714 7715 if (VF.isScalar() || !TheLoop->contains(I)) 7716 return TTI::CastContextHint::Normal; 7717 7718 switch (getWideningDecision(I, VF)) { 7719 case LoopVectorizationCostModel::CM_GatherScatter: 7720 return TTI::CastContextHint::GatherScatter; 7721 case LoopVectorizationCostModel::CM_Interleave: 7722 return TTI::CastContextHint::Interleave; 7723 case LoopVectorizationCostModel::CM_Scalarize: 7724 case LoopVectorizationCostModel::CM_Widen: 7725 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7726 : TTI::CastContextHint::Normal; 7727 case LoopVectorizationCostModel::CM_Widen_Reverse: 7728 return TTI::CastContextHint::Reversed; 7729 case LoopVectorizationCostModel::CM_Unknown: 7730 llvm_unreachable("Instr did not go through cost modelling?"); 7731 } 7732 7733 llvm_unreachable("Unhandled case!"); 7734 }; 7735 7736 unsigned Opcode = I->getOpcode(); 7737 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7738 // For Trunc, the context is the only user, which must be a StoreInst. 7739 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7740 if (I->hasOneUse()) 7741 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7742 CCH = ComputeCCH(Store); 7743 } 7744 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7745 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7746 Opcode == Instruction::FPExt) { 7747 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7748 CCH = ComputeCCH(Load); 7749 } 7750 7751 // We optimize the truncation of induction variables having constant 7752 // integer steps. The cost of these truncations is the same as the scalar 7753 // operation. 7754 if (isOptimizableIVTruncate(I, VF)) { 7755 auto *Trunc = cast<TruncInst>(I); 7756 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7757 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7758 } 7759 7760 // Detect reduction patterns 7761 InstructionCost RedCost; 7762 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7763 .isValid()) 7764 return RedCost; 7765 7766 Type *SrcScalarTy = I->getOperand(0)->getType(); 7767 Type *SrcVecTy = 7768 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7769 if (canTruncateToMinimalBitwidth(I, VF)) { 7770 // This cast is going to be shrunk. This may remove the cast or it might 7771 // turn it into slightly different cast. For example, if MinBW == 16, 7772 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7773 // 7774 // Calculate the modified src and dest types. 7775 Type *MinVecTy = VectorTy; 7776 if (Opcode == Instruction::Trunc) { 7777 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7778 VectorTy = 7779 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7780 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7781 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7782 VectorTy = 7783 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7784 } 7785 } 7786 7787 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7788 } 7789 case Instruction::Call: { 7790 bool NeedToScalarize; 7791 CallInst *CI = cast<CallInst>(I); 7792 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7793 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7794 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7795 return std::min(CallCost, IntrinsicCost); 7796 } 7797 return CallCost; 7798 } 7799 case Instruction::ExtractValue: 7800 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7801 default: 7802 // This opcode is unknown. Assume that it is the same as 'mul'. 7803 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7804 } // end of switch. 
7805 } 7806 7807 char LoopVectorize::ID = 0; 7808 7809 static const char lv_name[] = "Loop Vectorization"; 7810 7811 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7812 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7813 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7814 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7815 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7816 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7817 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7818 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7819 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7820 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7821 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7822 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7823 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7824 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7825 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7826 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7827 7828 namespace llvm { 7829 7830 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7831 7832 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7833 bool VectorizeOnlyWhenForced) { 7834 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7835 } 7836 7837 } // end namespace llvm 7838 7839 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7840 // Check if the pointer operand of a load or store instruction is 7841 // consecutive. 7842 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7843 return Legal->isConsecutivePtr(Ptr); 7844 return false; 7845 } 7846 7847 void LoopVectorizationCostModel::collectValuesToIgnore() { 7848 // Ignore ephemeral values. 7849 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7850 7851 // Ignore type-promoting instructions we identified during reduction 7852 // detection. 7853 for (auto &Reduction : Legal->getReductionVars()) { 7854 RecurrenceDescriptor &RedDes = Reduction.second; 7855 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7856 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7857 } 7858 // Ignore type-casting instructions we identified during induction 7859 // detection. 7860 for (auto &Induction : Legal->getInductionVars()) { 7861 InductionDescriptor &IndDes = Induction.second; 7862 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7863 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7864 } 7865 } 7866 7867 void LoopVectorizationCostModel::collectInLoopReductions() { 7868 for (auto &Reduction : Legal->getReductionVars()) { 7869 PHINode *Phi = Reduction.first; 7870 RecurrenceDescriptor &RdxDesc = Reduction.second; 7871 7872 // We don't collect reductions that are type promoted (yet). 7873 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7874 continue; 7875 7876 // If the target would prefer this reduction to happen "in-loop", then we 7877 // want to record it as such. 7878 unsigned Opcode = RdxDesc.getOpcode(); 7879 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7880 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7881 TargetTransformInfo::ReductionFlags())) 7882 continue; 7883 7884 // Check that we can correctly put the reductions into the loop, by 7885 // finding the chain of operations that leads from the phi to the loop 7886 // exit value. 
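    // For example (illustrative only): for an integer add reduction the chain
    // is typically just
    //   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
    //   %sum.next = add i32 %sum, %val
    // and getReductionOpChain returns the operations leading from the phi to
    // the value that feeds the loop-exit phi.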
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop) {
      InLoopReductionChains[Phi] = ReductionOperations;
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
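  // In practice this is the tail-folding-by-masking case: when the header
  // itself needs predication and the target reports no masked interleaved
  // load/store support, the interleave groups cannot be kept.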
7969 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7970 !useMaskedInterleavedAccesses(*TTI)) { 7971 LLVM_DEBUG( 7972 dbgs() 7973 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7974 "which requires masked-interleaved support.\n"); 7975 if (CM.InterleaveInfo.invalidateGroups()) 7976 // Invalidating interleave groups also requires invalidating all decisions 7977 // based on them, which includes widening decisions and uniform and scalar 7978 // values. 7979 CM.invalidateCostModelingDecisions(); 7980 } 7981 7982 ElementCount MaxVF = MaybeMaxVF.getValue(); 7983 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7984 7985 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7986 if (!UserVF.isZero() && 7987 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7988 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7989 // VFs here, this should be reverted to only use legal UserVFs once the 7990 // loop below supports scalable VFs. 7991 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7992 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7993 << " VF " << VF << ".\n"); 7994 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7995 "VF needs to be a power of two"); 7996 // Collect the instructions (and their associated costs) that will be more 7997 // profitable to scalarize. 7998 CM.selectUserVectorizationFactor(VF); 7999 CM.collectInLoopReductions(); 8000 buildVPlansWithVPRecipes(VF, VF); 8001 LLVM_DEBUG(printPlans(dbgs())); 8002 return {{VF, 0}}; 8003 } 8004 8005 assert(!MaxVF.isScalable() && 8006 "Scalable vectors not yet supported beyond this point"); 8007 8008 for (ElementCount VF = ElementCount::getFixed(1); 8009 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 8010 // Collect Uniform and Scalar instructions after vectorization with VF. 8011 CM.collectUniformsAndScalars(VF); 8012 8013 // Collect the instructions (and their associated costs) that will be more 8014 // profitable to scalarize. 8015 if (VF.isVector()) 8016 CM.collectInstsToScalarize(VF); 8017 } 8018 8019 CM.collectInLoopReductions(); 8020 8021 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 8022 LLVM_DEBUG(printPlans(dbgs())); 8023 if (MaxVF.isScalar()) 8024 return VectorizationFactor::Disabled(); 8025 8026 // Select the optimal vectorization factor. 8027 auto SelectedVF = CM.selectVectorizationFactor(MaxVF); 8028 8029 // Check if it is profitable to vectorize with runtime checks. 
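  // Sketch of the decision below: a VF > 1 choice is abandoned when the number
  // of runtime pointer checks exceeds PragmaVectorizeMemoryCheckThreshold, or
  // exceeds VectorizerParams::RuntimeMemoryCheckThreshold while the hints do
  // not allow reordering.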
8030 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8031 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8032 bool PragmaThresholdReached = 8033 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8034 bool ThresholdReached = 8035 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8036 if ((ThresholdReached && !Hints.allowReordering()) || 8037 PragmaThresholdReached) { 8038 ORE->emit([&]() { 8039 return OptimizationRemarkAnalysisAliasing( 8040 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8041 OrigLoop->getHeader()) 8042 << "loop not vectorized: cannot prove it is safe to reorder " 8043 "memory operations"; 8044 }); 8045 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8046 Hints.emitRemarkWithHints(); 8047 return VectorizationFactor::Disabled(); 8048 } 8049 } 8050 return SelectedVF; 8051 } 8052 8053 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 8054 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 8055 << '\n'); 8056 BestVF = VF; 8057 BestUF = UF; 8058 8059 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 8060 return !Plan->hasVF(VF); 8061 }); 8062 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 8063 } 8064 8065 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 8066 DominatorTree *DT) { 8067 // Perform the actual loop transformation. 8068 8069 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8070 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 8071 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 8072 8073 VPTransformState State{ 8074 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 8075 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8076 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8077 State.CanonicalIV = ILV.Induction; 8078 8079 ILV.printDebugTracesAtStart(); 8080 8081 //===------------------------------------------------===// 8082 // 8083 // Notice: any optimization or new instruction that go 8084 // into the code below should also be implemented in 8085 // the cost-model. 8086 // 8087 //===------------------------------------------------===// 8088 8089 // 2. Copy and widen instructions from the old loop into the new loop. 8090 VPlans.front()->execute(&State); 8091 8092 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8093 // predication, updating analyses. 
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
  SmallVector<BasicBlock *> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp are often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting
    // chain will get its scalar/vector/widened def from the
    // scalar/vector/widened def of the respective phi node. Any other casts in
    // the induction def-use chain have no other uses outside the phi update
    // chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
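  // For example (assuming an integer Val): with StartIdx == 2 and step S the
  // result is Val + 2 * S; the floating-point path below performs the
  // equivalent fmul/BinOp sequence.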
8173 Type *Ty = Val->getType(); 8174 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8175 8176 if (Ty->isFloatingPointTy()) { 8177 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8178 8179 // Floating-point operations inherit FMF via the builder's flags. 8180 Value *MulOp = Builder.CreateFMul(C, Step); 8181 return Builder.CreateBinOp(BinOp, Val, MulOp); 8182 } 8183 Constant *C = ConstantInt::get(Ty, StartIdx); 8184 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8185 } 8186 8187 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8188 SmallVector<Metadata *, 4> MDs; 8189 // Reserve first location for self reference to the LoopID metadata node. 8190 MDs.push_back(nullptr); 8191 bool IsUnrollMetadata = false; 8192 MDNode *LoopID = L->getLoopID(); 8193 if (LoopID) { 8194 // First find existing loop unrolling disable metadata. 8195 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8196 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8197 if (MD) { 8198 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8199 IsUnrollMetadata = 8200 S && S->getString().startswith("llvm.loop.unroll.disable"); 8201 } 8202 MDs.push_back(LoopID->getOperand(i)); 8203 } 8204 } 8205 8206 if (!IsUnrollMetadata) { 8207 // Add runtime unroll disable metadata. 8208 LLVMContext &Context = L->getHeader()->getContext(); 8209 SmallVector<Metadata *, 1> DisableOperands; 8210 DisableOperands.push_back( 8211 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8212 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8213 MDs.push_back(DisableNode); 8214 MDNode *NewLoopID = MDNode::get(Context, MDs); 8215 // Set operand 0 to refer to the loop id itself. 8216 NewLoopID->replaceOperandWith(0, NewLoopID); 8217 L->setLoopID(NewLoopID); 8218 } 8219 } 8220 8221 //===--------------------------------------------------------------------===// 8222 // EpilogueVectorizerMainLoop 8223 //===--------------------------------------------------------------------===// 8224 8225 /// This function is partially responsible for generating the control flow 8226 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8227 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8228 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8229 Loop *Lp = createVectorLoopSkeleton(""); 8230 8231 // Generate the code to check the minimum iteration count of the vector 8232 // epilogue (see below). 8233 EPI.EpilogueIterationCountCheck = 8234 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8235 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8236 8237 // Generate the code to check any assumptions that we've made for SCEV 8238 // expressions. 8239 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8240 8241 // Generate the code that checks at runtime if arrays overlap. We put the 8242 // checks into a separate block to make the more common case of few elements 8243 // faster. 8244 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8245 8246 // Generate the iteration count check for the main loop, *after* the check 8247 // for the epilogue loop, so that the path-length is shorter for the case 8248 // that goes directly through the vector epilogue. The longer-path length for 8249 // the main loop is compensated for, by the gain from vectorizing the larger 8250 // trip count. Note: the branch will get updated later on when we vectorize 8251 // the epilogue. 
8252 EPI.MainLoopIterationCountCheck = 8253 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8254 8255 // Generate the induction variable. 8256 OldInduction = Legal->getPrimaryInduction(); 8257 Type *IdxTy = Legal->getWidestInductionType(); 8258 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8259 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8260 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8261 EPI.VectorTripCount = CountRoundDown; 8262 Induction = 8263 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8264 getDebugLocFromInstOrOperands(OldInduction)); 8265 8266 // Skip induction resume value creation here because they will be created in 8267 // the second pass. If we created them here, they wouldn't be used anyway, 8268 // because the vplan in the second pass still contains the inductions from the 8269 // original loop. 8270 8271 return completeLoopSkeleton(Lp, OrigLoopID); 8272 } 8273 8274 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8275 LLVM_DEBUG({ 8276 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8277 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8278 << ", Main Loop UF:" << EPI.MainLoopUF 8279 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8280 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8281 }); 8282 } 8283 8284 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8285 DEBUG_WITH_TYPE(VerboseDebug, { 8286 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8287 }); 8288 } 8289 8290 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8291 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8292 assert(L && "Expected valid Loop."); 8293 assert(Bypass && "Expected valid bypass basic block."); 8294 unsigned VFactor = 8295 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8296 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8297 Value *Count = getOrCreateTripCount(L); 8298 // Reuse existing vector loop preheader for TC checks. 8299 // Note that new preheader block is generated for vector loop. 8300 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8301 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8302 8303 // Generate code to check if the loop's trip count is less than VF * UF of the 8304 // main vector loop. 8305 auto P = 8306 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8307 8308 Value *CheckMinIters = Builder.CreateICmp( 8309 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8310 "min.iters.check"); 8311 8312 if (!ForEpilogue) 8313 TCCheckBlock->setName("vector.main.loop.iter.check"); 8314 8315 // Create new preheader for vector loop. 8316 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8317 DT, LI, nullptr, "vector.ph"); 8318 8319 if (ForEpilogue) { 8320 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8321 DT->getNode(Bypass)->getIDom()) && 8322 "TC check is expected to dominate Bypass"); 8323 8324 // Update dominator for Bypass & LoopExit. 8325 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8326 if (!Cost->requiresScalarEpilogue()) 8327 // For loops with multiple exits, there's no edge from the middle block 8328 // to exit blocks (as the epilogue must run) and thus no need to update 8329 // the immediate dominator of the exit blocks. 
8330 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8331 8332 LoopBypassBlocks.push_back(TCCheckBlock); 8333 8334 // Save the trip count so we don't have to regenerate it in the 8335 // vec.epilog.iter.check. This is safe to do because the trip count 8336 // generated here dominates the vector epilog iter check. 8337 EPI.TripCount = Count; 8338 } 8339 8340 ReplaceInstWithInst( 8341 TCCheckBlock->getTerminator(), 8342 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8343 8344 return TCCheckBlock; 8345 } 8346 8347 //===--------------------------------------------------------------------===// 8348 // EpilogueVectorizerEpilogueLoop 8349 //===--------------------------------------------------------------------===// 8350 8351 /// This function is partially responsible for generating the control flow 8352 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8353 BasicBlock * 8354 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8355 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8356 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8357 8358 // Now, compare the remaining count and if there aren't enough iterations to 8359 // execute the vectorized epilogue skip to the scalar part. 8360 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8361 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8362 LoopVectorPreHeader = 8363 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8364 LI, nullptr, "vec.epilog.ph"); 8365 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8366 VecEpilogueIterationCountCheck); 8367 8368 // Adjust the control flow taking the state info from the main loop 8369 // vectorization into account. 8370 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8371 "expected this to be saved from the previous pass."); 8372 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8373 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8374 8375 DT->changeImmediateDominator(LoopVectorPreHeader, 8376 EPI.MainLoopIterationCountCheck); 8377 8378 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8379 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8380 8381 if (EPI.SCEVSafetyCheck) 8382 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8383 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8384 if (EPI.MemSafetyCheck) 8385 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8386 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8387 8388 DT->changeImmediateDominator( 8389 VecEpilogueIterationCountCheck, 8390 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8391 8392 DT->changeImmediateDominator(LoopScalarPreHeader, 8393 EPI.EpilogueIterationCountCheck); 8394 if (!Cost->requiresScalarEpilogue()) 8395 // If there is an epilogue which must run, there's no edge from the 8396 // middle block to exit blocks and thus no need to update the immediate 8397 // dominator of the exit blocks. 8398 DT->changeImmediateDominator(LoopExitBlock, 8399 EPI.EpilogueIterationCountCheck); 8400 8401 // Keep track of bypass blocks, as they feed start values to the induction 8402 // phis in the scalar loop preheader. 
8403 if (EPI.SCEVSafetyCheck)
8404 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8405 if (EPI.MemSafetyCheck)
8406 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8407 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8408
8409 // Generate a resume induction for the vector epilogue and put it in the
8410 // vector epilogue preheader.
8411 Type *IdxTy = Legal->getWidestInductionType();
8412 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8413 LoopVectorPreHeader->getFirstNonPHI());
8414 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8415 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8416 EPI.MainLoopIterationCountCheck);
8417
8418 // Generate the induction variable.
8419 OldInduction = Legal->getPrimaryInduction();
8420 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8421 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8422 Value *StartIdx = EPResumeVal;
8423 Induction =
8424 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8425 getDebugLocFromInstOrOperands(OldInduction));
8426
8427 // Generate induction resume values. These variables save the new starting
8428 // indexes for the scalar loop. They are used to test if there are any tail
8429 // iterations left once the vector loop has completed.
8430 // Note that when the vectorized epilogue is skipped due to the iteration count
8431 // check, the resume value for the induction variable comes from
8432 // the trip count of the main vector loop, hence passing the AdditionalBypass
8433 // argument.
8434 createInductionResumeValues(Lp, CountRoundDown,
8435 {VecEpilogueIterationCountCheck,
8436 EPI.VectorTripCount} /* AdditionalBypass */);
8437
8438 AddRuntimeUnrollDisableMetaData(Lp);
8439 return completeLoopSkeleton(Lp, OrigLoopID);
8440 }
8441
8442 BasicBlock *
8443 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8444 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8445
8446 assert(EPI.TripCount &&
8447 "Expected trip count to have been saved in the first pass.");
8448 assert(
8449 (!isa<Instruction>(EPI.TripCount) ||
8450 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8451 "saved trip count does not dominate insertion point.");
8452 Value *TC = EPI.TripCount;
8453 IRBuilder<> Builder(Insert->getTerminator());
8454 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8455
8456 // Generate code to check if the loop's trip count is less than VF * UF of the
8457 // vector epilogue loop.
8458 auto P =
8459 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8460 8461 Value *CheckMinIters = Builder.CreateICmp( 8462 P, Count, 8463 ConstantInt::get(Count->getType(), 8464 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8465 "min.epilog.iters.check"); 8466 8467 ReplaceInstWithInst( 8468 Insert->getTerminator(), 8469 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8470 8471 LoopBypassBlocks.push_back(Insert); 8472 return Insert; 8473 } 8474 8475 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8476 LLVM_DEBUG({ 8477 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8478 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8479 << ", Main Loop UF:" << EPI.MainLoopUF 8480 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8481 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8482 }); 8483 } 8484 8485 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8486 DEBUG_WITH_TYPE(VerboseDebug, { 8487 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8488 }); 8489 } 8490 8491 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8492 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8493 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8494 bool PredicateAtRangeStart = Predicate(Range.Start); 8495 8496 for (ElementCount TmpVF = Range.Start * 2; 8497 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8498 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8499 Range.End = TmpVF; 8500 break; 8501 } 8502 8503 return PredicateAtRangeStart; 8504 } 8505 8506 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8507 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8508 /// of VF's starting at a given VF and extending it as much as possible. Each 8509 /// vectorization decision can potentially shorten this sub-range during 8510 /// buildVPlan(). 8511 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8512 ElementCount MaxVF) { 8513 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8514 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8515 VFRange SubRange = {VF, MaxVFPlusOne}; 8516 VPlans.push_back(buildVPlan(SubRange)); 8517 VF = SubRange.End; 8518 } 8519 } 8520 8521 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8522 VPlanPtr &Plan) { 8523 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8524 8525 // Look for cached value. 8526 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8527 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8528 if (ECEntryIt != EdgeMaskCache.end()) 8529 return ECEntryIt->second; 8530 8531 VPValue *SrcMask = createBlockInMask(Src, Plan); 8532 8533 // The terminator has to be a branch inst! 8534 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8535 assert(BI && "Unexpected terminator found"); 8536 8537 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8538 return EdgeMaskCache[Edge] = SrcMask; 8539 8540 // If source is an exiting block, we know the exit edge is dynamically dead 8541 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8542 // adding uses of an otherwise potentially dead instruction. 
8543 if (OrigLoop->isLoopExiting(Src)) 8544 return EdgeMaskCache[Edge] = SrcMask; 8545 8546 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8547 assert(EdgeMask && "No Edge Mask found for condition"); 8548 8549 if (BI->getSuccessor(0) != Dst) 8550 EdgeMask = Builder.createNot(EdgeMask); 8551 8552 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8553 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8554 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8555 // The select version does not introduce new UB if SrcMask is false and 8556 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8557 VPValue *False = Plan->getOrAddVPValue( 8558 ConstantInt::getFalse(BI->getCondition()->getType())); 8559 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8560 } 8561 8562 return EdgeMaskCache[Edge] = EdgeMask; 8563 } 8564 8565 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8566 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8567 8568 // Look for cached value. 8569 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8570 if (BCEntryIt != BlockMaskCache.end()) 8571 return BCEntryIt->second; 8572 8573 // All-one mask is modelled as no-mask following the convention for masked 8574 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8575 VPValue *BlockMask = nullptr; 8576 8577 if (OrigLoop->getHeader() == BB) { 8578 if (!CM.blockNeedsPredication(BB)) 8579 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8580 8581 // Create the block in mask as the first non-phi instruction in the block. 8582 VPBuilder::InsertPointGuard Guard(Builder); 8583 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8584 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8585 8586 // Introduce the early-exit compare IV <= BTC to form header block mask. 8587 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8588 // Start by constructing the desired canonical IV. 8589 VPValue *IV = nullptr; 8590 if (Legal->getPrimaryInduction()) 8591 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8592 else { 8593 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8594 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8595 IV = IVRecipe->getVPSingleValue(); 8596 } 8597 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8598 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8599 8600 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8601 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8602 // as a second argument, we only pass the IV here and extract the 8603 // tripcount from the transform state where codegen of the VP instructions 8604 // happen. 8605 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8606 } else { 8607 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8608 } 8609 return BlockMaskCache[BB] = BlockMask; 8610 } 8611 8612 // This is the block mask. We OR all incoming edges. 8613 for (auto *Predecessor : predecessors(BB)) { 8614 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8615 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8616 return BlockMaskCache[BB] = EdgeMask; 8617 8618 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8619 BlockMask = EdgeMask; 8620 continue; 8621 } 8622 8623 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8624 } 8625 8626 return BlockMaskCache[BB] = BlockMask; 8627 } 8628 8629 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8630 ArrayRef<VPValue *> Operands, 8631 VFRange &Range, 8632 VPlanPtr &Plan) { 8633 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8634 "Must be called with either a load or store"); 8635 8636 auto willWiden = [&](ElementCount VF) -> bool { 8637 if (VF.isScalar()) 8638 return false; 8639 LoopVectorizationCostModel::InstWidening Decision = 8640 CM.getWideningDecision(I, VF); 8641 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8642 "CM decision should be taken at this point."); 8643 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8644 return true; 8645 if (CM.isScalarAfterVectorization(I, VF) || 8646 CM.isProfitableToScalarize(I, VF)) 8647 return false; 8648 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8649 }; 8650 8651 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8652 return nullptr; 8653 8654 VPValue *Mask = nullptr; 8655 if (Legal->isMaskRequired(I)) 8656 Mask = createBlockInMask(I->getParent(), Plan); 8657 8658 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8659 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8660 8661 StoreInst *Store = cast<StoreInst>(I); 8662 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8663 Mask); 8664 } 8665 8666 VPWidenIntOrFpInductionRecipe * 8667 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8668 ArrayRef<VPValue *> Operands) const { 8669 // Check if this is an integer or fp induction. If so, build the recipe that 8670 // produces its scalar and vector values. 8671 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8672 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8673 II.getKind() == InductionDescriptor::IK_FpInduction) { 8674 assert(II.getStartValue() == 8675 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8676 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8677 return new VPWidenIntOrFpInductionRecipe( 8678 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8679 } 8680 8681 return nullptr; 8682 } 8683 8684 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8685 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8686 VPlan &Plan) const { 8687 // Optimize the special case where the source is a constant integer 8688 // induction variable. Notice that we can only optimize the 'trunc' case 8689 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8690 // (c) other casts depend on pointer size. 8691 8692 // Determine whether \p K is a truncation based on an induction variable that 8693 // can be optimized. 
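// Illustrative example (assumption, not part of the original comment):
//   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
//   %t  = trunc i64 %iv to i32
// Here %t can be widened directly as a 32-bit induction instead of widening
// %iv to <VF x i64> and truncating each element.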
8694 auto isOptimizableIVTruncate = 8695 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8696 return [=](ElementCount VF) -> bool { 8697 return CM.isOptimizableIVTruncate(K, VF); 8698 }; 8699 }; 8700 8701 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8702 isOptimizableIVTruncate(I), Range)) { 8703 8704 InductionDescriptor II = 8705 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8706 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8707 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8708 Start, nullptr, I); 8709 } 8710 return nullptr; 8711 } 8712 8713 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8714 ArrayRef<VPValue *> Operands, 8715 VPlanPtr &Plan) { 8716 // If all incoming values are equal, the incoming VPValue can be used directly 8717 // instead of creating a new VPBlendRecipe. 8718 VPValue *FirstIncoming = Operands[0]; 8719 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8720 return FirstIncoming == Inc; 8721 })) { 8722 return Operands[0]; 8723 } 8724 8725 // We know that all PHIs in non-header blocks are converted into selects, so 8726 // we don't have to worry about the insertion order and we can just use the 8727 // builder. At this point we generate the predication tree. There may be 8728 // duplications since this is a simple recursive scan, but future 8729 // optimizations will clean it up. 8730 SmallVector<VPValue *, 2> OperandsWithMask; 8731 unsigned NumIncoming = Phi->getNumIncomingValues(); 8732 8733 for (unsigned In = 0; In < NumIncoming; In++) { 8734 VPValue *EdgeMask = 8735 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8736 assert((EdgeMask || NumIncoming == 1) && 8737 "Multiple predecessors with one having a full mask"); 8738 OperandsWithMask.push_back(Operands[In]); 8739 if (EdgeMask) 8740 OperandsWithMask.push_back(EdgeMask); 8741 } 8742 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8743 } 8744 8745 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8746 ArrayRef<VPValue *> Operands, 8747 VFRange &Range) const { 8748 8749 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8750 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8751 Range); 8752 8753 if (IsPredicated) 8754 return nullptr; 8755 8756 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8757 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8758 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8759 ID == Intrinsic::pseudoprobe || 8760 ID == Intrinsic::experimental_noalias_scope_decl)) 8761 return nullptr; 8762 8763 auto willWiden = [&](ElementCount VF) -> bool { 8764 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8765 // The following case may be scalarized depending on the VF. 8766 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8767 // version of the instruction. 8768 // Is it beneficial to perform intrinsic call compared to lib call? 8769 bool NeedToScalarize = false; 8770 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8771 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8772 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8773 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 8774 "Either the intrinsic cost or vector call cost must be valid"); 8775 return UseVectorIntrinsic || !NeedToScalarize; 8776 }; 8777 8778 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8779 return nullptr; 8780 8781 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8782 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8783 } 8784 8785 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8786 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8787 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8788 // Instruction should be widened, unless it is scalar after vectorization, 8789 // scalarization is profitable or it is predicated. 8790 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8791 return CM.isScalarAfterVectorization(I, VF) || 8792 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8793 }; 8794 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8795 Range); 8796 } 8797 8798 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8799 ArrayRef<VPValue *> Operands) const { 8800 auto IsVectorizableOpcode = [](unsigned Opcode) { 8801 switch (Opcode) { 8802 case Instruction::Add: 8803 case Instruction::And: 8804 case Instruction::AShr: 8805 case Instruction::BitCast: 8806 case Instruction::FAdd: 8807 case Instruction::FCmp: 8808 case Instruction::FDiv: 8809 case Instruction::FMul: 8810 case Instruction::FNeg: 8811 case Instruction::FPExt: 8812 case Instruction::FPToSI: 8813 case Instruction::FPToUI: 8814 case Instruction::FPTrunc: 8815 case Instruction::FRem: 8816 case Instruction::FSub: 8817 case Instruction::ICmp: 8818 case Instruction::IntToPtr: 8819 case Instruction::LShr: 8820 case Instruction::Mul: 8821 case Instruction::Or: 8822 case Instruction::PtrToInt: 8823 case Instruction::SDiv: 8824 case Instruction::Select: 8825 case Instruction::SExt: 8826 case Instruction::Shl: 8827 case Instruction::SIToFP: 8828 case Instruction::SRem: 8829 case Instruction::Sub: 8830 case Instruction::Trunc: 8831 case Instruction::UDiv: 8832 case Instruction::UIToFP: 8833 case Instruction::URem: 8834 case Instruction::Xor: 8835 case Instruction::ZExt: 8836 return true; 8837 } 8838 return false; 8839 }; 8840 8841 if (!IsVectorizableOpcode(I->getOpcode())) 8842 return nullptr; 8843 8844 // Success: widen this instruction. 
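// Illustrative example (assumption, not part of the original comment): a
// scalar 'add nsw i32 %a, %b' becomes a single WIDEN recipe that emits one
// <VF x i32> add per unrolled part when the VPlan is executed.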
8845 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8846 } 8847 8848 void VPRecipeBuilder::fixHeaderPhis() { 8849 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8850 for (VPWidenPHIRecipe *R : PhisToFix) { 8851 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8852 VPRecipeBase *IncR = 8853 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8854 R->addOperand(IncR->getVPSingleValue()); 8855 } 8856 } 8857 8858 VPBasicBlock *VPRecipeBuilder::handleReplication( 8859 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8860 VPlanPtr &Plan) { 8861 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8862 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8863 Range); 8864 8865 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8866 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 8867 8868 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8869 IsUniform, IsPredicated); 8870 setRecipe(I, Recipe); 8871 Plan->addVPValue(I, Recipe); 8872 8873 // Find if I uses a predicated instruction. If so, it will use its scalar 8874 // value. Avoid hoisting the insert-element which packs the scalar value into 8875 // a vector value, as that happens iff all users use the vector value. 8876 for (VPValue *Op : Recipe->operands()) { 8877 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8878 if (!PredR) 8879 continue; 8880 auto *RepR = 8881 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8882 assert(RepR->isPredicated() && 8883 "expected Replicate recipe to be predicated"); 8884 RepR->setAlsoPack(false); 8885 } 8886 8887 // Finalize the recipe for Instr, first if it is not predicated. 8888 if (!IsPredicated) { 8889 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8890 VPBB->appendRecipe(Recipe); 8891 return VPBB; 8892 } 8893 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8894 assert(VPBB->getSuccessors().empty() && 8895 "VPBB has successors when handling predicated replication."); 8896 // Record predicated instructions for above packing optimizations. 8897 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8898 VPBlockUtils::insertBlockAfter(Region, VPBB); 8899 auto *RegSucc = new VPBasicBlock(); 8900 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8901 return RegSucc; 8902 } 8903 8904 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8905 VPRecipeBase *PredRecipe, 8906 VPlanPtr &Plan) { 8907 // Instructions marked for predication are replicated and placed under an 8908 // if-then construct to prevent side-effects. 8909 8910 // Generate recipes to compute the block mask for this region. 8911 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8912 8913 // Build the triangular if-then region. 8914 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8915 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8916 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8917 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8918 auto *PHIRecipe = Instr->getType()->isVoidTy() 8919 ? 
nullptr 8920 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8921 if (PHIRecipe) { 8922 Plan->removeVPValueFor(Instr); 8923 Plan->addVPValue(Instr, PHIRecipe); 8924 } 8925 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8926 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8927 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8928 8929 // Note: first set Entry as region entry and then connect successors starting 8930 // from it in order, to propagate the "parent" of each VPBasicBlock. 8931 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8932 VPBlockUtils::connectBlocks(Pred, Exit); 8933 8934 return Region; 8935 } 8936 8937 VPRecipeOrVPValueTy 8938 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8939 ArrayRef<VPValue *> Operands, 8940 VFRange &Range, VPlanPtr &Plan) { 8941 // First, check for specific widening recipes that deal with calls, memory 8942 // operations, inductions and Phi nodes. 8943 if (auto *CI = dyn_cast<CallInst>(Instr)) 8944 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8945 8946 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8947 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8948 8949 VPRecipeBase *Recipe; 8950 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8951 if (Phi->getParent() != OrigLoop->getHeader()) 8952 return tryToBlend(Phi, Operands, Plan); 8953 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8954 return toVPRecipeResult(Recipe); 8955 8956 if (Legal->isReductionVariable(Phi)) { 8957 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8958 assert(RdxDesc.getRecurrenceStartValue() == 8959 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8960 VPValue *StartV = Operands[0]; 8961 8962 auto *PhiRecipe = new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); 8963 PhisToFix.push_back(PhiRecipe); 8964 // Record the incoming value from the backedge, so we can add the incoming 8965 // value from the backedge after all recipes have been created. 8966 recordRecipeOf(cast<Instruction>( 8967 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8968 return toVPRecipeResult(PhiRecipe); 8969 } 8970 8971 return toVPRecipeResult(new VPWidenPHIRecipe(Phi)); 8972 } 8973 8974 if (isa<TruncInst>(Instr) && 8975 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8976 Range, *Plan))) 8977 return toVPRecipeResult(Recipe); 8978 8979 if (!shouldWiden(Instr, Range)) 8980 return nullptr; 8981 8982 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8983 return toVPRecipeResult(new VPWidenGEPRecipe( 8984 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8985 8986 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8987 bool InvariantCond = 8988 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8989 return toVPRecipeResult(new VPWidenSelectRecipe( 8990 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8991 } 8992 8993 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8994 } 8995 8996 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8997 ElementCount MaxVF) { 8998 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8999 9000 // Collect instructions from the original loop that will become trivially dead 9001 // in the vectorized loop. We don't need to vectorize these instructions. 
For
9002 // example, original induction update instructions can become dead because we
9003 // separately emit induction "steps" when generating code for the new loop.
9004 // Similarly, we create a new latch condition when setting up the structure
9005 // of the new loop, so the old one can become dead.
9006 SmallPtrSet<Instruction *, 4> DeadInstructions;
9007 collectTriviallyDeadInstructions(DeadInstructions);
9008
9009 // Add assume instructions we need to drop to DeadInstructions, to prevent
9010 // them from being added to the VPlan.
9011 // TODO: We only need to drop assumes in blocks that get flattened. If the
9012 // control flow is preserved, we should keep them.
9013 auto &ConditionalAssumes = Legal->getConditionalAssumes();
9014 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
9015
9016 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
9017 // Dead instructions do not need sinking. Remove them from SinkAfter.
9018 for (Instruction *I : DeadInstructions)
9019 SinkAfter.erase(I);
9020
9021 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
9022 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
9023 VFRange SubRange = {VF, MaxVFPlusOne};
9024 VPlans.push_back(
9025 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
9026 VF = SubRange.End;
9027 }
9028 }
9029
9030 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
9031 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
9032 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
9033
9034 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9035
9036 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
9037
9038 // ---------------------------------------------------------------------------
9039 // Pre-construction: record ingredients whose recipes we'll need to further
9040 // process after constructing the initial VPlan.
9041 // ---------------------------------------------------------------------------
9042
9043 // Mark instructions we'll need to sink later and their targets as
9044 // ingredients whose recipe we'll need to record.
9045 for (auto &Entry : SinkAfter) {
9046 RecipeBuilder.recordRecipeOf(Entry.first);
9047 RecipeBuilder.recordRecipeOf(Entry.second);
9048 }
9049 for (auto &Reduction : CM.getInLoopReductionChains()) {
9050 PHINode *Phi = Reduction.first;
9051 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
9052 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9053
9054 RecipeBuilder.recordRecipeOf(Phi);
9055 for (auto &R : ReductionOperations) {
9056 RecipeBuilder.recordRecipeOf(R);
9057 // For min/max reductions, where we have a pair of icmp/select, we also
9058 // need to record the ICmp recipe, so it can be removed later.
9059 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9060 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9061 }
9062 }
9063
9064 // For each interleave group which is relevant for this (possibly trimmed)
9065 // Range, add it to the set of groups to be later applied to the VPlan and add
9066 // placeholders for its members' Recipes which we'll be replacing with a
9067 // single VPInterleaveRecipe.
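// Illustrative example (assumption, not part of the original comment): for a
// factor-2 load group accessing A[2*i] and A[2*i+1], both member loads are
// recorded here so their individual recipes can later be replaced by a single
// VPInterleaveRecipe.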
9068 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9069 auto applyIG = [IG, this](ElementCount VF) -> bool {
9070 return (VF.isVector() && // Query is illegal for VF == 1
9071 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9072 LoopVectorizationCostModel::CM_Interleave);
9073 };
9074 if (!getDecisionAndClampRange(applyIG, Range))
9075 continue;
9076 InterleaveGroups.insert(IG);
9077 for (unsigned i = 0; i < IG->getFactor(); i++)
9078 if (Instruction *Member = IG->getMember(i))
9079 RecipeBuilder.recordRecipeOf(Member);
9080 }
9081
9082 // ---------------------------------------------------------------------------
9083 // Build initial VPlan: Scan the body of the loop in a topological order to
9084 // visit each basic block after having visited its predecessor basic blocks.
9085 // ---------------------------------------------------------------------------
9086
9087 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
9088 auto Plan = std::make_unique<VPlan>();
9089 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
9090 Plan->setEntry(VPBB);
9091
9092 // Scan the body of the loop in a topological order to visit each basic block
9093 // after having visited its predecessor basic blocks.
9094 LoopBlocksDFS DFS(OrigLoop);
9095 DFS.perform(LI);
9096
9097 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9098 // Relevant instructions from basic block BB will be grouped into VPRecipe
9099 // ingredients and fill a new VPBasicBlock.
9100 unsigned VPBBsForBB = 0;
9101 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9102 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9103 VPBB = FirstVPBBForBB;
9104 Builder.setInsertPoint(VPBB);
9105
9106 // Introduce each ingredient into VPlan.
9107 // TODO: Model and preserve debug intrinsics in VPlan.
9108 for (Instruction &I : BB->instructionsWithoutDebug()) {
9109 Instruction *Instr = &I;
9110
9111 // First filter out irrelevant instructions, to ensure no recipes are
9112 // built for them.
9113 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9114 continue;
9115
9116 SmallVector<VPValue *, 4> Operands;
9117 auto *Phi = dyn_cast<PHINode>(Instr);
9118 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9119 Operands.push_back(Plan->getOrAddVPValue(
9120 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9121 } else {
9122 auto OpRange = Plan->mapToVPValues(Instr->operands());
9123 Operands = {OpRange.begin(), OpRange.end()};
9124 }
9125 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9126 Instr, Operands, Range, Plan)) {
9127 // If Instr can be simplified to an existing VPValue, use it.
9128 if (RecipeOrValue.is<VPValue *>()) {
9129 auto *VPV = RecipeOrValue.get<VPValue *>();
9130 Plan->addVPValue(Instr, VPV);
9131 // If the re-used value is a recipe, register the recipe for the
9132 // instruction, in case the recipe for Instr needs to be recorded.
9133 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9134 RecipeBuilder.setRecipe(Instr, R);
9135 continue;
9136 }
9137 // Otherwise, add the new recipe.
9138 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9139 for (auto *Def : Recipe->definedValues()) {
9140 auto *UV = Def->getUnderlyingValue();
9141 Plan->addVPValue(UV, Def);
9142 }
9143
9144 RecipeBuilder.setRecipe(Instr, Recipe);
9145 VPBB->appendRecipe(Recipe);
9146 continue;
9147 }
9148
9149 // Otherwise, if all widening options failed, Instruction is to be
9150 // replicated. This may create a successor for VPBB.
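// Illustrative sketch (assumption, not part of the original comment): a
// predicated replicated instruction is placed inside a pred.<opcode> region
// (with .entry, .if and .continue blocks), so handleReplication returns a
// fresh successor block that becomes the new VPBB.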
9151 VPBasicBlock *NextVPBB = 9152 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9153 if (NextVPBB != VPBB) { 9154 VPBB = NextVPBB; 9155 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9156 : ""); 9157 } 9158 } 9159 } 9160 9161 RecipeBuilder.fixHeaderPhis(); 9162 9163 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9164 // may also be empty, such as the last one VPBB, reflecting original 9165 // basic-blocks with no recipes. 9166 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9167 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9168 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9169 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9170 delete PreEntry; 9171 9172 // --------------------------------------------------------------------------- 9173 // Transform initial VPlan: Apply previously taken decisions, in order, to 9174 // bring the VPlan to its final state. 9175 // --------------------------------------------------------------------------- 9176 9177 // Apply Sink-After legal constraints. 9178 for (auto &Entry : SinkAfter) { 9179 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9180 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9181 9182 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9183 auto *Region = 9184 dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9185 if (Region && Region->isReplicator()) 9186 return Region; 9187 return nullptr; 9188 }; 9189 9190 // If the target is in a replication region, make sure to move Sink to the 9191 // block after it, not into the replication region itself. 9192 if (auto *TargetRegion = GetReplicateRegion(Target)) { 9193 assert(TargetRegion->getNumSuccessors() == 1 && "Expected SESE region!"); 9194 assert(!GetReplicateRegion(Sink) && 9195 "cannot sink a region into another region yet"); 9196 VPBasicBlock *NextBlock = 9197 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9198 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9199 continue; 9200 } 9201 9202 auto *SinkRegion = GetReplicateRegion(Sink); 9203 // Unless the sink source is in a replicate region, sink the recipe 9204 // directly. 9205 if (!SinkRegion) { 9206 Sink->moveAfter(Target); 9207 continue; 9208 } 9209 9210 // If the sink source is in a replicate region, we need to move the whole 9211 // replicate region, which should only contain a single recipe in the main 9212 // block. 9213 assert(Sink->getParent()->size() == 1 && 9214 "parent must be a replicator with a single recipe"); 9215 auto *SplitBlock = 9216 Target->getParent()->splitAt(std::next(Target->getIterator())); 9217 9218 auto *Pred = SinkRegion->getSinglePredecessor(); 9219 auto *Succ = SinkRegion->getSingleSuccessor(); 9220 VPBlockUtils::disconnectBlocks(Pred, SinkRegion); 9221 VPBlockUtils::disconnectBlocks(SinkRegion, Succ); 9222 VPBlockUtils::connectBlocks(Pred, Succ); 9223 9224 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9225 9226 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9227 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9228 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9229 if (VPBB == SplitPred) 9230 VPBB = SplitBlock; 9231 } 9232 9233 // Interleave memory: for each Interleave Group we marked earlier as relevant 9234 // for this VPlan, replace the Recipes widening its memory instructions with a 9235 // single VPInterleaveRecipe at its insertion point. 
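// Illustrative example (assumption, not part of the original comment): for a
// factor-2 load group {A[2*i], A[2*i+1]}, the members' widened-memory recipes
// are erased and the single VPInterleaveRecipe inserted at the group's insert
// position defines one VPValue per non-void member.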
9236 for (auto IG : InterleaveGroups) { 9237 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9238 RecipeBuilder.getRecipe(IG->getInsertPos())); 9239 SmallVector<VPValue *, 4> StoredValues; 9240 for (unsigned i = 0; i < IG->getFactor(); ++i) 9241 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 9242 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 9243 9244 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9245 Recipe->getMask()); 9246 VPIG->insertBefore(Recipe); 9247 unsigned J = 0; 9248 for (unsigned i = 0; i < IG->getFactor(); ++i) 9249 if (Instruction *Member = IG->getMember(i)) { 9250 if (!Member->getType()->isVoidTy()) { 9251 VPValue *OriginalV = Plan->getVPValue(Member); 9252 Plan->removeVPValueFor(Member); 9253 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9254 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9255 J++; 9256 } 9257 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9258 } 9259 } 9260 9261 // Adjust the recipes for any inloop reductions. 9262 if (Range.Start.isVector()) 9263 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 9264 9265 // Finally, if tail is folded by masking, introduce selects between the phi 9266 // and the live-out instruction of each reduction, at the end of the latch. 9267 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 9268 Builder.setInsertPoint(VPBB); 9269 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9270 for (auto &Reduction : Legal->getReductionVars()) { 9271 if (CM.isInLoopReduction(Reduction.first)) 9272 continue; 9273 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 9274 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 9275 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 9276 } 9277 } 9278 9279 std::string PlanName; 9280 raw_string_ostream RSO(PlanName); 9281 ElementCount VF = Range.Start; 9282 Plan->addVF(VF); 9283 RSO << "Initial VPlan for VF={" << VF; 9284 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9285 Plan->addVF(VF); 9286 RSO << "," << VF; 9287 } 9288 RSO << "},UF>=1"; 9289 RSO.flush(); 9290 Plan->setName(PlanName); 9291 9292 return Plan; 9293 } 9294 9295 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9296 // Outer loop handling: They may require CFG and instruction level 9297 // transformations before even evaluating whether vectorization is profitable. 9298 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9299 // the vectorization pipeline. 9300 assert(!OrigLoop->isInnermost()); 9301 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9302 9303 // Create new empty VPlan 9304 auto Plan = std::make_unique<VPlan>(); 9305 9306 // Build hierarchical CFG 9307 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9308 HCFGBuilder.buildHierarchicalCFG(); 9309 9310 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9311 VF *= 2) 9312 Plan->addVF(VF); 9313 9314 if (EnableVPlanPredication) { 9315 VPlanPredicator VPP(*Plan); 9316 VPP.predicate(); 9317 9318 // Avoid running transformation to recipes until masked code generation in 9319 // VPlan-native path is in place. 9320 return Plan; 9321 } 9322 9323 SmallPtrSet<Instruction *, 1> DeadInstructions; 9324 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9325 Legal->getInductionVars(), 9326 DeadInstructions, *PSE.getSE()); 9327 return Plan; 9328 } 9329 9330 // Adjust the recipes for any inloop reductions. 
The chain of instructions
9331 // leading from the loop exit instr to the phi needs to be converted to
9332 // reductions, with one operand being vector and the other being the scalar
9333 // reduction chain.
9334 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
9335 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
9336 for (auto &Reduction : CM.getInLoopReductionChains()) {
9337 PHINode *Phi = Reduction.first;
9338 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9339 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9340
9341 // ReductionOperations are ordered top-down from the phi's use to the
9342 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9343 // which of the two operands will remain scalar and which will be reduced.
9344 // For minmax the chain will be the select instructions.
9345 Instruction *Chain = Phi;
9346 for (Instruction *R : ReductionOperations) {
9347 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9348 RecurKind Kind = RdxDesc.getRecurrenceKind();
9349
9350 VPValue *ChainOp = Plan->getVPValue(Chain);
9351 unsigned FirstOpId;
9352 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9353 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9354 "Expected to replace a VPWidenSelectSC");
9355 FirstOpId = 1;
9356 } else {
9357 assert(isa<VPWidenRecipe>(WidenRecipe) &&
9358 "Expected to replace a VPWidenSC");
9359 FirstOpId = 0;
9360 }
9361 unsigned VecOpId =
9362 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9363 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9364
9365 auto *CondOp = CM.foldTailByMasking()
9366 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9367 : nullptr;
9368 VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9369 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9370 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9371 Plan->removeVPValueFor(R);
9372 Plan->addVPValue(R, RedRecipe);
9373 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9374 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9375 WidenRecipe->eraseFromParent();
9376
9377 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9378 VPRecipeBase *CompareRecipe =
9379 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9380 assert(isa<VPWidenRecipe>(CompareRecipe) &&
9381 "Expected to replace a VPWidenSC");
9382 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9383 "Expected no remaining users");
9384 CompareRecipe->eraseFromParent();
9385 }
9386 Chain = R;
9387 }
9388 }
9389 }
9390
9391 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9392 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9393 VPSlotTracker &SlotTracker) const {
9394 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9395 IG->getInsertPos()->printAsOperand(O, false);
9396 O << ", ";
9397 getAddr()->printAsOperand(O, SlotTracker);
9398 VPValue *Mask = getMask();
9399 if (Mask) {
9400 O << ", ";
9401 Mask->printAsOperand(O, SlotTracker);
9402 }
9403 for (unsigned i = 0; i < IG->getFactor(); ++i)
9404 if (Instruction *I = IG->getMember(i))
9405 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i;
9406 }
9407 #endif
9408
9409 void VPWidenCallRecipe::execute(VPTransformState &State) {
9410 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9411 *this, State);
9412 }
9413
9414 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9415
State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9416 this, *this, InvariantCond, State); 9417 } 9418 9419 void VPWidenRecipe::execute(VPTransformState &State) { 9420 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9421 } 9422 9423 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9424 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9425 *this, State.UF, State.VF, IsPtrLoopInvariant, 9426 IsIndexLoopInvariant, State); 9427 } 9428 9429 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9430 assert(!State.Instance && "Int or FP induction being replicated."); 9431 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9432 getTruncInst(), getVPValue(0), 9433 getCastValue(), State); 9434 } 9435 9436 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9437 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9438 this, State); 9439 } 9440 9441 void VPBlendRecipe::execute(VPTransformState &State) { 9442 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9443 // We know that all PHIs in non-header blocks are converted into 9444 // selects, so we don't have to worry about the insertion order and we 9445 // can just use the builder. 9446 // At this point we generate the predication tree. There may be 9447 // duplications since this is a simple recursive scan, but future 9448 // optimizations will clean it up. 9449 9450 unsigned NumIncoming = getNumIncomingValues(); 9451 9452 // Generate a sequence of selects of the form: 9453 // SELECT(Mask3, In3, 9454 // SELECT(Mask2, In2, 9455 // SELECT(Mask1, In1, 9456 // In0))) 9457 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9458 // are essentially undef are taken from In0. 9459 InnerLoopVectorizer::VectorParts Entry(State.UF); 9460 for (unsigned In = 0; In < NumIncoming; ++In) { 9461 for (unsigned Part = 0; Part < State.UF; ++Part) { 9462 // We might have single edge PHIs (blocks) - use an identity 9463 // 'select' for the first PHI operand. 9464 Value *In0 = State.get(getIncomingValue(In), Part); 9465 if (In == 0) 9466 Entry[Part] = In0; // Initialize with the first incoming value. 9467 else { 9468 // Select between the current value and the previous incoming edge 9469 // based on the incoming mask. 
9470 Value *Cond = State.get(getMask(In), Part);
9471 Entry[Part] =
9472 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9473 }
9474 }
9475 }
9476 for (unsigned Part = 0; Part < State.UF; ++Part)
9477 State.set(this, Entry[Part], Part);
9478 }
9479
9480 void VPInterleaveRecipe::execute(VPTransformState &State) {
9481 assert(!State.Instance && "Interleave group being replicated.");
9482 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9483 getStoredValues(), getMask());
9484 }
9485
9486 void VPReductionRecipe::execute(VPTransformState &State) {
9487 assert(!State.Instance && "Reduction being replicated.");
9488 Value *PrevInChain = State.get(getChainOp(), 0);
9489 for (unsigned Part = 0; Part < State.UF; ++Part) {
9490 RecurKind Kind = RdxDesc->getRecurrenceKind();
9491 bool IsOrdered = useOrderedReductions(*RdxDesc);
9492 Value *NewVecOp = State.get(getVecOp(), Part);
9493 if (VPValue *Cond = getCondOp()) {
9494 Value *NewCond = State.get(Cond, Part);
9495 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9496 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
9497 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9498 Constant *IdenVec =
9499 ConstantVector::getSplat(VecTy->getElementCount(), Iden);
9500 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9501 NewVecOp = Select;
9502 }
9503 Value *NewRed;
9504 Value *NextInChain;
9505 if (IsOrdered) {
9506 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9507 PrevInChain);
9508 PrevInChain = NewRed;
9509 } else {
9510 PrevInChain = State.get(getChainOp(), Part);
9511 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9512 }
9513 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9514 NextInChain =
9515 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9516 NewRed, PrevInChain);
9517 } else if (IsOrdered)
9518 NextInChain = NewRed;
9519 else {
9520 NextInChain = State.Builder.CreateBinOp(
9521 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9522 PrevInChain);
9523 }
9524 State.set(this, NextInChain, Part);
9525 }
9526 }
9527
9528 void VPReplicateRecipe::execute(VPTransformState &State) {
9529 if (State.Instance) { // Generate a single instance.
9530 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9531 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9532 *State.Instance, IsPredicated, State);
9533 // Insert scalar instance packing it into a vector.
9534 if (AlsoPack && State.VF.isVector()) {
9535 // If we're constructing lane 0, initialize to start from poison.
9536 if (State.Instance->Lane.isFirstLane()) {
9537 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9538 Value *Poison = PoisonValue::get(
9539 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9540 State.set(this, Poison, State.Instance->Part);
9541 }
9542 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9543 }
9544 return;
9545 }
9546
9547 // Generate scalar instances for all VF lanes of all UF parts, unless the
9548 // instruction is uniform, in which case generate only the first lane for each
9549 // of the UF parts.
9550 unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9551 assert((!State.VF.isScalable() || IsUniform) && 9552 "Can't scalarize a scalable vector"); 9553 for (unsigned Part = 0; Part < State.UF; ++Part) 9554 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9555 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9556 VPIteration(Part, Lane), IsPredicated, 9557 State); 9558 } 9559 9560 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9561 assert(State.Instance && "Branch on Mask works only on single instance."); 9562 9563 unsigned Part = State.Instance->Part; 9564 unsigned Lane = State.Instance->Lane.getKnownLane(); 9565 9566 Value *ConditionBit = nullptr; 9567 VPValue *BlockInMask = getMask(); 9568 if (BlockInMask) { 9569 ConditionBit = State.get(BlockInMask, Part); 9570 if (ConditionBit->getType()->isVectorTy()) 9571 ConditionBit = State.Builder.CreateExtractElement( 9572 ConditionBit, State.Builder.getInt32(Lane)); 9573 } else // Block in mask is all-one. 9574 ConditionBit = State.Builder.getTrue(); 9575 9576 // Replace the temporary unreachable terminator with a new conditional branch, 9577 // whose two destinations will be set later when they are created. 9578 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9579 assert(isa<UnreachableInst>(CurrentTerminator) && 9580 "Expected to replace unreachable terminator with conditional branch."); 9581 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9582 CondBr->setSuccessor(0, nullptr); 9583 ReplaceInstWithInst(CurrentTerminator, CondBr); 9584 } 9585 9586 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9587 assert(State.Instance && "Predicated instruction PHI works per instance."); 9588 Instruction *ScalarPredInst = 9589 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9590 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9591 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9592 assert(PredicatingBB && "Predicated block has no single predecessor."); 9593 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9594 "operand must be VPReplicateRecipe"); 9595 9596 // By current pack/unpack logic we need to generate only a single phi node: if 9597 // a vector value for the predicated instruction exists at this point it means 9598 // the instruction has vector users only, and a phi for the vector value is 9599 // needed. In this case the recipe of the predicated instruction is marked to 9600 // also do that packing, thereby "hoisting" the insert-element sequence. 9601 // Otherwise, a phi node for the scalar value is needed. 9602 unsigned Part = State.Instance->Part; 9603 if (State.hasVectorValue(getOperand(0), Part)) { 9604 Value *VectorValue = State.get(getOperand(0), Part); 9605 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9606 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9607 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9608 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9609 if (State.hasVectorValue(this, Part)) 9610 State.reset(this, VPhi, Part); 9611 else 9612 State.set(this, VPhi, Part); 9613 // NOTE: Currently we need to update the value of the operand, so the next 9614 // predicated iteration inserts its generated value in the correct vector. 
9615 State.reset(getOperand(0), VPhi, Part); 9616 } else { 9617 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9618 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9619 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9620 PredicatingBB); 9621 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9622 if (State.hasScalarValue(this, *State.Instance)) 9623 State.reset(this, Phi, *State.Instance); 9624 else 9625 State.set(this, Phi, *State.Instance); 9626 // NOTE: Currently we need to update the value of the operand, so the next 9627 // predicated iteration inserts its generated value in the correct vector. 9628 State.reset(getOperand(0), Phi, *State.Instance); 9629 } 9630 } 9631 9632 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9633 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9634 State.ILV->vectorizeMemoryInstruction( 9635 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9636 StoredValue, getMask()); 9637 } 9638 9639 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9640 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9641 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9642 // for predication. 9643 static ScalarEpilogueLowering getScalarEpilogueLowering( 9644 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9645 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9646 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9647 LoopVectorizationLegality &LVL) { 9648 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9649 // don't look at hints or options, and don't request a scalar epilogue. 9650 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9651 // LoopAccessInfo (due to code dependency and not being able to reliably get 9652 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9653 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9654 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9655 // back to the old way and vectorize with versioning when forced. See D81345.) 9656 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9657 PGSOQueryType::IRPass) && 9658 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9659 return CM_ScalarEpilogueNotAllowedOptSize; 9660 9661 // 2) If set, obey the directives 9662 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9663 switch (PreferPredicateOverEpilogue) { 9664 case PreferPredicateTy::ScalarEpilogue: 9665 return CM_ScalarEpilogueAllowed; 9666 case PreferPredicateTy::PredicateElseScalarEpilogue: 9667 return CM_ScalarEpilogueNotNeededUsePredicate; 9668 case PreferPredicateTy::PredicateOrDontVectorize: 9669 return CM_ScalarEpilogueNotAllowedUsePredicate; 9670 }; 9671 } 9672 9673 // 3) If set, obey the hints 9674 switch (Hints.getPredicate()) { 9675 case LoopVectorizeHints::FK_Enabled: 9676 return CM_ScalarEpilogueNotNeededUsePredicate; 9677 case LoopVectorizeHints::FK_Disabled: 9678 return CM_ScalarEpilogueAllowed; 9679 }; 9680 9681 // 4) if the TTI hook indicates this is profitable, request predication. 
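  //    (Typically this is how targets with hardware support for predicated
  //    vector loops, e.g. ARM MVE tail-predication, request that the remainder
  //    iterations be folded into the main loop under a mask instead of
  //    emitting a scalar epilogue.)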
9682 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9683 LVL.getLAI())) 9684 return CM_ScalarEpilogueNotNeededUsePredicate; 9685 9686 return CM_ScalarEpilogueAllowed; 9687 } 9688 9689 Value *VPTransformState::get(VPValue *Def, unsigned Part) { 9690 // If Values have been set for this Def return the one relevant for \p Part. 9691 if (hasVectorValue(Def, Part)) 9692 return Data.PerPartOutput[Def][Part]; 9693 9694 if (!hasScalarValue(Def, {Part, 0})) { 9695 Value *IRV = Def->getLiveInIRValue(); 9696 Value *B = ILV->getBroadcastInstrs(IRV); 9697 set(Def, B, Part); 9698 return B; 9699 } 9700 9701 Value *ScalarValue = get(Def, {Part, 0}); 9702 // If we aren't vectorizing, we can just copy the scalar map values over 9703 // to the vector map. 9704 if (VF.isScalar()) { 9705 set(Def, ScalarValue, Part); 9706 return ScalarValue; 9707 } 9708 9709 auto *RepR = dyn_cast<VPReplicateRecipe>(Def); 9710 bool IsUniform = RepR && RepR->isUniform(); 9711 9712 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1; 9713 // Check if there is a scalar value for the selected lane. 9714 if (!hasScalarValue(Def, {Part, LastLane})) { 9715 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform. 9716 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) && 9717 "unexpected recipe found to be invariant"); 9718 IsUniform = true; 9719 LastLane = 0; 9720 } 9721 9722 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane})); 9723 9724 // Set the insert point after the last scalarized instruction. This 9725 // ensures the insertelement sequence will directly follow the scalar 9726 // definitions. 9727 auto OldIP = Builder.saveIP(); 9728 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 9729 Builder.SetInsertPoint(&*NewIP); 9730 9731 // However, if we are vectorizing, we need to construct the vector values. 9732 // If the value is known to be uniform after vectorization, we can just 9733 // broadcast the scalar value corresponding to lane zero for each unroll 9734 // iteration. Otherwise, we construct the vector values using 9735 // insertelement instructions. Since the resulting vectors are stored in 9736 // State, we will only generate the insertelements once. 9737 Value *VectorValue = nullptr; 9738 if (IsUniform) { 9739 VectorValue = ILV->getBroadcastInstrs(ScalarValue); 9740 set(Def, VectorValue, Part); 9741 } else { 9742 // Initialize packing with insertelements to start from undef. 9743 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 9744 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); 9745 set(Def, Undef, Part); 9746 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 9747 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this); 9748 VectorValue = get(Def, Part); 9749 } 9750 Builder.restoreIP(OldIP); 9751 return VectorValue; 9752 } 9753 9754 // Process the loop in the VPlan-native vectorization path. This path builds 9755 // VPlan upfront in the vectorization pipeline, which allows to apply 9756 // VPlan-to-VPlan transformations from the very beginning without modifying the 9757 // input LLVM IR. 
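//
// As a rough illustration, a candidate handed to this path is typically an
// explicitly annotated outer loop such as
//
//   for (i = 0; i < N; ++i)      // outer loop, processed as a whole
//     for (j = 0; j < M; ++j)
//       A[i][j] = B[i][j] + C[i];
//
// whereas the rest of this pass only ever considers innermost loops.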
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point
// extensions, there will be a performance penalty from the conversion overhead
// and the change in the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
9833 SmallPtrSet<const Instruction *, 4> Visited; 9834 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9835 while (!Worklist.empty()) { 9836 auto *I = Worklist.pop_back_val(); 9837 if (!L->contains(I)) 9838 continue; 9839 if (!Visited.insert(I).second) 9840 continue; 9841 9842 // Emit a remark if the floating point store required a floating 9843 // point conversion. 9844 // TODO: More work could be done to identify the root cause such as a 9845 // constant or a function return type and point the user to it. 9846 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9847 ORE->emit([&]() { 9848 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9849 I->getDebugLoc(), L->getHeader()) 9850 << "floating point conversion changes vector width. " 9851 << "Mixed floating point precision requires an up/down " 9852 << "cast that will negatively impact performance."; 9853 }); 9854 9855 for (Use &Op : I->operands()) 9856 if (auto *OpI = dyn_cast<Instruction>(Op)) 9857 Worklist.push_back(OpI); 9858 } 9859 } 9860 9861 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9862 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9863 !EnableLoopInterleaving), 9864 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9865 !EnableLoopVectorization) {} 9866 9867 bool LoopVectorizePass::processLoop(Loop *L) { 9868 assert((EnableVPlanNativePath || L->isInnermost()) && 9869 "VPlan-native path is not enabled. Only process inner loops."); 9870 9871 #ifndef NDEBUG 9872 const std::string DebugLocStr = getDebugLocString(L); 9873 #endif /* NDEBUG */ 9874 9875 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9876 << L->getHeader()->getParent()->getName() << "\" from " 9877 << DebugLocStr << "\n"); 9878 9879 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9880 9881 LLVM_DEBUG( 9882 dbgs() << "LV: Loop hints:" 9883 << " force=" 9884 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9885 ? "disabled" 9886 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9887 ? "enabled" 9888 : "?")) 9889 << " width=" << Hints.getWidth() 9890 << " interleave=" << Hints.getInterleave() << "\n"); 9891 9892 // Function containing loop 9893 Function *F = L->getHeader()->getParent(); 9894 9895 // Looking at the diagnostic output is the only way to determine if a loop 9896 // was vectorized (other than looking at the IR or machine code), so it 9897 // is important to generate an optimization remark for each loop. Most of 9898 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9899 // generated as OptimizationRemark and OptimizationRemarkMissed are 9900 // less verbose reporting vectorized loops and unvectorized loops that may 9901 // benefit from vectorization, respectively. 9902 9903 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9904 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9905 return false; 9906 } 9907 9908 PredicatedScalarEvolution PSE(*SE, *L); 9909 9910 // Check if it is legal to vectorize the loop. 9911 LoopVectorizationRequirements Requirements; 9912 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9913 &Requirements, &Hints, DB, AC, BFI, PSI); 9914 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9915 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9916 Hints.emitRemarkWithHints(); 9917 return false; 9918 } 9919 9920 // Check the function attributes and profiles to find out if this function 9921 // should be optimized for size. 
9922 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9923 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9924 9925 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9926 // here. They may require CFG and instruction level transformations before 9927 // even evaluating whether vectorization is profitable. Since we cannot modify 9928 // the incoming IR, we need to build VPlan upfront in the vectorization 9929 // pipeline. 9930 if (!L->isInnermost()) 9931 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9932 ORE, BFI, PSI, Hints, Requirements); 9933 9934 assert(L->isInnermost() && "Inner loop expected."); 9935 9936 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9937 // count by optimizing for size, to minimize overheads. 9938 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9939 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9940 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9941 << "This loop is worth vectorizing only if no scalar " 9942 << "iteration overheads are incurred."); 9943 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9944 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9945 else { 9946 LLVM_DEBUG(dbgs() << "\n"); 9947 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9948 } 9949 } 9950 9951 // Check the function attributes to see if implicit floats are allowed. 9952 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9953 // an integer loop and the vector instructions selected are purely integer 9954 // vector instructions? 9955 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9956 reportVectorizationFailure( 9957 "Can't vectorize when the NoImplicitFloat attribute is used", 9958 "loop not vectorized due to NoImplicitFloat attribute", 9959 "NoImplicitFloat", ORE, L); 9960 Hints.emitRemarkWithHints(); 9961 return false; 9962 } 9963 9964 // Check if the target supports potentially unsafe FP vectorization. 9965 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9966 // for the target we're vectorizing for, to make sure none of the 9967 // additional fp-math flags can help. 9968 if (Hints.isPotentiallyUnsafe() && 9969 TTI->isFPVectorizationPotentiallyUnsafe()) { 9970 reportVectorizationFailure( 9971 "Potentially unsafe FP op prevents vectorization", 9972 "loop not vectorized due to unsafe FP support.", 9973 "UnsafeFP", ORE, L); 9974 Hints.emitRemarkWithHints(); 9975 return false; 9976 } 9977 9978 if (!Requirements.canVectorizeFPMath(Hints)) { 9979 ORE->emit([&]() { 9980 auto *ExactFPMathInst = Requirements.getExactFPInst(); 9981 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 9982 ExactFPMathInst->getDebugLoc(), 9983 ExactFPMathInst->getParent()) 9984 << "loop not vectorized: cannot prove it is safe to reorder " 9985 "floating-point operations"; 9986 }); 9987 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 9988 "reorder floating-point operations\n"); 9989 Hints.emitRemarkWithHints(); 9990 return false; 9991 } 9992 9993 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9994 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9995 9996 // If an override option has been passed in for interleaved accesses, use it. 9997 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9998 UseInterleaved = EnableInterleavedMemAccesses; 9999 10000 // Analyze interleaved memory accesses. 
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
10256 for (auto &L : *LI) 10257 Changed |= CFGChanged |= 10258 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10259 10260 // Build up a worklist of inner-loops to vectorize. This is necessary as 10261 // the act of vectorizing or partially unrolling a loop creates new loops 10262 // and can invalidate iterators across the loops. 10263 SmallVector<Loop *, 8> Worklist; 10264 10265 for (Loop *L : *LI) 10266 collectSupportedLoops(*L, LI, ORE, Worklist); 10267 10268 LoopsAnalyzed += Worklist.size(); 10269 10270 // Now walk the identified inner loops. 10271 while (!Worklist.empty()) { 10272 Loop *L = Worklist.pop_back_val(); 10273 10274 // For the inner loops we actually process, form LCSSA to simplify the 10275 // transform. 10276 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10277 10278 Changed |= CFGChanged |= processLoop(L); 10279 } 10280 10281 // Process each loop nest in the function. 10282 return LoopVectorizeResult(Changed, CFGChanged); 10283 } 10284 10285 PreservedAnalyses LoopVectorizePass::run(Function &F, 10286 FunctionAnalysisManager &AM) { 10287 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10288 auto &LI = AM.getResult<LoopAnalysis>(F); 10289 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10290 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10291 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10292 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10293 auto &AA = AM.getResult<AAManager>(F); 10294 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10295 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10296 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10297 MemorySSA *MSSA = EnableMSSALoopDependency 10298 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 10299 : nullptr; 10300 10301 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10302 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10303 [&](Loop &L) -> const LoopAccessInfo & { 10304 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10305 TLI, TTI, nullptr, MSSA}; 10306 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10307 }; 10308 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10309 ProfileSummaryInfo *PSI = 10310 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10311 LoopVectorizeResult Result = 10312 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10313 if (!Result.MadeAnyChange) 10314 return PreservedAnalyses::all(); 10315 PreservedAnalyses PA; 10316 10317 // We currently do not preserve loopinfo/dominator analyses with outer loop 10318 // vectorization. Until this is addressed, mark these analyses as preserved 10319 // only for non-VPlan-native path. 10320 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10321 if (!EnableVPlanNativePath) { 10322 PA.preserve<LoopAnalysis>(); 10323 PA.preserve<DominatorTreeAnalysis>(); 10324 } 10325 PA.preserve<BasicAA>(); 10326 PA.preserve<GlobalsAA>(); 10327 if (!Result.MadeCFGChange) 10328 PA.preserveSet<CFGAnalyses>(); 10329 return PA; 10330 } 10331