//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
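//
// As an illustration only (the pass operates on and emits LLVM-IR, not C), a
// scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten for a vectorization factor of 4 into a loop whose
// single 'wide' iteration processes four consecutive elements, plus a scalar
// epilogue loop handling the leftover iterations when n is not a multiple
// of 4:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3]; // one wide iteration
//   for (; i < n; ++i)                   // scalar remainder (epilogue)
//     a[i] = b[i] + c[i];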
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include "llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/MemorySSA.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include 
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
If tail-folding fails, there are 210 // different fallback strategies depending on these values: 211 namespace PreferPredicateTy { 212 enum Option { 213 ScalarEpilogue = 0, 214 PredicateElseScalarEpilogue, 215 PredicateOrDontVectorize 216 }; 217 } // namespace PreferPredicateTy 218 219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 220 "prefer-predicate-over-epilogue", 221 cl::init(PreferPredicateTy::ScalarEpilogue), 222 cl::Hidden, 223 cl::desc("Tail-folding and predication preferences over creating a scalar " 224 "epilogue loop."), 225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 226 "scalar-epilogue", 227 "Don't tail-predicate loops, create scalar epilogue"), 228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 229 "predicate-else-scalar-epilogue", 230 "prefer tail-folding, create scalar epilogue if tail " 231 "folding fails."), 232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 233 "predicate-dont-vectorize", 234 "prefers tail-folding, don't attempt vectorization if " 235 "tail-folding fails."))); 236 237 static cl::opt<bool> MaximizeBandwidth( 238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 239 cl::desc("Maximize bandwidth when selecting vectorization factor which " 240 "will be determined by the smallest type in loop.")); 241 242 static cl::opt<bool> EnableInterleavedMemAccesses( 243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 244 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 245 246 /// An interleave-group may need masking if it resides in a block that needs 247 /// predication, or in order to mask away gaps. 248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 251 252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 254 cl::desc("We don't interleave loops with a estimated constant trip count " 255 "below this number")); 256 257 static cl::opt<unsigned> ForceTargetNumScalarRegs( 258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 259 cl::desc("A flag that overrides the target's number of scalar registers.")); 260 261 static cl::opt<unsigned> ForceTargetNumVectorRegs( 262 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 263 cl::desc("A flag that overrides the target's number of vector registers.")); 264 265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 267 cl::desc("A flag that overrides the target's max interleave factor for " 268 "scalar loops.")); 269 270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 272 cl::desc("A flag that overrides the target's max interleave factor for " 273 "vectorized loops.")); 274 275 static cl::opt<unsigned> ForceTargetInstructionCost( 276 "force-target-instruction-cost", cl::init(0), cl::Hidden, 277 cl::desc("A flag that overrides the target's expected cost for " 278 "an instruction to a single constant value. 
Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 311 static cl::opt<unsigned> NumberOfStoresToPredicate( 312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 313 cl::desc("Max number of stores to be predicated behind an if.")); 314 315 static cl::opt<bool> EnableIndVarRegisterHeur( 316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 317 cl::desc("Count the induction variable only once when interleaving")); 318 319 static cl::opt<bool> EnableCondStoresVectorization( 320 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 321 cl::desc("Enable if predication of stores during vectorization.")); 322 323 static cl::opt<unsigned> MaxNestedScalarReductionIC( 324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 325 cl::desc("The maximum interleave count to use when interleaving a scalar " 326 "reduction in a nested loop.")); 327 328 static cl::opt<bool> 329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 330 cl::Hidden, 331 cl::desc("Prefer in-loop vector reductions, " 332 "overriding the targets preference.")); 333 334 cl::opt<bool> EnableStrictReductions( 335 "enable-strict-reductions", cl::init(false), cl::Hidden, 336 cl::desc("Enable the vectorisation of loops with in-order (strict) " 337 "FP reductions")); 338 339 static cl::opt<bool> PreferPredicatedReductionSelect( 340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 341 cl::desc( 342 "Prefer predicating a reduction operation over an after loop select.")); 343 344 cl::opt<bool> EnableVPlanNativePath( 345 "enable-vplan-native-path", cl::init(false), cl::Hidden, 346 cl::desc("Enable VPlan-native vectorization path with " 347 "support for outer loop vectorization.")); 348 349 // FIXME: Remove this switch once we have divergence analysis. Currently we 350 // assume divergent non-backedge branches when this switch is true. 
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
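  // (Illustrative example, not taken from any particular test: for a loop like
  // `for (int i = 0; i != 1000; ++i)`, ScalarEvolution can typically compute
  // the exact trip count of 1000, so the profile-based and upper-bound
  // fallbacks below are never consulted.)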
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
494 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 495 bool InvariantCond, VPTransformState &State); 496 497 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 498 void fixVectorizedLoop(VPTransformState &State); 499 500 // Return true if any runtime check is added. 501 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 502 503 /// A type for vectorized values in the new loop. Each value from the 504 /// original loop, when vectorized, is represented by UF vector values in the 505 /// new unrolled loop, where UF is the unroll factor. 506 using VectorParts = SmallVector<Value *, 2>; 507 508 /// Vectorize a single GetElementPtrInst based on information gathered and 509 /// decisions taken during planning. 510 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 511 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 512 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 513 514 /// Vectorize a single PHINode in a block. This method handles the induction 515 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 516 /// arbitrary length vectors. 517 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, 518 VPWidenPHIRecipe *PhiR, VPTransformState &State); 519 520 /// A helper function to scalarize a single Instruction in the innermost loop. 521 /// Generates a sequence of scalar instances for each lane between \p MinLane 522 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 523 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 524 /// Instr's operands. 525 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, 526 const VPIteration &Instance, bool IfPredicateInstr, 527 VPTransformState &State); 528 529 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 530 /// is provided, the integer induction variable will first be truncated to 531 /// the corresponding type. 532 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, 533 VPValue *Def, VPValue *CastDef, 534 VPTransformState &State); 535 536 /// Construct the vector value of a scalarized value \p V one lane at a time. 537 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 538 VPTransformState &State); 539 540 /// Try to vectorize interleaved access group \p Group with the base address 541 /// given in \p Addr, optionally masking the vector operations if \p 542 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 543 /// values in the vectorized loop. 544 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 545 ArrayRef<VPValue *> VPDefs, 546 VPTransformState &State, VPValue *Addr, 547 ArrayRef<VPValue *> StoredValues, 548 VPValue *BlockInMask = nullptr); 549 550 /// Vectorize Load and Store instructions with the base address given in \p 551 /// Addr, optionally masking the vector operations if \p BlockInMask is 552 /// non-null. Use \p State to translate given VPValues to IR values in the 553 /// vectorized loop. 554 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 555 VPValue *Def, VPValue *Addr, 556 VPValue *StoredValue, VPValue *BlockInMask); 557 558 /// Set the debug location in the builder using the debug location in 559 /// the instruction. 560 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 561 562 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 
563 void fixNonInductionPHIs(VPTransformState &State); 564 565 /// Create a broadcast instruction. This method generates a broadcast 566 /// instruction (shuffle) for loop invariant values and for the induction 567 /// value. If this is the induction variable then we extend it to N, N+1, ... 568 /// this is needed because each iteration in the loop corresponds to a SIMD 569 /// element. 570 virtual Value *getBroadcastInstrs(Value *V); 571 572 protected: 573 friend class LoopVectorizationPlanner; 574 575 /// A small list of PHINodes. 576 using PhiVector = SmallVector<PHINode *, 4>; 577 578 /// A type for scalarized values in the new loop. Each value from the 579 /// original loop, when scalarized, is represented by UF x VF scalar values 580 /// in the new unrolled loop, where UF is the unroll factor and VF is the 581 /// vectorization factor. 582 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 583 584 /// Set up the values of the IVs correctly when exiting the vector loop. 585 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 586 Value *CountRoundDown, Value *EndValue, 587 BasicBlock *MiddleBlock); 588 589 /// Create a new induction variable inside L. 590 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 591 Value *Step, Instruction *DL); 592 593 /// Handle all cross-iteration phis in the header. 594 void fixCrossIterationPHIs(VPTransformState &State); 595 596 /// Fix a first-order recurrence. This is the second phase of vectorizing 597 /// this phi node. 598 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State); 599 600 /// Fix a reduction cross-iteration phi. This is the second phase of 601 /// vectorizing this phi node. 602 void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State); 603 604 /// Clear NSW/NUW flags from reduction instructions if necessary. 605 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 606 VPTransformState &State); 607 608 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 609 /// means we need to add the appropriate incoming value from the middle 610 /// block as exiting edges from the scalar epilogue loop (if present) are 611 /// already in place, and we exit the vector loop exclusively to the middle 612 /// block. 613 void fixLCSSAPHIs(VPTransformState &State); 614 615 /// Iteratively sink the scalarized operands of a predicated instruction into 616 /// the block that was created for it. 617 void sinkScalarOperands(Instruction *PredInst); 618 619 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 620 /// represented as. 621 void truncateToMinimalBitwidths(VPTransformState &State); 622 623 /// This function adds 624 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 625 /// to each vector element of Val. The sequence starts at StartIndex. 626 /// \p Opcode is relevant for FP induction variable. 627 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 628 Instruction::BinaryOps Opcode = 629 Instruction::BinaryOpsEnd); 630 631 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 632 /// variable on which to base the steps, \p Step is the size of the step, and 633 /// \p EntryVal is the value from the original loop that maps to the steps. 634 /// Note that \p EntryVal doesn't have to be an induction variable - it 635 /// can also be a truncate instruction. 
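  /// As an illustrative example (assuming nothing beyond the description
  /// above), with VF = 4, UF = 2 and Step = 1, the scalar steps produced for
  /// \p ScalarIV conceptually cover ScalarIV + 0, ScalarIV + 1, ...,
  /// ScalarIV + 7, i.e. one value per lane of each unrolled part.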
636 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 637 const InductionDescriptor &ID, VPValue *Def, 638 VPValue *CastDef, VPTransformState &State); 639 640 /// Create a vector induction phi node based on an existing scalar one. \p 641 /// EntryVal is the value from the original loop that maps to the vector phi 642 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 643 /// truncate instruction, instead of widening the original IV, we widen a 644 /// version of the IV truncated to \p EntryVal's type. 645 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 646 Value *Step, Value *Start, 647 Instruction *EntryVal, VPValue *Def, 648 VPValue *CastDef, 649 VPTransformState &State); 650 651 /// Returns true if an instruction \p I should be scalarized instead of 652 /// vectorized for the chosen vectorization factor. 653 bool shouldScalarizeInstruction(Instruction *I) const; 654 655 /// Returns true if we should generate a scalar version of \p IV. 656 bool needsScalarInduction(Instruction *IV) const; 657 658 /// If there is a cast involved in the induction variable \p ID, which should 659 /// be ignored in the vectorized loop body, this function records the 660 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 661 /// cast. We had already proved that the casted Phi is equal to the uncasted 662 /// Phi in the vectorized loop (under a runtime guard), and therefore 663 /// there is no need to vectorize the cast - the same value can be used in the 664 /// vector loop for both the Phi and the cast. 665 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 666 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 667 /// 668 /// \p EntryVal is the value from the original loop that maps to the vector 669 /// phi node and is used to distinguish what is the IV currently being 670 /// processed - original one (if \p EntryVal is a phi corresponding to the 671 /// original IV) or the "newly-created" one based on the proof mentioned above 672 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 673 /// latter case \p EntryVal is a TruncInst and we must not record anything for 674 /// that IV, but it's error-prone to expect callers of this routine to care 675 /// about that, hence this explicit parameter. 676 void recordVectorLoopValueForInductionCast( 677 const InductionDescriptor &ID, const Instruction *EntryVal, 678 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, 679 unsigned Part, unsigned Lane = UINT_MAX); 680 681 /// Generate a shuffle sequence that will reverse the vector Vec. 682 virtual Value *reverseVector(Value *Vec); 683 684 /// Returns (and creates if needed) the original loop trip count. 685 Value *getOrCreateTripCount(Loop *NewLoop); 686 687 /// Returns (and creates if needed) the trip count of the widened loop. 688 Value *getOrCreateVectorTripCount(Loop *NewLoop); 689 690 /// Returns a bitcasted value to the requested vector type. 691 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 692 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 693 const DataLayout &DL); 694 695 /// Emit a bypass check to see if the vector trip count is zero, including if 696 /// it overflows. 697 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 698 699 /// Emit a bypass check to see if all of the SCEV assumptions we've 700 /// had to make are correct. 
Returns the block containing the checks or 701 /// nullptr if no checks have been added. 702 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); 703 704 /// Emit bypass checks to check any memory assumptions we may have made. 705 /// Returns the block containing the checks or nullptr if no checks have been 706 /// added. 707 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 708 709 /// Compute the transformed value of Index at offset StartValue using step 710 /// StepValue. 711 /// For integer induction, returns StartValue + Index * StepValue. 712 /// For pointer induction, returns StartValue[Index * StepValue]. 713 /// FIXME: The newly created binary instructions should contain nsw/nuw 714 /// flags, which can be found from the original scalar operations. 715 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 716 const DataLayout &DL, 717 const InductionDescriptor &ID) const; 718 719 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 720 /// vector loop preheader, middle block and scalar preheader. Also 721 /// allocate a loop object for the new vector loop and return it. 722 Loop *createVectorLoopSkeleton(StringRef Prefix); 723 724 /// Create new phi nodes for the induction variables to resume iteration count 725 /// in the scalar epilogue, from where the vectorized loop left off (given by 726 /// \p VectorTripCount). 727 /// In cases where the loop skeleton is more complicated (eg. epilogue 728 /// vectorization) and the resume values can come from an additional bypass 729 /// block, the \p AdditionalBypass pair provides information about the bypass 730 /// block and the end value on the edge from bypass to this loop. 731 void createInductionResumeValues( 732 Loop *L, Value *VectorTripCount, 733 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 734 735 /// Complete the loop skeleton by adding debug MDs, creating appropriate 736 /// conditional branches in the middle block, preparing the builder and 737 /// running the verifier. Take in the vector loop \p L as argument, and return 738 /// the preheader of the completed vector loop. 739 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 740 741 /// Add additional metadata to \p To that was not present on \p Orig. 742 /// 743 /// Currently this is used to add the noalias annotations based on the 744 /// inserted memchecks. Use this for instructions that are *cloned* into the 745 /// vector loop. 746 void addNewMetadata(Instruction *To, const Instruction *Orig); 747 748 /// Add metadata from one instruction to another. 749 /// 750 /// This includes both the original MDs from \p From and additional ones (\see 751 /// addNewMetadata). Use this for *newly created* instructions in the vector 752 /// loop. 753 void addMetadata(Instruction *To, Instruction *From); 754 755 /// Similar to the previous function but it adds the metadata to a 756 /// vector of instructions. 757 void addMetadata(ArrayRef<Value *> To, Instruction *From); 758 759 /// Allow subclasses to override and print debug traces before/after vplan 760 /// execution, when trace information is requested. 761 virtual void printDebugTracesAtStart(){}; 762 virtual void printDebugTracesAtEnd(){}; 763 764 /// The original loop. 765 Loop *OrigLoop; 766 767 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 768 /// dynamic knowledge to simplify SCEV expressions and converts them to a 769 /// more usable form. 
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
877 GeneratedRTChecks &RTChecks; 878 }; 879 880 class InnerLoopUnroller : public InnerLoopVectorizer { 881 public: 882 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 883 LoopInfo *LI, DominatorTree *DT, 884 const TargetLibraryInfo *TLI, 885 const TargetTransformInfo *TTI, AssumptionCache *AC, 886 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 887 LoopVectorizationLegality *LVL, 888 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 889 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 890 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 891 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 892 BFI, PSI, Check) {} 893 894 private: 895 Value *getBroadcastInstrs(Value *V) override; 896 Value *getStepVector(Value *Val, int StartIdx, Value *Step, 897 Instruction::BinaryOps Opcode = 898 Instruction::BinaryOpsEnd) override; 899 Value *reverseVector(Value *Vec) override; 900 }; 901 902 /// Encapsulate information regarding vectorization of a loop and its epilogue. 903 /// This information is meant to be updated and used across two stages of 904 /// epilogue vectorization. 905 struct EpilogueLoopVectorizationInfo { 906 ElementCount MainLoopVF = ElementCount::getFixed(0); 907 unsigned MainLoopUF = 0; 908 ElementCount EpilogueVF = ElementCount::getFixed(0); 909 unsigned EpilogueUF = 0; 910 BasicBlock *MainLoopIterationCountCheck = nullptr; 911 BasicBlock *EpilogueIterationCountCheck = nullptr; 912 BasicBlock *SCEVSafetyCheck = nullptr; 913 BasicBlock *MemSafetyCheck = nullptr; 914 Value *TripCount = nullptr; 915 Value *VectorTripCount = nullptr; 916 917 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, 918 unsigned EUF) 919 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), 920 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { 921 assert(EUF == 1 && 922 "A high UF for the epilogue loop is likely not beneficial."); 923 } 924 }; 925 926 /// An extension of the inner loop vectorizer that creates a skeleton for a 927 /// vectorized loop that has its epilogue (residual) also vectorized. 928 /// The idea is to run the vplan on a given loop twice, firstly to setup the 929 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 930 /// from the first step and vectorize the epilogue. This is achieved by 931 /// deriving two concrete strategy classes from this base class and invoking 932 /// them in succession from the loop vectorizer planner. 933 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 934 public: 935 InnerLoopAndEpilogueVectorizer( 936 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 937 DominatorTree *DT, const TargetLibraryInfo *TLI, 938 const TargetTransformInfo *TTI, AssumptionCache *AC, 939 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 940 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 941 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 942 GeneratedRTChecks &Checks) 943 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 944 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 945 Checks), 946 EPI(EPI) {} 947 948 // Override this function to handle the more complex control flow around the 949 // three loops. 
950 BasicBlock *createVectorizedLoopSkeleton() final override { 951 return createEpilogueVectorizedLoopSkeleton(); 952 } 953 954 /// The interface for creating a vectorized skeleton using one of two 955 /// different strategies, each corresponding to one execution of the vplan 956 /// as described above. 957 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; 958 959 /// Holds and updates state information required to vectorize the main loop 960 /// and its epilogue in two separate passes. This setup helps us avoid 961 /// regenerating and recomputing runtime safety checks. It also helps us to 962 /// shorten the iteration-count-check path length for the cases where the 963 /// iteration count of the loop is so small that the main vector loop is 964 /// completely skipped. 965 EpilogueLoopVectorizationInfo &EPI; 966 }; 967 968 /// A specialized derived class of inner loop vectorizer that performs 969 /// vectorization of *main* loops in the process of vectorizing loops and their 970 /// epilogues. 971 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 972 public: 973 EpilogueVectorizerMainLoop( 974 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 975 DominatorTree *DT, const TargetLibraryInfo *TLI, 976 const TargetTransformInfo *TTI, AssumptionCache *AC, 977 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 978 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 979 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 980 GeneratedRTChecks &Check) 981 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 982 EPI, LVL, CM, BFI, PSI, Check) {} 983 /// Implements the interface for creating a vectorized skeleton using the 984 /// *main loop* strategy (ie the first pass of vplan execution). 985 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 986 987 protected: 988 /// Emits an iteration count bypass check once for the main loop (when \p 989 /// ForEpilogue is false) and once for the epilogue loop (when \p 990 /// ForEpilogue is true). 991 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, 992 bool ForEpilogue); 993 void printDebugTracesAtStart() override; 994 void printDebugTracesAtEnd() override; 995 }; 996 997 // A specialized derived class of inner loop vectorizer that performs 998 // vectorization of *epilogue* loops in the process of vectorizing loops and 999 // their epilogues. 1000 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 1001 public: 1002 EpilogueVectorizerEpilogueLoop( 1003 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 1004 DominatorTree *DT, const TargetLibraryInfo *TLI, 1005 const TargetTransformInfo *TTI, AssumptionCache *AC, 1006 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 1007 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 1008 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 1009 GeneratedRTChecks &Checks) 1010 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1011 EPI, LVL, CM, BFI, PSI, Checks) {} 1012 /// Implements the interface for creating a vectorized skeleton using the 1013 /// *epilogue loop* strategy (ie the second pass of vplan execution). 
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
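/// For example (illustrative), with Step == 2 and a fixed VF of 4 this returns
/// the constant 8, while with a scalable VF of <vscale x 4> it returns the
/// runtime value 8 * vscale.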
1107 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1108 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1109 Constant *StepVal = ConstantInt::get( 1110 Step->getType(), 1111 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1112 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1113 } 1114 1115 namespace llvm { 1116 1117 /// Return the runtime value for VF. 1118 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { 1119 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1120 return VF.isScalable() ? B.CreateVScale(EC) : EC; 1121 } 1122 1123 void reportVectorizationFailure(const StringRef DebugMsg, 1124 const StringRef OREMsg, const StringRef ORETag, 1125 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1126 Instruction *I) { 1127 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1128 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1129 ORE->emit( 1130 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1131 << "loop not vectorized: " << OREMsg); 1132 } 1133 1134 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1135 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1136 Instruction *I) { 1137 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1138 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1139 ORE->emit( 1140 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1141 << Msg); 1142 } 1143 1144 } // end namespace llvm 1145 1146 #ifndef NDEBUG 1147 /// \return string containing a file name and a line # for the given loop. 1148 static std::string getDebugLocString(const Loop *L) { 1149 std::string Result; 1150 if (L) { 1151 raw_string_ostream OS(Result); 1152 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1153 LoopDbgLoc.print(OS); 1154 else 1155 // Just print the module name. 1156 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1157 OS.flush(); 1158 } 1159 return Result; 1160 } 1161 #endif 1162 1163 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1164 const Instruction *Orig) { 1165 // If the loop was versioned with memchecks, add the corresponding no-alias 1166 // metadata. 1167 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1168 LVer->annotateInstWithNoAlias(To, Orig); 1169 } 1170 1171 void InnerLoopVectorizer::addMetadata(Instruction *To, 1172 Instruction *From) { 1173 propagateMetadata(To, From); 1174 addNewMetadata(To, From); 1175 } 1176 1177 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1178 Instruction *From) { 1179 for (Value *V : To) { 1180 if (Instruction *I = dyn_cast<Instruction>(V)) 1181 addMetadata(I, From); 1182 } 1183 } 1184 1185 namespace llvm { 1186 1187 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1188 // lowered. 1189 enum ScalarEpilogueLowering { 1190 1191 // The default: allowing scalar epilogues. 1192 CM_ScalarEpilogueAllowed, 1193 1194 // Vectorization with OptForSize: don't allow epilogues. 1195 CM_ScalarEpilogueNotAllowedOptSize, 1196 1197 // A special case of vectorisation with OptForSize: loops with a very small 1198 // trip count are considered for vectorization under OptForSize, thereby 1199 // making sure the cost of their loop body is dominant, free of runtime 1200 // guards and scalar iteration overheads. 1201 CM_ScalarEpilogueNotAllowedLowTripLoop, 1202 1203 // Loop hint predicate indicating an epilogue is undesired. 
1204 CM_ScalarEpilogueNotNeededUsePredicate, 1205 1206 // Directive indicating we must either tail fold or not vectorize 1207 CM_ScalarEpilogueNotAllowedUsePredicate 1208 }; 1209 1210 /// LoopVectorizationCostModel - estimates the expected speedups due to 1211 /// vectorization. 1212 /// In many cases vectorization is not profitable. This can happen because of 1213 /// a number of reasons. In this class we mainly attempt to predict the 1214 /// expected speedup/slowdowns due to the supported instruction set. We use the 1215 /// TargetTransformInfo to query the different backends for the cost of 1216 /// different operations. 1217 class LoopVectorizationCostModel { 1218 public: 1219 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1220 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1221 LoopVectorizationLegality *Legal, 1222 const TargetTransformInfo &TTI, 1223 const TargetLibraryInfo *TLI, DemandedBits *DB, 1224 AssumptionCache *AC, 1225 OptimizationRemarkEmitter *ORE, const Function *F, 1226 const LoopVectorizeHints *Hints, 1227 InterleavedAccessInfo &IAI) 1228 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1229 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1230 Hints(Hints), InterleaveInfo(IAI) {} 1231 1232 /// \return An upper bound for the vectorization factor, or None if 1233 /// vectorization and interleaving should be avoided up front. 1234 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); 1235 1236 /// \return True if runtime checks are required for vectorization, and false 1237 /// otherwise. 1238 bool runtimeChecksRequired(); 1239 1240 /// \return The most profitable vectorization factor and the cost of that VF. 1241 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO 1242 /// then this vectorization factor will be selected if vectorization is 1243 /// possible. 1244 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); 1245 VectorizationFactor 1246 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1247 const LoopVectorizationPlanner &LVP); 1248 1249 /// Setup cost-based decisions for user vectorization factor. 1250 void selectUserVectorizationFactor(ElementCount UserVF) { 1251 collectUniformsAndScalars(UserVF); 1252 collectInstsToScalarize(UserVF); 1253 } 1254 1255 /// \return The size (in bits) of the smallest and widest types in the code 1256 /// that needs to be vectorized. We ignore values that remain scalar such as 1257 /// 64 bit loop indices. 1258 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1259 1260 /// \return The desired interleave count. 1261 /// If interleave count has been specified by metadata it will be returned. 1262 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1263 /// are the selected vectorization factor and the cost of the selected VF. 1264 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1265 1266 /// Memory access instruction may be vectorized in more than one way. 1267 /// Form of instruction after vectorization depends on cost. 1268 /// This function takes cost-based decisions for Load/Store instructions 1269 /// and collects them in a map. This decisions map is used for building 1270 /// the lists of loop-uniform and loop-scalar instructions. 1271 /// The calculated cost is saved with widening decision in order to 1272 /// avoid redundant calculations. 
1273 void setCostBasedWideningDecision(ElementCount VF); 1274 1275 /// A struct that represents some properties of the register usage 1276 /// of a loop. 1277 struct RegisterUsage { 1278 /// Holds the number of loop invariant values that are used in the loop. 1279 /// The key is ClassID of target-provided register class. 1280 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1281 /// Holds the maximum number of concurrent live intervals in the loop. 1282 /// The key is ClassID of target-provided register class. 1283 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1284 }; 1285 1286 /// \return Returns information about the register usages of the loop for the 1287 /// given vectorization factors. 1288 SmallVector<RegisterUsage, 8> 1289 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1290 1291 /// Collect values we want to ignore in the cost model. 1292 void collectValuesToIgnore(); 1293 1294 /// Split reductions into those that happen in the loop, and those that happen 1295 /// outside. In loop reductions are collected into InLoopReductionChains. 1296 void collectInLoopReductions(); 1297 1298 /// \returns The smallest bitwidth each instruction can be represented with. 1299 /// The vector equivalents of these instructions should be truncated to this 1300 /// type. 1301 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1302 return MinBWs; 1303 } 1304 1305 /// \returns True if it is more profitable to scalarize instruction \p I for 1306 /// vectorization factor \p VF. 1307 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1308 assert(VF.isVector() && 1309 "Profitable to scalarize relevant only for VF > 1."); 1310 1311 // Cost model is not run in the VPlan-native path - return conservative 1312 // result until this changes. 1313 if (EnableVPlanNativePath) 1314 return false; 1315 1316 auto Scalars = InstsToScalarize.find(VF); 1317 assert(Scalars != InstsToScalarize.end() && 1318 "VF not yet analyzed for scalarization profitability"); 1319 return Scalars->second.find(I) != Scalars->second.end(); 1320 } 1321 1322 /// Returns true if \p I is known to be uniform after vectorization. 1323 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1324 if (VF.isScalar()) 1325 return true; 1326 1327 // Cost model is not run in the VPlan-native path - return conservative 1328 // result until this changes. 1329 if (EnableVPlanNativePath) 1330 return false; 1331 1332 auto UniformsPerVF = Uniforms.find(VF); 1333 assert(UniformsPerVF != Uniforms.end() && 1334 "VF not yet analyzed for uniformity"); 1335 return UniformsPerVF->second.count(I); 1336 } 1337 1338 /// Returns true if \p I is known to be scalar after vectorization. 1339 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1340 if (VF.isScalar()) 1341 return true; 1342 1343 // Cost model is not run in the VPlan-native path - return conservative 1344 // result until this changes. 1345 if (EnableVPlanNativePath) 1346 return false; 1347 1348 auto ScalarsPerVF = Scalars.find(VF); 1349 assert(ScalarsPerVF != Scalars.end() && 1350 "Scalar values are not calculated for VF"); 1351 return ScalarsPerVF->second.count(I); 1352 } 1353 1354 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1355 /// for vectorization factor \p VF. 
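  /// For example (an illustration, not a statement about any particular
  /// target), an i32 add whose operands are zero-extended i8 loads and whose
  /// result is only ever truncated back to i8 can be performed on i8 vector
  /// elements, so MinBWs may record a minimal bitwidth of 8 for it.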
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for a memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
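    // For example (illustrative): with VF = 4 and a truncate from i64 to i32,
    // ToVectorTy yields SrcTy = <4 x i64> and DestTy = <4 x i32>.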
1435 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1436 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1437 1438 // If the truncate is free for the given types, return false. Replacing a 1439 // free truncate with an induction variable would add an induction variable 1440 // update instruction to each iteration of the loop. We exclude from this 1441 // check the primary induction variable since it will need an update 1442 // instruction regardless. 1443 Value *Op = Trunc->getOperand(0); 1444 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1445 return false; 1446 1447 // If the truncated value is not an induction variable, return false. 1448 return Legal->isInductionPhi(Op); 1449 } 1450 1451 /// Collects the instructions to scalarize for each predicated instruction in 1452 /// the loop. 1453 void collectInstsToScalarize(ElementCount VF); 1454 1455 /// Collect Uniform and Scalar values for the given \p VF. 1456 /// The sets depend on CM decision for Load/Store instructions 1457 /// that may be vectorized as interleave, gather-scatter or scalarized. 1458 void collectUniformsAndScalars(ElementCount VF) { 1459 // Do the analysis once. 1460 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1461 return; 1462 setCostBasedWideningDecision(VF); 1463 collectLoopUniforms(VF); 1464 collectLoopScalars(VF); 1465 } 1466 1467 /// Returns true if the target machine supports masked store operation 1468 /// for the given \p DataType and kind of access to \p Ptr. 1469 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1470 return Legal->isConsecutivePtr(Ptr) && 1471 TTI.isLegalMaskedStore(DataType, Alignment); 1472 } 1473 1474 /// Returns true if the target machine supports masked load operation 1475 /// for the given \p DataType and kind of access to \p Ptr. 1476 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1477 return Legal->isConsecutivePtr(Ptr) && 1478 TTI.isLegalMaskedLoad(DataType, Alignment); 1479 } 1480 1481 /// Returns true if the target machine supports masked scatter operation 1482 /// for the given \p DataType. 1483 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { 1484 return TTI.isLegalMaskedScatter(DataType, Alignment); 1485 } 1486 1487 /// Returns true if the target machine supports masked gather operation 1488 /// for the given \p DataType. 1489 bool isLegalMaskedGather(Type *DataType, Align Alignment) const { 1490 return TTI.isLegalMaskedGather(DataType, Alignment); 1491 } 1492 1493 /// Returns true if the target machine can represent \p V as a masked gather 1494 /// or scatter operation. 1495 bool isLegalGatherOrScatter(Value *V) { 1496 bool LI = isa<LoadInst>(V); 1497 bool SI = isa<StoreInst>(V); 1498 if (!LI && !SI) 1499 return false; 1500 auto *Ty = getMemInstValueType(V); 1501 Align Align = getLoadStoreAlignment(V); 1502 return (LI && isLegalMaskedGather(Ty, Align)) || 1503 (SI && isLegalMaskedScatter(Ty, Align)); 1504 } 1505 1506 /// Returns true if the target machine supports all of the reduction 1507 /// variables found for the given VF. 1508 bool canVectorizeReductions(ElementCount VF) { 1509 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1510 RecurrenceDescriptor RdxDesc = Reduction.second; 1511 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1512 })); 1513 } 1514 1515 /// Returns true if \p I is an instruction that will be scalarized with 1516 /// predication. 
Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool
  isScalarWithPredication(Instruction *I,
                          ElementCount VF = ElementCount::getFixed(1)) const;

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I, ElementCount VF) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I, VF);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue() const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked in order to fold the
  /// tail of the loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
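  /// For example (illustrative IR, names assumed): for an in-loop integer add
  /// reduction
  ///   %sum      = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
  ///   %sum.next = add i32 %sum, %val
  /// InLoopReductionChains maps %sum to the chain containing %sum.next, and
  /// this predicate returns true for %sum.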
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
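  /// Note: the returned cost covers one whole vector iteration, i.e. it is not
  /// divided by the VF. For example (illustrative), at VF = 4 the returned
  /// value accounts for the work of four scalar iterations; callers that
  /// compare different VFs, such as selectVectorizationFactor, normalize to a
  /// per-lane cost themselves.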
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
                                          Type *VectorTy,
                                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not divisible by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
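  ///
  /// For example (illustrative): with a trip count of 103 and VF = 8, the
  /// vector loop covers 12 * 8 = 96 iterations and the scalar epilogue runs
  /// the remaining 7, unless the tail is instead folded by masking.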
1739 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1740 1741 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1742 bool FoldTailByMasking = false; 1743 1744 /// A map holding scalar costs for different vectorization factors. The 1745 /// presence of a cost for an instruction in the mapping indicates that the 1746 /// instruction will be scalarized when vectorizing with the associated 1747 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1748 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1749 1750 /// Holds the instructions known to be uniform after vectorization. 1751 /// The data is collected per VF. 1752 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1753 1754 /// Holds the instructions known to be scalar after vectorization. 1755 /// The data is collected per VF. 1756 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1757 1758 /// Holds the instructions (address computations) that are forced to be 1759 /// scalarized. 1760 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1761 1762 /// PHINodes of the reductions that should be expanded in-loop along with 1763 /// their associated chains of reduction operations, in program order from top 1764 /// (PHI) to bottom 1765 ReductionChainMap InLoopReductionChains; 1766 1767 /// A Map of inloop reduction operations and their immediate chain operand. 1768 /// FIXME: This can be removed once reductions can be costed correctly in 1769 /// vplan. This was added to allow quick lookup to the inloop operations, 1770 /// without having to loop through InLoopReductionChains. 1771 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1772 1773 /// Returns the expected difference in cost from scalarizing the expression 1774 /// feeding a predicated instruction \p PredInst. The instructions to 1775 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1776 /// non-negative return value implies the expression will be scalarized. 1777 /// Currently, only single-use chains are considered for scalarization. 1778 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1779 ElementCount VF); 1780 1781 /// Collect the instructions that are uniform after vectorization. An 1782 /// instruction is uniform if we represent it with a single scalar value in 1783 /// the vectorized loop corresponding to each vector iteration. Examples of 1784 /// uniform instructions include pointer operands of consecutive or 1785 /// interleaved memory accesses. Note that although uniformity implies an 1786 /// instruction will be scalar, the reverse is not true. In general, a 1787 /// scalarized instruction will be represented by VF scalar values in the 1788 /// vectorized loop, each corresponding to an iteration of the original 1789 /// scalar loop. 1790 void collectLoopUniforms(ElementCount VF); 1791 1792 /// Collect the instructions that are scalar after vectorization. An 1793 /// instruction is scalar if it is known to be uniform or will be scalarized 1794 /// during vectorization. Non-uniform scalarized instructions will be 1795 /// represented by VF values in the vectorized loop, each corresponding to an 1796 /// iteration of the original scalar loop. 1797 void collectLoopScalars(ElementCount VF); 1798 1799 /// Keeps cost model vectorization decision and cost for instructions. 1800 /// Right now it is used for memory instructions only. 
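  /// Conceptually (illustrative entry, values assumed), the map is keyed by an
  /// (instruction, VF) pair and stores the chosen widening kind together with
  /// its estimated cost, e.g.
  ///   (load of a[i], VF=4) -> (CM_Widen, Cost=2)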
1801 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1802 std::pair<InstWidening, InstructionCost>>; 1803 1804 DecisionList WideningDecisions; 1805 1806 /// Returns true if \p V is expected to be vectorized and it needs to be 1807 /// extracted. 1808 bool needsExtract(Value *V, ElementCount VF) const { 1809 Instruction *I = dyn_cast<Instruction>(V); 1810 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1811 TheLoop->isLoopInvariant(I)) 1812 return false; 1813 1814 // Assume we can vectorize V (and hence we need extraction) if the 1815 // scalars are not computed yet. This can happen, because it is called 1816 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1817 // the scalars are collected. That should be a safe assumption in most 1818 // cases, because we check if the operands have vectorizable types 1819 // beforehand in LoopVectorizationLegality. 1820 return Scalars.find(VF) == Scalars.end() || 1821 !isScalarAfterVectorization(I, VF); 1822 }; 1823 1824 /// Returns a range containing only operands needing to be extracted. 1825 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1826 ElementCount VF) const { 1827 return SmallVector<Value *, 4>(make_filter_range( 1828 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1829 } 1830 1831 /// Determines if we have the infrastructure to vectorize loop \p L and its 1832 /// epilogue, assuming the main loop is vectorized by \p VF. 1833 bool isCandidateForEpilogueVectorization(const Loop &L, 1834 const ElementCount VF) const; 1835 1836 /// Returns true if epilogue vectorization is considered profitable, and 1837 /// false otherwise. 1838 /// \p VF is the vectorization factor chosen for the original loop. 1839 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1840 1841 public: 1842 /// The loop that we evaluate. 1843 Loop *TheLoop; 1844 1845 /// Predicated scalar evolution analysis. 1846 PredicatedScalarEvolution &PSE; 1847 1848 /// Loop Info analysis. 1849 LoopInfo *LI; 1850 1851 /// Vectorization legality. 1852 LoopVectorizationLegality *Legal; 1853 1854 /// Vector target information. 1855 const TargetTransformInfo &TTI; 1856 1857 /// Target Library Info. 1858 const TargetLibraryInfo *TLI; 1859 1860 /// Demanded bits analysis. 1861 DemandedBits *DB; 1862 1863 /// Assumption cache. 1864 AssumptionCache *AC; 1865 1866 /// Interface to emit optimization remarks. 1867 OptimizationRemarkEmitter *ORE; 1868 1869 const Function *TheFunction; 1870 1871 /// Loop Vectorize Hint. 1872 const LoopVectorizeHints *Hints; 1873 1874 /// The interleave access information contains groups of interleaved accesses 1875 /// with the same stride and close to each other. 1876 InterleavedAccessInfo &InterleaveInfo; 1877 1878 /// Values to ignore in the cost model. 1879 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1880 1881 /// Values to ignore in the cost model when VF > 1. 1882 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1883 1884 /// Profitable vector factors. 1885 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1886 }; 1887 } // end namespace llvm 1888 1889 /// Helper struct to manage generating runtime checks for vectorization. 1890 /// 1891 /// The runtime checks are created up-front in temporary blocks to allow better 1892 /// estimating the cost and un-linked from the existing IR. After deciding to 1893 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1894 /// temporary blocks are completely removed. 
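///
/// Rough usage sketch (illustrative; the caller is assumed to drive the steps
/// in this order):
///   GeneratedRTChecks Checks(SE, DT, LI, DL);
///   Checks.Create(L, LAI, UnionPred);   // build check blocks up front
///   // ... estimate the cost of the checks, decide whether to vectorize ...
///   Checks.emitSCEVChecks(...);         // re-link the blocks if vectorizing
///   Checks.emitMemRuntimeChecks(...);
///   // ... otherwise the destructor removes the unused blocks.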
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Instruction *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      std::tie(std::ignore, MemRuntimeCheckCond) =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
1965 if (SCEVCheckBlock) 1966 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1967 if (MemCheckBlock) 1968 MemCheckBlock->replaceAllUsesWith(Preheader); 1969 1970 if (SCEVCheckBlock) { 1971 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1972 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1973 Preheader->getTerminator()->eraseFromParent(); 1974 } 1975 if (MemCheckBlock) { 1976 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1977 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1978 Preheader->getTerminator()->eraseFromParent(); 1979 } 1980 1981 DT->changeImmediateDominator(LoopHeader, Preheader); 1982 if (MemCheckBlock) { 1983 DT->eraseNode(MemCheckBlock); 1984 LI->removeBlock(MemCheckBlock); 1985 } 1986 if (SCEVCheckBlock) { 1987 DT->eraseNode(SCEVCheckBlock); 1988 LI->removeBlock(SCEVCheckBlock); 1989 } 1990 } 1991 1992 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1993 /// unused. 1994 ~GeneratedRTChecks() { 1995 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1996 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1997 if (!SCEVCheckCond) 1998 SCEVCleaner.markResultUsed(); 1999 2000 if (!MemRuntimeCheckCond) 2001 MemCheckCleaner.markResultUsed(); 2002 2003 if (MemRuntimeCheckCond) { 2004 auto &SE = *MemCheckExp.getSE(); 2005 // Memory runtime check generation creates compares that use expanded 2006 // values. Remove them before running the SCEVExpanderCleaners. 2007 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2008 if (MemCheckExp.isInsertedInstruction(&I)) 2009 continue; 2010 SE.forgetValue(&I); 2011 SE.eraseValueFromMap(&I); 2012 I.eraseFromParent(); 2013 } 2014 } 2015 MemCheckCleaner.cleanup(); 2016 SCEVCleaner.cleanup(); 2017 2018 if (SCEVCheckCond) 2019 SCEVCheckBlock->eraseFromParent(); 2020 if (MemRuntimeCheckCond) 2021 MemCheckBlock->eraseFromParent(); 2022 } 2023 2024 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2025 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2026 /// depending on the generated condition. 2027 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2028 BasicBlock *LoopVectorPreHeader, 2029 BasicBlock *LoopExitBlock) { 2030 if (!SCEVCheckCond) 2031 return nullptr; 2032 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2033 if (C->isZero()) 2034 return nullptr; 2035 2036 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2037 2038 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2039 // Create new preheader for vector loop. 2040 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2041 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2042 2043 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2044 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2045 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2046 SCEVCheckBlock); 2047 2048 DT->addNewBlock(SCEVCheckBlock, Pred); 2049 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2050 2051 ReplaceInstWithInst( 2052 SCEVCheckBlock->getTerminator(), 2053 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2054 // Mark the check as used, to prevent it from being removed during cleanup. 2055 SCEVCheckCond = nullptr; 2056 return SCEVCheckBlock; 2057 } 2058 2059 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2060 /// the branches to branch to the vector preheader or \p Bypass, depending on 2061 /// the generated condition. 
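  ///
  /// The resulting control flow around the check is, illustratively:
  ///   Pred -> MemCheckBlock -> LoopVectorPreHeader  (no conflict detected)
  ///                        \-> Bypass               (conflict detected)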
2062 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2063 BasicBlock *LoopVectorPreHeader) { 2064 // Check if we generated code that checks in runtime if arrays overlap. 2065 if (!MemRuntimeCheckCond) 2066 return nullptr; 2067 2068 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2069 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2070 MemCheckBlock); 2071 2072 DT->addNewBlock(MemCheckBlock, Pred); 2073 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2074 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2075 2076 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2077 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2078 2079 ReplaceInstWithInst( 2080 MemCheckBlock->getTerminator(), 2081 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2082 MemCheckBlock->getTerminator()->setDebugLoc( 2083 Pred->getTerminator()->getDebugLoc()); 2084 2085 // Mark the check as used, to prevent it from being removed during cleanup. 2086 MemRuntimeCheckCond = nullptr; 2087 return MemCheckBlock; 2088 } 2089 }; 2090 2091 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2092 // vectorization. The loop needs to be annotated with #pragma omp simd 2093 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2094 // vector length information is not provided, vectorization is not considered 2095 // explicit. Interleave hints are not allowed either. These limitations will be 2096 // relaxed in the future. 2097 // Please, note that we are currently forced to abuse the pragma 'clang 2098 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2099 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2100 // provides *explicit vectorization hints* (LV can bypass legal checks and 2101 // assume that vectorization is legal). However, both hints are implemented 2102 // using the same metadata (llvm.loop.vectorize, processed by 2103 // LoopVectorizeHints). This will be fixed in the future when the native IR 2104 // representation for pragma 'omp simd' is introduced. 2105 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2106 OptimizationRemarkEmitter *ORE) { 2107 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2108 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2109 2110 // Only outer loops with an explicit vectorization hint are supported. 2111 // Unannotated outer loops are ignored. 2112 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2113 return false; 2114 2115 Function *Fn = OuterLp->getHeader()->getParent(); 2116 if (!Hints.allowVectorization(Fn, OuterLp, 2117 true /*VectorizeOnlyWhenForced*/)) { 2118 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2119 return false; 2120 } 2121 2122 if (Hints.getInterleave() > 1) { 2123 // TODO: Interleave support is future work. 2124 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2125 "outer loops.\n"); 2126 Hints.emitRemarkWithHints(); 2127 return false; 2128 } 2129 2130 return true; 2131 } 2132 2133 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2134 OptimizationRemarkEmitter *ORE, 2135 SmallVectorImpl<Loop *> &V) { 2136 // Collect inner loops and outer loops without irreducible control flow. For 2137 // now, only collect outer loops that have explicit vectorization hints. If we 2138 // are stress testing the VPlan H-CFG construction, we collect the outermost 2139 // loop of every loop nest. 
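  // For example (illustrative): in a nest L1 { L2 { L3 } }, the innermost L3
  // is always considered (subject to the irreducible-CFG check below), while
  // the outer L1 is only collected on the VPlan-native path when it carries an
  // explicit vectorization hint with a vector length.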
2140 if (L.isInnermost() || VPlanBuildStressTest || 2141 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2142 LoopBlocksRPO RPOT(&L); 2143 RPOT.perform(LI); 2144 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2145 V.push_back(&L); 2146 // TODO: Collect inner loops inside marked outer loops in case 2147 // vectorization fails for the outer loop. Do not invoke 2148 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2149 // already known to be reducible. We can use an inherited attribute for 2150 // that. 2151 return; 2152 } 2153 } 2154 for (Loop *InnerL : L) 2155 collectSupportedLoops(*InnerL, LI, ORE, V); 2156 } 2157 2158 namespace { 2159 2160 /// The LoopVectorize Pass. 2161 struct LoopVectorize : public FunctionPass { 2162 /// Pass identification, replacement for typeid 2163 static char ID; 2164 2165 LoopVectorizePass Impl; 2166 2167 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2168 bool VectorizeOnlyWhenForced = false) 2169 : FunctionPass(ID), 2170 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2171 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2172 } 2173 2174 bool runOnFunction(Function &F) override { 2175 if (skipFunction(F)) 2176 return false; 2177 2178 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2179 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2180 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2181 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2182 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2183 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2184 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2185 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2186 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2187 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2188 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2189 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2190 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2191 2192 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2193 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2194 2195 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2196 GetLAA, *ORE, PSI).MadeAnyChange; 2197 } 2198 2199 void getAnalysisUsage(AnalysisUsage &AU) const override { 2200 AU.addRequired<AssumptionCacheTracker>(); 2201 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2202 AU.addRequired<DominatorTreeWrapperPass>(); 2203 AU.addRequired<LoopInfoWrapperPass>(); 2204 AU.addRequired<ScalarEvolutionWrapperPass>(); 2205 AU.addRequired<TargetTransformInfoWrapperPass>(); 2206 AU.addRequired<AAResultsWrapperPass>(); 2207 AU.addRequired<LoopAccessLegacyAnalysis>(); 2208 AU.addRequired<DemandedBitsWrapperPass>(); 2209 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2210 AU.addRequired<InjectTLIMappingsLegacy>(); 2211 2212 // We currently do not preserve loopinfo/dominator analyses with outer loop 2213 // vectorization. Until this is addressed, mark these analyses as preserved 2214 // only for non-VPlan-native path. 2215 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2216 if (!EnableVPlanNativePath) { 2217 AU.addPreserved<LoopInfoWrapperPass>(); 2218 AU.addPreserved<DominatorTreeWrapperPass>(); 2219 } 2220 2221 AU.addPreserved<BasicAAWrapperPass>(); 2222 AU.addPreserved<GlobalsAAWrapperPass>(); 2223 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2224 } 2225 }; 2226 2227 } // end anonymous namespace 2228 2229 //===----------------------------------------------------------------------===// 2230 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2231 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2232 //===----------------------------------------------------------------------===// 2233 2234 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2235 // We need to place the broadcast of invariant variables outside the loop, 2236 // but only if it's proven safe to do so. Else, broadcast will be inside 2237 // vector loop body. 2238 Instruction *Instr = dyn_cast<Instruction>(V); 2239 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2240 (!Instr || 2241 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2242 // Place the code for broadcasting invariant variables in the new preheader. 2243 IRBuilder<>::InsertPointGuard Guard(Builder); 2244 if (SafeToHoist) 2245 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2246 2247 // Broadcast the scalar into all locations in the vector. 2248 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2249 2250 return Shuf; 2251 } 2252 2253 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2254 const InductionDescriptor &II, Value *Step, Value *Start, 2255 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2256 VPTransformState &State) { 2257 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2258 "Expected either an induction phi-node or a truncate of it!"); 2259 2260 // Construct the initial value of the vector IV in the vector loop preheader 2261 auto CurrIP = Builder.saveIP(); 2262 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2263 if (isa<TruncInst>(EntryVal)) { 2264 assert(Start->getType()->isIntegerTy() && 2265 "Truncation requires an integer type"); 2266 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2267 Step = Builder.CreateTrunc(Step, TruncType); 2268 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2269 } 2270 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2271 Value *SteppedStart = 2272 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2273 2274 // We create vector phi nodes for both integer and floating-point induction 2275 // variables. Here, we determine the kind of arithmetic we will perform. 2276 Instruction::BinaryOps AddOp; 2277 Instruction::BinaryOps MulOp; 2278 if (Step->getType()->isIntegerTy()) { 2279 AddOp = Instruction::Add; 2280 MulOp = Instruction::Mul; 2281 } else { 2282 AddOp = II.getInductionOpcode(); 2283 MulOp = Instruction::FMul; 2284 } 2285 2286 // Multiply the vectorization factor by the step using integer or 2287 // floating-point arithmetic as appropriate. 2288 Type *StepType = Step->getType(); 2289 if (Step->getType()->isFloatingPointTy()) 2290 StepType = IntegerType::get(StepType->getContext(), 2291 StepType->getScalarSizeInBits()); 2292 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2293 if (Step->getType()->isFloatingPointTy()) 2294 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2295 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2296 2297 // Create a vector splat to use in the induction update. 
2298 // 2299 // FIXME: If the step is non-constant, we create the vector splat with 2300 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2301 // handle a constant vector splat. 2302 Value *SplatVF = isa<Constant>(Mul) 2303 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2304 : Builder.CreateVectorSplat(VF, Mul); 2305 Builder.restoreIP(CurrIP); 2306 2307 // We may need to add the step a number of times, depending on the unroll 2308 // factor. The last of those goes into the PHI. 2309 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2310 &*LoopVectorBody->getFirstInsertionPt()); 2311 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2312 Instruction *LastInduction = VecInd; 2313 for (unsigned Part = 0; Part < UF; ++Part) { 2314 State.set(Def, LastInduction, Part); 2315 2316 if (isa<TruncInst>(EntryVal)) 2317 addMetadata(LastInduction, EntryVal); 2318 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2319 State, Part); 2320 2321 LastInduction = cast<Instruction>( 2322 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2323 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2324 } 2325 2326 // Move the last step to the end of the latch block. This ensures consistent 2327 // placement of all induction updates. 2328 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2329 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2330 auto *ICmp = cast<Instruction>(Br->getCondition()); 2331 LastInduction->moveBefore(ICmp); 2332 LastInduction->setName("vec.ind.next"); 2333 2334 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2335 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2336 } 2337 2338 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2339 return Cost->isScalarAfterVectorization(I, VF) || 2340 Cost->isProfitableToScalarize(I, VF); 2341 } 2342 2343 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2344 if (shouldScalarizeInstruction(IV)) 2345 return true; 2346 auto isScalarInst = [&](User *U) -> bool { 2347 auto *I = cast<Instruction>(U); 2348 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2349 }; 2350 return llvm::any_of(IV->users(), isScalarInst); 2351 } 2352 2353 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2354 const InductionDescriptor &ID, const Instruction *EntryVal, 2355 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2356 unsigned Part, unsigned Lane) { 2357 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2358 "Expected either an induction phi-node or a truncate of it!"); 2359 2360 // This induction variable is not the phi from the original loop but the 2361 // newly-created IV based on the proof that casted Phi is equal to the 2362 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2363 // re-uses the same InductionDescriptor that original IV uses but we don't 2364 // have to do any recording in this case - that is done when original IV is 2365 // processed. 2366 if (isa<TruncInst>(EntryVal)) 2367 return; 2368 2369 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2370 if (Casts.empty()) 2371 return; 2372 // Only the first Cast instruction in the Casts vector is of interest. 2373 // The rest of the Casts (if exist) have no uses outside the 2374 // induction update chain itself. 
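  // For example (illustrative): if the induction has a recorded cast such as
  //   %iv.ext = sext i32 %iv to i64
  // that predicated SCEV has shown to behave exactly like the induction under
  // a runtime guard, the cast's VPValue (CastDef) is simply mapped to the
  // value just created for the induction itself.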
2375 if (Lane < UINT_MAX) 2376 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2377 else 2378 State.set(CastDef, VectorLoopVal, Part); 2379 } 2380 2381 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2382 TruncInst *Trunc, VPValue *Def, 2383 VPValue *CastDef, 2384 VPTransformState &State) { 2385 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2386 "Primary induction variable must have an integer type"); 2387 2388 auto II = Legal->getInductionVars().find(IV); 2389 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2390 2391 auto ID = II->second; 2392 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2393 2394 // The value from the original loop to which we are mapping the new induction 2395 // variable. 2396 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2397 2398 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2399 2400 // Generate code for the induction step. Note that induction steps are 2401 // required to be loop-invariant 2402 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2403 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2404 "Induction step should be loop invariant"); 2405 if (PSE.getSE()->isSCEVable(IV->getType())) { 2406 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2407 return Exp.expandCodeFor(Step, Step->getType(), 2408 LoopVectorPreHeader->getTerminator()); 2409 } 2410 return cast<SCEVUnknown>(Step)->getValue(); 2411 }; 2412 2413 // The scalar value to broadcast. This is derived from the canonical 2414 // induction variable. If a truncation type is given, truncate the canonical 2415 // induction variable and step. Otherwise, derive these values from the 2416 // induction descriptor. 2417 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2418 Value *ScalarIV = Induction; 2419 if (IV != OldInduction) { 2420 ScalarIV = IV->getType()->isIntegerTy() 2421 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2422 : Builder.CreateCast(Instruction::SIToFP, Induction, 2423 IV->getType()); 2424 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2425 ScalarIV->setName("offset.idx"); 2426 } 2427 if (Trunc) { 2428 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2429 assert(Step->getType()->isIntegerTy() && 2430 "Truncation requires an integer step"); 2431 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2432 Step = Builder.CreateTrunc(Step, TruncType); 2433 } 2434 return ScalarIV; 2435 }; 2436 2437 // Create the vector values from the scalar IV, in the absence of creating a 2438 // vector IV. 2439 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2440 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2441 for (unsigned Part = 0; Part < UF; ++Part) { 2442 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2443 Value *EntryPart = 2444 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2445 ID.getInductionOpcode()); 2446 State.set(Def, EntryPart, Part); 2447 if (Trunc) 2448 addMetadata(EntryPart, Trunc); 2449 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2450 State, Part); 2451 } 2452 }; 2453 2454 // Fast-math-flags propagate from the original induction instruction. 
2455 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2456 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2457 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2458 2459 // Now do the actual transformations, and start with creating the step value. 2460 Value *Step = CreateStepValue(ID.getStep()); 2461 if (VF.isZero() || VF.isScalar()) { 2462 Value *ScalarIV = CreateScalarIV(Step); 2463 CreateSplatIV(ScalarIV, Step); 2464 return; 2465 } 2466 2467 // Determine if we want a scalar version of the induction variable. This is 2468 // true if the induction variable itself is not widened, or if it has at 2469 // least one user in the loop that is not widened. 2470 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2471 if (!NeedsScalarIV) { 2472 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2473 State); 2474 return; 2475 } 2476 2477 // Try to create a new independent vector induction variable. If we can't 2478 // create the phi node, we will splat the scalar induction variable in each 2479 // loop iteration. 2480 if (!shouldScalarizeInstruction(EntryVal)) { 2481 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2482 State); 2483 Value *ScalarIV = CreateScalarIV(Step); 2484 // Create scalar steps that can be used by instructions we will later 2485 // scalarize. Note that the addition of the scalar steps will not increase 2486 // the number of instructions in the loop in the common case prior to 2487 // InstCombine. We will be trading one vector extract for each scalar step. 2488 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2489 return; 2490 } 2491 2492 // All IV users are scalar instructions, so only emit a scalar IV, not a 2493 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2494 // predicate used by the masked loads/stores. 2495 Value *ScalarIV = CreateScalarIV(Step); 2496 if (!Cost->isScalarEpilogueAllowed()) 2497 CreateSplatIV(ScalarIV, Step); 2498 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2499 } 2500 2501 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2502 Instruction::BinaryOps BinOp) { 2503 // Create and check the types. 2504 auto *ValVTy = cast<VectorType>(Val->getType()); 2505 ElementCount VLen = ValVTy->getElementCount(); 2506 2507 Type *STy = Val->getType()->getScalarType(); 2508 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2509 "Induction Step must be an integer or FP"); 2510 assert(Step->getType() == STy && "Step has wrong type"); 2511 2512 SmallVector<Constant *, 8> Indices; 2513 2514 // Create a vector of consecutive numbers from zero to VF. 2515 VectorType *InitVecValVTy = ValVTy; 2516 Type *InitVecValSTy = STy; 2517 if (STy->isFloatingPointTy()) { 2518 InitVecValSTy = 2519 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2520 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2521 } 2522 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2523 2524 // Add on StartIdx 2525 Value *StartIdxSplat = Builder.CreateVectorSplat( 2526 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2527 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2528 2529 if (STy->isIntegerTy()) { 2530 Step = Builder.CreateVectorSplat(VLen, Step); 2531 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2532 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2533 // which can be found from the original scalar operations. 
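    // For example (illustrative): with VF = 4, StartIdx = 0, Val = splat(%x)
    // and Step = 2, InitVec is <0, 1, 2, 3>, so the returned value is
    // <%x, %x + 2, %x + 4, %x + 6>.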
2534 Step = Builder.CreateMul(InitVec, Step); 2535 return Builder.CreateAdd(Val, Step, "induction"); 2536 } 2537 2538 // Floating point induction. 2539 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2540 "Binary Opcode should be specified for FP induction"); 2541 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2542 Step = Builder.CreateVectorSplat(VLen, Step); 2543 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2544 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2545 } 2546 2547 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2548 Instruction *EntryVal, 2549 const InductionDescriptor &ID, 2550 VPValue *Def, VPValue *CastDef, 2551 VPTransformState &State) { 2552 // We shouldn't have to build scalar steps if we aren't vectorizing. 2553 assert(VF.isVector() && "VF should be greater than one"); 2554 // Get the value type and ensure it and the step have the same integer type. 2555 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2556 assert(ScalarIVTy == Step->getType() && 2557 "Val and Step should have the same type"); 2558 2559 // We build scalar steps for both integer and floating-point induction 2560 // variables. Here, we determine the kind of arithmetic we will perform. 2561 Instruction::BinaryOps AddOp; 2562 Instruction::BinaryOps MulOp; 2563 if (ScalarIVTy->isIntegerTy()) { 2564 AddOp = Instruction::Add; 2565 MulOp = Instruction::Mul; 2566 } else { 2567 AddOp = ID.getInductionOpcode(); 2568 MulOp = Instruction::FMul; 2569 } 2570 2571 // Determine the number of scalars we need to generate for each unroll 2572 // iteration. If EntryVal is uniform, we only need to generate the first 2573 // lane. Otherwise, we generate all VF values. 2574 bool IsUniform = 2575 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2576 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2577 // Compute the scalar steps and save the results in State. 2578 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2579 ScalarIVTy->getScalarSizeInBits()); 2580 Type *VecIVTy = nullptr; 2581 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2582 if (!IsUniform && VF.isScalable()) { 2583 VecIVTy = VectorType::get(ScalarIVTy, VF); 2584 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2585 SplatStep = Builder.CreateVectorSplat(VF, Step); 2586 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2587 } 2588 2589 for (unsigned Part = 0; Part < UF; ++Part) { 2590 Value *StartIdx0 = 2591 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2592 2593 if (!IsUniform && VF.isScalable()) { 2594 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2595 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2596 if (ScalarIVTy->isFloatingPointTy()) 2597 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2598 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2599 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2600 State.set(Def, Add, Part); 2601 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2602 Part); 2603 // It's useful to record the lane values too for the known minimum number 2604 // of elements so we do those below. This improves the code quality when 2605 // trying to extract the first element, for example. 
2606 } 2607 2608 if (ScalarIVTy->isFloatingPointTy()) 2609 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2610 2611 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2612 Value *StartIdx = Builder.CreateBinOp( 2613 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2614 // The step returned by `createStepForVF` is a runtime-evaluated value 2615 // when VF is scalable. Otherwise, it should be folded into a Constant. 2616 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2617 "Expected StartIdx to be folded to a constant when VF is not " 2618 "scalable"); 2619 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2620 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2621 State.set(Def, Add, VPIteration(Part, Lane)); 2622 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2623 Part, Lane); 2624 } 2625 } 2626 } 2627 2628 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2629 const VPIteration &Instance, 2630 VPTransformState &State) { 2631 Value *ScalarInst = State.get(Def, Instance); 2632 Value *VectorValue = State.get(Def, Instance.Part); 2633 VectorValue = Builder.CreateInsertElement( 2634 VectorValue, ScalarInst, 2635 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2636 State.set(Def, VectorValue, Instance.Part); 2637 } 2638 2639 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2640 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2641 return Builder.CreateVectorReverse(Vec, "reverse"); 2642 } 2643 2644 // Return whether we allow using masked interleave-groups (for dealing with 2645 // strided loads/stores that reside in predicated blocks, or for dealing 2646 // with gaps). 2647 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2648 // If an override option has been passed in for interleaved accesses, use it. 2649 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2650 return EnableMaskedInterleavedMemAccesses; 2651 2652 return TTI.enableMaskedInterleavedAccessVectorization(); 2653 } 2654 2655 // Try to vectorize the interleave group that \p Instr belongs to. 2656 // 2657 // E.g. Translate following interleaved load group (factor = 3): 2658 // for (i = 0; i < N; i+=3) { 2659 // R = Pic[i]; // Member of index 0 2660 // G = Pic[i+1]; // Member of index 1 2661 // B = Pic[i+2]; // Member of index 2 2662 // ... // do something to R, G, B 2663 // } 2664 // To: 2665 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2666 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2667 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2668 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2669 // 2670 // Or translate following interleaved store group (factor = 3): 2671 // for (i = 0; i < N; i+=3) { 2672 // ... 
do something to R, G, B 2673 // Pic[i] = R; // Member of index 0 2674 // Pic[i+1] = G; // Member of index 1 2675 // Pic[i+2] = B; // Member of index 2 2676 // } 2677 // To: 2678 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2679 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2680 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2681 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2682 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2683 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2684 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2685 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2686 VPValue *BlockInMask) { 2687 Instruction *Instr = Group->getInsertPos(); 2688 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2689 2690 // Prepare for the vector type of the interleaved load/store. 2691 Type *ScalarTy = getMemInstValueType(Instr); 2692 unsigned InterleaveFactor = Group->getFactor(); 2693 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2694 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2695 2696 // Prepare for the new pointers. 2697 SmallVector<Value *, 2> AddrParts; 2698 unsigned Index = Group->getIndex(Instr); 2699 2700 // TODO: extend the masked interleaved-group support to reversed access. 2701 assert((!BlockInMask || !Group->isReverse()) && 2702 "Reversed masked interleave-group not supported."); 2703 2704 // If the group is reverse, adjust the index to refer to the last vector lane 2705 // instead of the first. We adjust the index from the first vector lane, 2706 // rather than directly getting the pointer for lane VF - 1, because the 2707 // pointer operand of the interleaved access is supposed to be uniform. For 2708 // uniform instructions, we're only required to generate a value for the 2709 // first vector lane in each unroll iteration. 2710 if (Group->isReverse()) 2711 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2712 2713 for (unsigned Part = 0; Part < UF; Part++) { 2714 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2715 setDebugLocFromInst(Builder, AddrPart); 2716 2717 // Notice current instruction could be any index. Need to adjust the address 2718 // to the member of index 0. 2719 // 2720 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2721 // b = A[i]; // Member of index 0 2722 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2723 // 2724 // E.g. A[i+1] = a; // Member of index 1 2725 // A[i] = b; // Member of index 0 2726 // A[i+2] = c; // Member of index 2 (Current instruction) 2727 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2728 2729 bool InBounds = false; 2730 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2731 InBounds = gep->isInBounds(); 2732 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2733 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2734 2735 // Cast to the vector pointer type. 
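    // For example (a rough sketch, factor-3 i32 group with fixed VF = 4): the
    // GEP above has rewound the address to the member of index 0, and the
    // bitcast below turns it into a <12 x i32>* so a single wide access per
    // part covers all members of the group.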
2736 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2737 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2738 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2739 } 2740 2741 setDebugLocFromInst(Builder, Instr); 2742 Value *PoisonVec = PoisonValue::get(VecTy); 2743 2744 Value *MaskForGaps = nullptr; 2745 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2746 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2747 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2748 } 2749 2750 // Vectorize the interleaved load group. 2751 if (isa<LoadInst>(Instr)) { 2752 // For each unroll part, create a wide load for the group. 2753 SmallVector<Value *, 2> NewLoads; 2754 for (unsigned Part = 0; Part < UF; Part++) { 2755 Instruction *NewLoad; 2756 if (BlockInMask || MaskForGaps) { 2757 assert(useMaskedInterleavedAccesses(*TTI) && 2758 "masked interleaved groups are not allowed."); 2759 Value *GroupMask = MaskForGaps; 2760 if (BlockInMask) { 2761 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2762 Value *ShuffledMask = Builder.CreateShuffleVector( 2763 BlockInMaskPart, 2764 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2765 "interleaved.mask"); 2766 GroupMask = MaskForGaps 2767 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2768 MaskForGaps) 2769 : ShuffledMask; 2770 } 2771 NewLoad = 2772 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2773 GroupMask, PoisonVec, "wide.masked.vec"); 2774 } 2775 else 2776 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2777 Group->getAlign(), "wide.vec"); 2778 Group->addMetadata(NewLoad); 2779 NewLoads.push_back(NewLoad); 2780 } 2781 2782 // For each member in the group, shuffle out the appropriate data from the 2783 // wide loads. 2784 unsigned J = 0; 2785 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2786 Instruction *Member = Group->getMember(I); 2787 2788 // Skip the gaps in the group. 2789 if (!Member) 2790 continue; 2791 2792 auto StrideMask = 2793 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2794 for (unsigned Part = 0; Part < UF; Part++) { 2795 Value *StridedVec = Builder.CreateShuffleVector( 2796 NewLoads[Part], StrideMask, "strided.vec"); 2797 2798 // If this member has different type, cast the result type. 2799 if (Member->getType() != ScalarTy) { 2800 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2801 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2802 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2803 } 2804 2805 if (Group->isReverse()) 2806 StridedVec = reverseVector(StridedVec); 2807 2808 State.set(VPDefs[J], StridedVec, Part); 2809 } 2810 ++J; 2811 } 2812 return; 2813 } 2814 2815 // The sub vector type for current instruction. 2816 auto *SubVT = VectorType::get(ScalarTy, VF); 2817 2818 // Vectorize the interleaved store group. 2819 for (unsigned Part = 0; Part < UF; Part++) { 2820 // Collect the stored vector from each member. 2821 SmallVector<Value *, 4> StoredVecs; 2822 for (unsigned i = 0; i < InterleaveFactor; i++) { 2823 // Interleaved store group doesn't allow a gap, so each index has a member 2824 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2825 2826 Value *StoredVec = State.get(StoredValues[i], Part); 2827 2828 if (Group->isReverse()) 2829 StoredVec = reverseVector(StoredVec); 2830 2831 // If this member has different type, cast it to a unified type. 
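      // (Sketch: if, say, one member stores <4 x float> while the group's
      // unified element type is i32, the value is bit/pointer-cast to
      // <4 x i32> here; createBitOrPointerCast handles the same-width
      // int <-> float <-> pointer cases.)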
2832 2833 if (StoredVec->getType() != SubVT) 2834 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2835 2836 StoredVecs.push_back(StoredVec); 2837 } 2838 2839 // Concatenate all vectors into a wide vector. 2840 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2841 2842 // Interleave the elements in the wide vector. 2843 Value *IVec = Builder.CreateShuffleVector( 2844 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2845 "interleaved.vec"); 2846 2847 Instruction *NewStoreInstr; 2848 if (BlockInMask) { 2849 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2850 Value *ShuffledMask = Builder.CreateShuffleVector( 2851 BlockInMaskPart, 2852 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2853 "interleaved.mask"); 2854 NewStoreInstr = Builder.CreateMaskedStore( 2855 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2856 } 2857 else 2858 NewStoreInstr = 2859 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2860 2861 Group->addMetadata(NewStoreInstr); 2862 } 2863 } 2864 2865 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2866 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2867 VPValue *StoredValue, VPValue *BlockInMask) { 2868 // Attempt to issue a wide load. 2869 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2870 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2871 2872 assert((LI || SI) && "Invalid Load/Store instruction"); 2873 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2874 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2875 2876 LoopVectorizationCostModel::InstWidening Decision = 2877 Cost->getWideningDecision(Instr, VF); 2878 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2879 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2880 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2881 "CM decision is not to widen the memory instruction"); 2882 2883 Type *ScalarDataTy = getMemInstValueType(Instr); 2884 2885 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2886 const Align Alignment = getLoadStoreAlignment(Instr); 2887 2888 // Determine if the pointer operand of the access is either consecutive or 2889 // reverse consecutive. 2890 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2891 bool ConsecutiveStride = 2892 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2893 bool CreateGatherScatter = 2894 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2895 2896 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2897 // gather/scatter. Otherwise Decision should have been to Scalarize. 2898 assert((ConsecutiveStride || CreateGatherScatter) && 2899 "The instruction should be scalarized"); 2900 (void)ConsecutiveStride; 2901 2902 VectorParts BlockInMaskParts(UF); 2903 bool isMaskRequired = BlockInMask; 2904 if (isMaskRequired) 2905 for (unsigned Part = 0; Part < UF; ++Part) 2906 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2907 2908 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2909 // Calculate the pointer for the specific unroll-part. 2910 GetElementPtrInst *PartPtr = nullptr; 2911 2912 bool InBounds = false; 2913 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2914 InBounds = gep->isInBounds(); 2915 if (Reverse) { 2916 // If the address is consecutive but reversed, then the 2917 // wide store needs to start at the last vector element. 
2918 // RunTimeVF = VScale * VF.getKnownMinValue() 2919 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2920 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2921 // NumElt = -Part * RunTimeVF 2922 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2923 // LastLane = 1 - RunTimeVF 2924 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2925 PartPtr = 2926 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2927 PartPtr->setIsInBounds(InBounds); 2928 PartPtr = cast<GetElementPtrInst>( 2929 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2930 PartPtr->setIsInBounds(InBounds); 2931 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2932 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2933 } else { 2934 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2935 PartPtr = cast<GetElementPtrInst>( 2936 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2937 PartPtr->setIsInBounds(InBounds); 2938 } 2939 2940 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2941 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2942 }; 2943 2944 // Handle Stores: 2945 if (SI) { 2946 setDebugLocFromInst(Builder, SI); 2947 2948 for (unsigned Part = 0; Part < UF; ++Part) { 2949 Instruction *NewSI = nullptr; 2950 Value *StoredVal = State.get(StoredValue, Part); 2951 if (CreateGatherScatter) { 2952 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2953 Value *VectorGep = State.get(Addr, Part); 2954 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2955 MaskPart); 2956 } else { 2957 if (Reverse) { 2958 // If we store to reverse consecutive memory locations, then we need 2959 // to reverse the order of elements in the stored value. 2960 StoredVal = reverseVector(StoredVal); 2961 // We don't want to update the value in the map as it might be used in 2962 // another expression. So don't call resetVectorValue(StoredVal). 2963 } 2964 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2965 if (isMaskRequired) 2966 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2967 BlockInMaskParts[Part]); 2968 else 2969 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2970 } 2971 addMetadata(NewSI, SI); 2972 } 2973 return; 2974 } 2975 2976 // Handle loads. 2977 assert(LI && "Must have a load instruction"); 2978 setDebugLocFromInst(Builder, LI); 2979 for (unsigned Part = 0; Part < UF; ++Part) { 2980 Value *NewLI; 2981 if (CreateGatherScatter) { 2982 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2983 Value *VectorGep = State.get(Addr, Part); 2984 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2985 nullptr, "wide.masked.gather"); 2986 addMetadata(NewLI, LI); 2987 } else { 2988 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2989 if (isMaskRequired) 2990 NewLI = Builder.CreateMaskedLoad( 2991 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2992 "wide.masked.load"); 2993 else 2994 NewLI = 2995 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2996 2997 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
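      // For example, with a fixed VF of 4 the reverse below is roughly
      //   %reverse = shufflevector <4 x i32> %wide.load, <4 x i32> poison,
      //                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>
      // (the i32 element type is only for illustration).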
2998 addMetadata(NewLI, LI); 2999 if (Reverse) 3000 NewLI = reverseVector(NewLI); 3001 } 3002 3003 State.set(Def, NewLI, Part); 3004 } 3005 } 3006 3007 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3008 VPUser &User, 3009 const VPIteration &Instance, 3010 bool IfPredicateInstr, 3011 VPTransformState &State) { 3012 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3013 3014 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3015 // the first lane and part. 3016 if (isa<NoAliasScopeDeclInst>(Instr)) 3017 if (!Instance.isFirstIteration()) 3018 return; 3019 3020 setDebugLocFromInst(Builder, Instr); 3021 3022 // Does this instruction return a value ? 3023 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3024 3025 Instruction *Cloned = Instr->clone(); 3026 if (!IsVoidRetTy) 3027 Cloned->setName(Instr->getName() + ".cloned"); 3028 3029 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3030 Builder.GetInsertPoint()); 3031 // Replace the operands of the cloned instructions with their scalar 3032 // equivalents in the new loop. 3033 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3034 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3035 auto InputInstance = Instance; 3036 if (!Operand || !OrigLoop->contains(Operand) || 3037 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3038 InputInstance.Lane = VPLane::getFirstLane(); 3039 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3040 Cloned->setOperand(op, NewOp); 3041 } 3042 addNewMetadata(Cloned, Instr); 3043 3044 // Place the cloned scalar in the new loop. 3045 Builder.Insert(Cloned); 3046 3047 State.set(Def, Cloned, Instance); 3048 3049 // If we just cloned a new assumption, add it the assumption cache. 3050 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3051 AC->registerAssumption(II); 3052 3053 // End if-block. 3054 if (IfPredicateInstr) 3055 PredicatedInstructions.push_back(Cloned); 3056 } 3057 3058 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3059 Value *End, Value *Step, 3060 Instruction *DL) { 3061 BasicBlock *Header = L->getHeader(); 3062 BasicBlock *Latch = L->getLoopLatch(); 3063 // As we're just creating this loop, it's possible no latch exists 3064 // yet. If so, use the header as this will be a single block loop. 3065 if (!Latch) 3066 Latch = Header; 3067 3068 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3069 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3070 setDebugLocFromInst(Builder, OldInst); 3071 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3072 3073 Builder.SetInsertPoint(Latch->getTerminator()); 3074 setDebugLocFromInst(Builder, OldInst); 3075 3076 // Create i+1 and fill the PHINode. 3077 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3078 Induction->addIncoming(Start, L->getLoopPreheader()); 3079 Induction->addIncoming(Next, Latch); 3080 // Create the compare. 3081 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3082 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3083 3084 // Now we have two terminators. Remove the old one from the block. 
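  // At this point the latch ends, roughly, with (names illustrative):
  //   %index.next = add i64 %index, %step
  //   %cmp = icmp eq i64 %index.next, %end
  //   br i1 %cmp, label %exit, label %header
  // in addition to the original terminator, which is erased below.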
3085 Latch->getTerminator()->eraseFromParent(); 3086 3087 return Induction; 3088 } 3089 3090 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3091 if (TripCount) 3092 return TripCount; 3093 3094 assert(L && "Create Trip Count for null loop."); 3095 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3096 // Find the loop boundaries. 3097 ScalarEvolution *SE = PSE.getSE(); 3098 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3099 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3100 "Invalid loop count"); 3101 3102 Type *IdxTy = Legal->getWidestInductionType(); 3103 assert(IdxTy && "No type for induction"); 3104 3105 // The exit count might have the type of i64 while the phi is i32. This can 3106 // happen if we have an induction variable that is sign extended before the 3107 // compare. The only way that we get a backedge taken count is that the 3108 // induction variable was signed and as such will not overflow. In such a case 3109 // truncation is legal. 3110 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3111 IdxTy->getPrimitiveSizeInBits()) 3112 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3113 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3114 3115 // Get the total trip count from the count by adding 1. 3116 const SCEV *ExitCount = SE->getAddExpr( 3117 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3118 3119 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3120 3121 // Expand the trip count and place the new instructions in the preheader. 3122 // Notice that the pre-header does not change, only the loop body. 3123 SCEVExpander Exp(*SE, DL, "induction"); 3124 3125 // Count holds the overall loop count (N). 3126 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3127 L->getLoopPreheader()->getTerminator()); 3128 3129 if (TripCount->getType()->isPointerTy()) 3130 TripCount = 3131 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3132 L->getLoopPreheader()->getTerminator()); 3133 3134 return TripCount; 3135 } 3136 3137 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3138 if (VectorTripCount) 3139 return VectorTripCount; 3140 3141 Value *TC = getOrCreateTripCount(L); 3142 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3143 3144 Type *Ty = TC->getType(); 3145 // This is where we can make the step a runtime constant. 3146 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3147 3148 // If the tail is to be folded by masking, round the number of iterations N 3149 // up to a multiple of Step instead of rounding down. This is done by first 3150 // adding Step-1 and then rounding down. Note that it's ok if this addition 3151 // overflows: the vector induction variable will eventually wrap to zero given 3152 // that it starts at zero and its Step is a power of two; the loop will then 3153 // exit, with the last early-exit vector comparison also producing all-true. 3154 if (Cost->foldTailByMasking()) { 3155 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3156 "VF*UF must be a power of 2 when folding tail by masking"); 3157 assert(!VF.isScalable() && 3158 "Tail folding not yet supported for scalable vectors"); 3159 TC = Builder.CreateAdd( 3160 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3161 } 3162 3163 // Now we need to generate the expression for the part of the loop that the 3164 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3165 // iterations are not required for correctness, or N - Step, otherwise. Step 3166 // is equal to the vectorization factor (number of SIMD elements) times the 3167 // unroll factor (number of SIMD instructions). 3168 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3169 3170 // There are two cases where we need to ensure (at least) the last iteration 3171 // runs in the scalar remainder loop. Thus, if the step evenly divides 3172 // the trip count, we set the remainder to be equal to the step. If the step 3173 // does not evenly divide the trip count, no adjustment is necessary since 3174 // there will already be scalar iterations. Note that the minimum iterations 3175 // check ensures that N >= Step. The cases are: 3176 // 1) If there is a non-reversed interleaved group that may speculatively 3177 // access memory out-of-bounds. 3178 // 2) If any instruction may follow a conditionally taken exit. That is, if 3179 // the loop contains multiple exiting blocks, or a single exiting block 3180 // which is not the latch. 3181 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3182 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3183 R = Builder.CreateSelect(IsZero, Step, R); 3184 } 3185 3186 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3187 3188 return VectorTripCount; 3189 } 3190 3191 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3192 const DataLayout &DL) { 3193 // Verify that V is a vector type with same number of elements as DstVTy. 3194 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3195 unsigned VF = DstFVTy->getNumElements(); 3196 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3197 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3198 Type *SrcElemTy = SrcVecTy->getElementType(); 3199 Type *DstElemTy = DstFVTy->getElementType(); 3200 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3201 "Vector elements must have same size"); 3202 3203 // Do a direct cast if element types are castable. 3204 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3205 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3206 } 3207 // V cannot be directly casted to desired vector type. 3208 // May happen when V is a floating point vector but DstVTy is a vector of 3209 // pointers or vice-versa. Handle this using a two-step bitcast using an 3210 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3211 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3212 "Only one type should be a pointer type"); 3213 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3214 "Only one type should be a floating point type"); 3215 Type *IntTy = 3216 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3217 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3218 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3219 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3220 } 3221 3222 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3223 BasicBlock *Bypass) { 3224 Value *Count = getOrCreateTripCount(L); 3225 // Reuse existing vector loop preheader for TC checks. 3226 // Note that new preheader block is generated for vector loop. 
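  // For illustration (fixed VF = 4, UF = 2, hypothetical names), the check
  // emitted below is roughly:
  //   %min.iters.check = icmp ult i64 %count, 8
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
  // so loops with too few iterations bypass the vector loop entirely.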
3227 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3228 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3229 3230 // Generate code to check if the loop's trip count is less than VF * UF, or 3231 // equal to it in case a scalar epilogue is required; this implies that the 3232 // vector trip count is zero. This check also covers the case where adding one 3233 // to the backedge-taken count overflowed leading to an incorrect trip count 3234 // of zero. In this case we will also jump to the scalar loop. 3235 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3236 : ICmpInst::ICMP_ULT; 3237 3238 // If tail is to be folded, vector loop takes care of all iterations. 3239 Value *CheckMinIters = Builder.getFalse(); 3240 if (!Cost->foldTailByMasking()) { 3241 Value *Step = 3242 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3243 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3244 } 3245 // Create new preheader for vector loop. 3246 LoopVectorPreHeader = 3247 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3248 "vector.ph"); 3249 3250 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3251 DT->getNode(Bypass)->getIDom()) && 3252 "TC check is expected to dominate Bypass"); 3253 3254 // Update dominator for Bypass & LoopExit. 3255 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3256 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3257 3258 ReplaceInstWithInst( 3259 TCCheckBlock->getTerminator(), 3260 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3261 LoopBypassBlocks.push_back(TCCheckBlock); 3262 } 3263 3264 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3265 3266 BasicBlock *const SCEVCheckBlock = 3267 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3268 if (!SCEVCheckBlock) 3269 return nullptr; 3270 3271 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3272 (OptForSizeBasedOnProfile && 3273 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3274 "Cannot SCEV check stride or overflow when optimizing for size"); 3275 3276 3277 // Update dominator only if this is first RT check. 3278 if (LoopBypassBlocks.empty()) { 3279 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3280 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3281 } 3282 3283 LoopBypassBlocks.push_back(SCEVCheckBlock); 3284 AddedSafetyChecks = true; 3285 return SCEVCheckBlock; 3286 } 3287 3288 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3289 BasicBlock *Bypass) { 3290 // VPlan-native path does not do any analysis for runtime checks currently. 3291 if (EnableVPlanNativePath) 3292 return nullptr; 3293 3294 BasicBlock *const MemCheckBlock = 3295 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3296 3297 // Check if we generated code that checks in runtime if arrays overlap. We put 3298 // the checks into a separate block to make the more common case of few 3299 // elements faster. 
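  // Sketch of the resulting control flow (block names illustrative):
  //   vector.memcheck:
  //     %conflict = ... pairwise pointer-overlap checks ...
  //     br i1 %conflict, label %scalar.ph, label %vector.ph
  // i.e. potentially aliasing accesses fall back to the original scalar loop.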
3300 if (!MemCheckBlock) 3301 return nullptr; 3302 3303 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3304 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3305 "Cannot emit memory checks when optimizing for size, unless forced " 3306 "to vectorize."); 3307 ORE->emit([&]() { 3308 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3309 L->getStartLoc(), L->getHeader()) 3310 << "Code-size may be reduced by not forcing " 3311 "vectorization, or by source-code modifications " 3312 "eliminating the need for runtime checks " 3313 "(e.g., adding 'restrict')."; 3314 }); 3315 } 3316 3317 LoopBypassBlocks.push_back(MemCheckBlock); 3318 3319 AddedSafetyChecks = true; 3320 3321 // We currently don't use LoopVersioning for the actual loop cloning but we 3322 // still use it to add the noalias metadata. 3323 LVer = std::make_unique<LoopVersioning>( 3324 *Legal->getLAI(), 3325 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3326 DT, PSE.getSE()); 3327 LVer->prepareNoAliasMetadata(); 3328 return MemCheckBlock; 3329 } 3330 3331 Value *InnerLoopVectorizer::emitTransformedIndex( 3332 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3333 const InductionDescriptor &ID) const { 3334 3335 SCEVExpander Exp(*SE, DL, "induction"); 3336 auto Step = ID.getStep(); 3337 auto StartValue = ID.getStartValue(); 3338 assert(Index->getType()->getScalarType() == Step->getType() && 3339 "Index scalar type does not match StepValue type"); 3340 3341 // Note: the IR at this point is broken. We cannot use SE to create any new 3342 // SCEV and then expand it, hoping that SCEV's simplification will give us 3343 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3344 // lead to various SCEV crashes. So all we can do is to use builder and rely 3345 // on InstCombine for future simplifications. Here we handle some trivial 3346 // cases only. 3347 auto CreateAdd = [&B](Value *X, Value *Y) { 3348 assert(X->getType() == Y->getType() && "Types don't match!"); 3349 if (auto *CX = dyn_cast<ConstantInt>(X)) 3350 if (CX->isZero()) 3351 return Y; 3352 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3353 if (CY->isZero()) 3354 return X; 3355 return B.CreateAdd(X, Y); 3356 }; 3357 3358 // We allow X to be a vector type, in which case Y will potentially be 3359 // splatted into a vector with the same element count. 3360 auto CreateMul = [&B](Value *X, Value *Y) { 3361 assert(X->getType()->getScalarType() == Y->getType() && 3362 "Types don't match!"); 3363 if (auto *CX = dyn_cast<ConstantInt>(X)) 3364 if (CX->isOne()) 3365 return Y; 3366 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3367 if (CY->isOne()) 3368 return X; 3369 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3370 if (XVTy && !isa<VectorType>(Y->getType())) 3371 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3372 return B.CreateMul(X, Y); 3373 }; 3374 3375 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3376 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3377 // the DomTree is not kept up-to-date for additional blocks generated in the 3378 // vector loop. By using the header as insertion point, we guarantee that the 3379 // expanded instructions dominate all their uses. 
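  // Informally, the switch below computes (Step and StartValue taken from the
  // induction descriptor):
  //   integer IV:  StartValue + Index * Step
  //   pointer IV:  gep StartValue, Index * Step
  //   FP IV:       StartValue fadd/fsub Index * Step
  // with the trivial zero/one cases folded by the helpers above.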
3380 auto GetInsertPoint = [this, &B]() { 3381 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3382 if (InsertBB != LoopVectorBody && 3383 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3384 return LoopVectorBody->getTerminator(); 3385 return &*B.GetInsertPoint(); 3386 }; 3387 3388 switch (ID.getKind()) { 3389 case InductionDescriptor::IK_IntInduction: { 3390 assert(!isa<VectorType>(Index->getType()) && 3391 "Vector indices not supported for integer inductions yet"); 3392 assert(Index->getType() == StartValue->getType() && 3393 "Index type does not match StartValue type"); 3394 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3395 return B.CreateSub(StartValue, Index); 3396 auto *Offset = CreateMul( 3397 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3398 return CreateAdd(StartValue, Offset); 3399 } 3400 case InductionDescriptor::IK_PtrInduction: { 3401 assert(isa<SCEVConstant>(Step) && 3402 "Expected constant step for pointer induction"); 3403 return B.CreateGEP( 3404 StartValue->getType()->getPointerElementType(), StartValue, 3405 CreateMul(Index, 3406 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3407 GetInsertPoint()))); 3408 } 3409 case InductionDescriptor::IK_FpInduction: { 3410 assert(!isa<VectorType>(Index->getType()) && 3411 "Vector indices not supported for FP inductions yet"); 3412 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3413 auto InductionBinOp = ID.getInductionBinOp(); 3414 assert(InductionBinOp && 3415 (InductionBinOp->getOpcode() == Instruction::FAdd || 3416 InductionBinOp->getOpcode() == Instruction::FSub) && 3417 "Original bin op should be defined for FP induction"); 3418 3419 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3420 Value *MulExp = B.CreateFMul(StepValue, Index); 3421 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3422 "induction"); 3423 } 3424 case InductionDescriptor::IK_NoInduction: 3425 return nullptr; 3426 } 3427 llvm_unreachable("invalid enum"); 3428 } 3429 3430 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3431 LoopScalarBody = OrigLoop->getHeader(); 3432 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3433 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3434 assert(LoopExitBlock && "Must have an exit block"); 3435 assert(LoopVectorPreHeader && "Invalid loop structure"); 3436 3437 LoopMiddleBlock = 3438 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3439 LI, nullptr, Twine(Prefix) + "middle.block"); 3440 LoopScalarPreHeader = 3441 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3442 nullptr, Twine(Prefix) + "scalar.ph"); 3443 3444 // Set up branch from middle block to the exit and scalar preheader blocks. 3445 // completeLoopSkeleton will update the condition to use an iteration check, 3446 // if required to decide whether to execute the remainder. 3447 BranchInst *BrInst = 3448 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3449 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3450 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3451 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3452 3453 // We intentionally don't let SplitBlock to update LoopInfo since 3454 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3455 // LoopVectorBody is explicitly added to the correct place few lines later. 
3456 LoopVectorBody = 3457 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3458 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3459 3460 // Update dominator for loop exit. 3461 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3462 3463 // Create and register the new vector loop. 3464 Loop *Lp = LI->AllocateLoop(); 3465 Loop *ParentLoop = OrigLoop->getParentLoop(); 3466 3467 // Insert the new loop into the loop nest and register the new basic blocks 3468 // before calling any utilities such as SCEV that require valid LoopInfo. 3469 if (ParentLoop) { 3470 ParentLoop->addChildLoop(Lp); 3471 } else { 3472 LI->addTopLevelLoop(Lp); 3473 } 3474 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3475 return Lp; 3476 } 3477 3478 void InnerLoopVectorizer::createInductionResumeValues( 3479 Loop *L, Value *VectorTripCount, 3480 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3481 assert(VectorTripCount && L && "Expected valid arguments"); 3482 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3483 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3484 "Inconsistent information about additional bypass."); 3485 // We are going to resume the execution of the scalar loop. 3486 // Go over all of the induction variables that we found and fix the 3487 // PHIs that are left in the scalar version of the loop. 3488 // The starting values of PHI nodes depend on the counter of the last 3489 // iteration in the vectorized loop. 3490 // If we come from a bypass edge then we need to start from the original 3491 // start value. 3492 for (auto &InductionEntry : Legal->getInductionVars()) { 3493 PHINode *OrigPhi = InductionEntry.first; 3494 InductionDescriptor II = InductionEntry.second; 3495 3496 // Create phi nodes to merge from the backedge-taken check block. 3497 PHINode *BCResumeVal = 3498 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3499 LoopScalarPreHeader->getTerminator()); 3500 // Copy original phi DL over to the new one. 3501 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3502 Value *&EndValue = IVEndValues[OrigPhi]; 3503 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3504 if (OrigPhi == OldInduction) { 3505 // We know what the end value is. 3506 EndValue = VectorTripCount; 3507 } else { 3508 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3509 3510 // Fast-math-flags propagate from the original induction instruction. 3511 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3512 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3513 3514 Type *StepType = II.getStep()->getType(); 3515 Instruction::CastOps CastOp = 3516 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3517 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3518 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3519 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3520 EndValue->setName("ind.end"); 3521 3522 // Compute the end value for the additional bypass (if applicable). 
3523 if (AdditionalBypass.first) { 3524 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3525 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3526 StepType, true); 3527 CRD = 3528 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3529 EndValueFromAdditionalBypass = 3530 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3531 EndValueFromAdditionalBypass->setName("ind.end"); 3532 } 3533 } 3534 // The new PHI merges the original incoming value, in case of a bypass, 3535 // or the value at the end of the vectorized loop. 3536 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3537 3538 // Fix the scalar body counter (PHI node). 3539 // The old induction's phi node in the scalar body needs the truncated 3540 // value. 3541 for (BasicBlock *BB : LoopBypassBlocks) 3542 BCResumeVal->addIncoming(II.getStartValue(), BB); 3543 3544 if (AdditionalBypass.first) 3545 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3546 EndValueFromAdditionalBypass); 3547 3548 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3549 } 3550 } 3551 3552 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3553 MDNode *OrigLoopID) { 3554 assert(L && "Expected valid loop."); 3555 3556 // The trip counts should be cached by now. 3557 Value *Count = getOrCreateTripCount(L); 3558 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3559 3560 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3561 3562 // Add a check in the middle block to see if we have completed 3563 // all of the iterations in the first vector loop. 3564 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3565 // If tail is to be folded, we know we don't need to run the remainder. 3566 if (!Cost->foldTailByMasking()) { 3567 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3568 Count, VectorTripCount, "cmp.n", 3569 LoopMiddleBlock->getTerminator()); 3570 3571 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3572 // of the corresponding compare because they may have ended up with 3573 // different line numbers and we want to avoid awkward line stepping while 3574 // debugging. Eg. if the compare has got a line number inside the loop. 3575 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3576 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3577 } 3578 3579 // Get ready to start creating new instructions into the vectorized body. 3580 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3581 "Inconsistent vector loop preheader"); 3582 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3583 3584 Optional<MDNode *> VectorizedLoopID = 3585 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3586 LLVMLoopVectorizeFollowupVectorized}); 3587 if (VectorizedLoopID.hasValue()) { 3588 L->setLoopID(VectorizedLoopID.getValue()); 3589 3590 // Do not setAlreadyVectorized if loop attributes have been defined 3591 // explicitly. 3592 return LoopVectorPreHeader; 3593 } 3594 3595 // Keep all loop hints from the original loop on the vector loop (we'll 3596 // replace the vectorizer-specific hints below). 
3597 if (MDNode *LID = OrigLoop->getLoopID()) 3598 L->setLoopID(LID); 3599 3600 LoopVectorizeHints Hints(L, true, *ORE); 3601 Hints.setAlreadyVectorized(); 3602 3603 #ifdef EXPENSIVE_CHECKS 3604 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3605 LI->verify(*DT); 3606 #endif 3607 3608 return LoopVectorPreHeader; 3609 } 3610 3611 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3612 /* 3613 In this function we generate a new loop. The new loop will contain 3614 the vectorized instructions while the old loop will continue to run the 3615 scalar remainder. 3616 3617 [ ] <-- loop iteration number check. 3618 / | 3619 / v 3620 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3621 | / | 3622 | / v 3623 || [ ] <-- vector pre header. 3624 |/ | 3625 | v 3626 | [ ] \ 3627 | [ ]_| <-- vector loop. 3628 | | 3629 | v 3630 | -[ ] <--- middle-block. 3631 | / | 3632 | / v 3633 -|- >[ ] <--- new preheader. 3634 | | 3635 | v 3636 | [ ] \ 3637 | [ ]_| <-- old scalar loop to handle remainder. 3638 \ | 3639 \ v 3640 >[ ] <-- exit block. 3641 ... 3642 */ 3643 3644 // Get the metadata of the original loop before it gets modified. 3645 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3646 3647 // Workaround! Compute the trip count of the original loop and cache it 3648 // before we start modifying the CFG. This code has a systemic problem 3649 // wherein it tries to run analysis over partially constructed IR; this is 3650 // wrong, and not simply for SCEV. The trip count of the original loop 3651 // simply happens to be prone to hitting this in practice. In theory, we 3652 // can hit the same issue for any SCEV, or ValueTracking query done during 3653 // mutation. See PR49900. 3654 getOrCreateTripCount(OrigLoop); 3655 3656 // Create an empty vector loop, and prepare basic blocks for the runtime 3657 // checks. 3658 Loop *Lp = createVectorLoopSkeleton(""); 3659 3660 // Now, compare the new count to zero. If it is zero skip the vector loop and 3661 // jump to the scalar loop. This check also covers the case where the 3662 // backedge-taken count is uint##_max: adding one to it will overflow leading 3663 // to an incorrect trip count of zero. In this (rare) case we will also jump 3664 // to the scalar loop. 3665 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3666 3667 // Generate the code to check any assumptions that we've made for SCEV 3668 // expressions. 3669 emitSCEVChecks(Lp, LoopScalarPreHeader); 3670 3671 // Generate the code that checks in runtime if arrays overlap. We put the 3672 // checks into a separate block to make the more common case of few elements 3673 // faster. 3674 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3675 3676 // Some loops have a single integer induction variable, while other loops 3677 // don't. One example is c++ iterators that often have multiple pointer 3678 // induction variables. In the code below we also support a case where we 3679 // don't have a single induction variable. 3680 // 3681 // We try to obtain an induction variable from the original loop as hard 3682 // as possible. However if we don't find one that: 3683 // - is an integer 3684 // - counts from zero, stepping by one 3685 // - is the size of the widest induction variable type 3686 // then we create a new one. 
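  // The canonical vector IV built below starts at 0 and is bumped by VF * UF
  // per iteration; e.g. (illustrative, fixed VF = 4, UF = 2) it takes the
  // values 0, 8, 16, ... and the loop exits once it reaches the vector trip
  // count n.vec computed earlier.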
3687 OldInduction = Legal->getPrimaryInduction(); 3688 Type *IdxTy = Legal->getWidestInductionType(); 3689 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3690 // The loop step is equal to the vectorization factor (num of SIMD elements) 3691 // times the unroll factor (num of SIMD instructions). 3692 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3693 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3694 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3695 Induction = 3696 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3697 getDebugLocFromInstOrOperands(OldInduction)); 3698 3699 // Emit phis for the new starting index of the scalar loop. 3700 createInductionResumeValues(Lp, CountRoundDown); 3701 3702 return completeLoopSkeleton(Lp, OrigLoopID); 3703 } 3704 3705 // Fix up external users of the induction variable. At this point, we are 3706 // in LCSSA form, with all external PHIs that use the IV having one input value, 3707 // coming from the remainder loop. We need those PHIs to also have a correct 3708 // value for the IV when arriving directly from the middle block. 3709 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3710 const InductionDescriptor &II, 3711 Value *CountRoundDown, Value *EndValue, 3712 BasicBlock *MiddleBlock) { 3713 // There are two kinds of external IV usages - those that use the value 3714 // computed in the last iteration (the PHI) and those that use the penultimate 3715 // value (the value that feeds into the phi from the loop latch). 3716 // We allow both, but they, obviously, have different values. 3717 3718 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3719 3720 DenseMap<Value *, Value *> MissingVals; 3721 3722 // An external user of the last iteration's value should see the value that 3723 // the remainder loop uses to initialize its own IV. 3724 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3725 for (User *U : PostInc->users()) { 3726 Instruction *UI = cast<Instruction>(U); 3727 if (!OrigLoop->contains(UI)) { 3728 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3729 MissingVals[UI] = EndValue; 3730 } 3731 } 3732 3733 // An external user of the penultimate value need to see EndValue - Step. 3734 // The simplest way to get this is to recompute it from the constituent SCEVs, 3735 // that is Start + (Step * (CRD - 1)). 3736 for (User *U : OrigPhi->users()) { 3737 auto *UI = cast<Instruction>(U); 3738 if (!OrigLoop->contains(UI)) { 3739 const DataLayout &DL = 3740 OrigLoop->getHeader()->getModule()->getDataLayout(); 3741 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3742 3743 IRBuilder<> B(MiddleBlock->getTerminator()); 3744 3745 // Fast-math-flags propagate from the original induction instruction. 3746 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3747 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3748 3749 Value *CountMinusOne = B.CreateSub( 3750 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3751 Value *CMO = 3752 !II.getStep()->getType()->isIntegerTy() 3753 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3754 II.getStep()->getType()) 3755 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3756 CMO->setName("cast.cmo"); 3757 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3758 Escape->setName("ind.escape"); 3759 MissingVals[UI] = Escape; 3760 } 3761 } 3762 3763 for (auto &I : MissingVals) { 3764 PHINode *PHI = cast<PHINode>(I.first); 3765 // One corner case we have to handle is two IVs "chasing" each-other, 3766 // that is %IV2 = phi [...], [ %IV1, %latch ] 3767 // In this case, if IV1 has an external use, we need to avoid adding both 3768 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3769 // don't already have an incoming value for the middle block. 3770 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3771 PHI->addIncoming(I.second, MiddleBlock); 3772 } 3773 } 3774 3775 namespace { 3776 3777 struct CSEDenseMapInfo { 3778 static bool canHandle(const Instruction *I) { 3779 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3780 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3781 } 3782 3783 static inline Instruction *getEmptyKey() { 3784 return DenseMapInfo<Instruction *>::getEmptyKey(); 3785 } 3786 3787 static inline Instruction *getTombstoneKey() { 3788 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3789 } 3790 3791 static unsigned getHashValue(const Instruction *I) { 3792 assert(canHandle(I) && "Unknown instruction!"); 3793 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3794 I->value_op_end())); 3795 } 3796 3797 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3798 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3799 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3800 return LHS == RHS; 3801 return LHS->isIdenticalTo(RHS); 3802 } 3803 }; 3804 3805 } // end anonymous namespace 3806 3807 ///Perform cse of induction variable instructions. 3808 static void cse(BasicBlock *BB) { 3809 // Perform simple cse. 3810 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3811 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3812 Instruction *In = &*I++; 3813 3814 if (!CSEDenseMapInfo::canHandle(In)) 3815 continue; 3816 3817 // Check if we can replace this instruction with any of the 3818 // visited instructions. 3819 if (Instruction *V = CSEMap.lookup(In)) { 3820 In->replaceAllUsesWith(V); 3821 In->eraseFromParent(); 3822 continue; 3823 } 3824 3825 CSEMap[In] = In; 3826 } 3827 } 3828 3829 InstructionCost 3830 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3831 bool &NeedToScalarize) const { 3832 Function *F = CI->getCalledFunction(); 3833 Type *ScalarRetTy = CI->getType(); 3834 SmallVector<Type *, 4> Tys, ScalarTys; 3835 for (auto &ArgOp : CI->arg_operands()) 3836 ScalarTys.push_back(ArgOp->getType()); 3837 3838 // Estimate cost of scalarized vector call. The source operands are assumed 3839 // to be vectors, so we need to extract individual elements from there, 3840 // execute VF scalar calls, and then gather the result into the vector return 3841 // value. 3842 InstructionCost ScalarCallCost = 3843 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3844 if (VF.isScalar()) 3845 return ScalarCallCost; 3846 3847 // Compute corresponding vector type for return value and arguments. 
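  // Informally, the comparison made below is VF * (scalar call cost) +
  // extract/insert overhead versus the cost of one call to a vector library
  // variant, if the TLI provides one. E.g., with purely hypothetical numbers,
  // VF = 4, scalar call = 10 and overhead = 6 gives 46, so a vector variant
  // costing 20 wins and NeedToScalarize is cleared.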
3848 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3849 for (Type *ScalarTy : ScalarTys) 3850 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3851 3852 // Compute costs of unpacking argument values for the scalar calls and 3853 // packing the return values to a vector. 3854 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3855 3856 InstructionCost Cost = 3857 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3858 3859 // If we can't emit a vector call for this function, then the currently found 3860 // cost is the cost we need to return. 3861 NeedToScalarize = true; 3862 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3863 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3864 3865 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3866 return Cost; 3867 3868 // If the corresponding vector cost is cheaper, return its cost. 3869 InstructionCost VectorCallCost = 3870 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3871 if (VectorCallCost < Cost) { 3872 NeedToScalarize = false; 3873 Cost = VectorCallCost; 3874 } 3875 return Cost; 3876 } 3877 3878 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3879 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3880 return Elt; 3881 return VectorType::get(Elt, VF); 3882 } 3883 3884 InstructionCost 3885 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3886 ElementCount VF) const { 3887 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3888 assert(ID && "Expected intrinsic call!"); 3889 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3890 FastMathFlags FMF; 3891 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3892 FMF = FPMO->getFastMathFlags(); 3893 3894 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3895 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3896 SmallVector<Type *> ParamTys; 3897 std::transform(FTy->param_begin(), FTy->param_end(), 3898 std::back_inserter(ParamTys), 3899 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3900 3901 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3902 dyn_cast<IntrinsicInst>(CI)); 3903 return TTI.getIntrinsicInstrCost(CostAttrs, 3904 TargetTransformInfo::TCK_RecipThroughput); 3905 } 3906 3907 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3908 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3909 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3910 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3911 } 3912 3913 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3914 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3915 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3916 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3917 } 3918 3919 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3920 // For every instruction `I` in MinBWs, truncate the operands, create a 3921 // truncated version of `I` and reextend its result. InstCombine runs 3922 // later and will remove any ext/trunc pairs. 3923 SmallPtrSet<Value *, 4> Erased; 3924 for (const auto &KV : Cost->getMinimalBitwidths()) { 3925 // If the value wasn't vectorized, we must maintain the original scalar 3926 // type. The absence of the value from State indicates that it 3927 // wasn't vectorized. 
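    // The rewrite performed below is roughly (illustrative: VF = 4, an i32 add
    // known to need only 8 bits):
    //   %a8 = trunc <4 x i32> %a to <4 x i8>
    //   %b8 = trunc <4 x i32> %b to <4 x i8>
    //   %s8 = add <4 x i8> %a8, %b8
    //   %s  = zext <4 x i8> %s8 to <4 x i32>
    // leaving InstCombine to remove any redundant ext/trunc pairs later.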
3928 VPValue *Def = State.Plan->getVPValue(KV.first); 3929 if (!State.hasAnyVectorValue(Def)) 3930 continue; 3931 for (unsigned Part = 0; Part < UF; ++Part) { 3932 Value *I = State.get(Def, Part); 3933 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3934 continue; 3935 Type *OriginalTy = I->getType(); 3936 Type *ScalarTruncatedTy = 3937 IntegerType::get(OriginalTy->getContext(), KV.second); 3938 auto *TruncatedTy = FixedVectorType::get( 3939 ScalarTruncatedTy, 3940 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3941 if (TruncatedTy == OriginalTy) 3942 continue; 3943 3944 IRBuilder<> B(cast<Instruction>(I)); 3945 auto ShrinkOperand = [&](Value *V) -> Value * { 3946 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3947 if (ZI->getSrcTy() == TruncatedTy) 3948 return ZI->getOperand(0); 3949 return B.CreateZExtOrTrunc(V, TruncatedTy); 3950 }; 3951 3952 // The actual instruction modification depends on the instruction type, 3953 // unfortunately. 3954 Value *NewI = nullptr; 3955 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3956 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3957 ShrinkOperand(BO->getOperand(1))); 3958 3959 // Any wrapping introduced by shrinking this operation shouldn't be 3960 // considered undefined behavior. So, we can't unconditionally copy 3961 // arithmetic wrapping flags to NewI. 3962 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3963 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3964 NewI = 3965 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3966 ShrinkOperand(CI->getOperand(1))); 3967 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3968 NewI = B.CreateSelect(SI->getCondition(), 3969 ShrinkOperand(SI->getTrueValue()), 3970 ShrinkOperand(SI->getFalseValue())); 3971 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3972 switch (CI->getOpcode()) { 3973 default: 3974 llvm_unreachable("Unhandled cast!"); 3975 case Instruction::Trunc: 3976 NewI = ShrinkOperand(CI->getOperand(0)); 3977 break; 3978 case Instruction::SExt: 3979 NewI = B.CreateSExtOrTrunc( 3980 CI->getOperand(0), 3981 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3982 break; 3983 case Instruction::ZExt: 3984 NewI = B.CreateZExtOrTrunc( 3985 CI->getOperand(0), 3986 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3987 break; 3988 } 3989 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3990 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3991 ->getNumElements(); 3992 auto *O0 = B.CreateZExtOrTrunc( 3993 SI->getOperand(0), 3994 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3995 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3996 ->getNumElements(); 3997 auto *O1 = B.CreateZExtOrTrunc( 3998 SI->getOperand(1), 3999 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 4000 4001 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4002 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4003 // Don't do anything with the operands, just extend the result. 
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
                            ->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
                            ->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      State.reset(Def, Res, Part);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    VPValue *Def = State.Plan->getVPValue(KV.first);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs(State);
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Fix up external users of the induction variables.
  for (auto &Entry : Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs(State);
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as the
  // original loop iterations are now distributed among them. Note that the
  // original loop, represented by LoopScalarBody, becomes the remainder loop
  // after vectorization.
4093   //
4094   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4095   // end up with a slightly less accurate result, but that should be OK since
4096   // the profile is not inherently precise anyway. Note also that a possible
4097   // bypass of the vector code caused by legality checks is ignored,
4098   // optimistically assigning all the weight to the vector loop.
4099   //
4100   // For scalable vectorization we can't know at compile time how many
4101   // iterations of the loop are handled in one vector iteration, so instead
4102   // assume a pessimistic vscale of '1'.
4103   setProfileInfoAfterUnrolling(
4104       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4105       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4106 }
4107
4108 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4109   // In order to support recurrences we need to be able to vectorize Phi nodes.
4110   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4111   // stage #2: We now need to fix the recurrences by adding incoming edges to
4112   // the currently empty PHI nodes. At this point every instruction in the
4113   // original loop is widened to a vector form so we can use them to construct
4114   // the incoming edges.
4115   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4116   for (VPRecipeBase &R : Header->phis()) {
4117     auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
4118     if (!PhiR)
4119       continue;
4120     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4121     if (PhiR->getRecurrenceDescriptor()) {
4122       fixReduction(PhiR, State);
4123     } else if (Legal->isFirstOrderRecurrence(OrigPhi))
4124       fixFirstOrderRecurrence(OrigPhi, State);
4125   }
4126 }
4127
4128 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4129                                                   VPTransformState &State) {
4130   // This is the second phase of vectorizing first-order recurrences. An
4131   // overview of the transformation is described below. Suppose we have the
4132   // following loop.
4133   //
4134   //   for (int i = 0; i < n; ++i)
4135   //     b[i] = a[i] - a[i - 1];
4136   //
4137   // There is a first-order recurrence on "a". For this loop, the shorthand
4138   // scalar IR looks like:
4139   //
4140   //   scalar.ph:
4141   //     s_init = a[-1]
4142   //     br scalar.body
4143   //
4144   //   scalar.body:
4145   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4146   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4147   //     s2 = a[i]
4148   //     b[i] = s2 - s1
4149   //     br cond, scalar.body, ...
4150   //
4151   // In this example, s1 is a recurrence because its value depends on the
4152   // previous iteration. In the first phase of vectorization, we created a
4153   // temporary value for s1. We now complete the vectorization and produce the
4154   // shorthand vector IR shown below (for VF = 4, UF = 1).
4155   //
4156   //   vector.ph:
4157   //     v_init = vector(..., ..., ..., a[-1])
4158   //     br vector.body
4159   //
4160   //   vector.body
4161   //     i = phi [0, vector.ph], [i+4, vector.body]
4162   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4163   //     v2 = a[i, i+1, i+2, i+3];
4164   //     v3 = vector(v1(3), v2(0, 1, 2))
4165   //     b[i, i+1, i+2, i+3] = v2 - v3
4166   //     br cond, vector.body, middle.block
4167   //
4168   //   middle.block:
4169   //     x = v2(3)
4170   //     br scalar.ph
4171   //
4172   //   scalar.ph:
4173   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4174   //     br scalar.body
4175   //
4176   // After the vector loop completes execution, we extract the next value of
4177   // the recurrence (x) to use as the initial value in the scalar loop.
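  //
  // As a hedged sketch (illustrative shorthand only, not verbatim output),
  // the same transformation with VF = 4 and UF = 2 chains the splices across
  // the unrolled parts:
  //
  //   v1   = phi [v_init, vector.ph], [v2.1, vector.body]
  //   v2.0 = a[i, i+1, i+2, i+3]
  //   v2.1 = a[i+4, i+5, i+6, i+7]
  //   v3.0 = vector(v1(3), v2.0(0, 1, 2))
  //   v3.1 = vector(v2.0(3), v2.1(0, 1, 2))
  //
  // so the recurrence phi takes its backedge value from the last unrolled
  // part, which matches the Part loop below that shifts `Incoming` from one
  // part to the next.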
4178 4179 // Get the original loop preheader and single loop latch. 4180 auto *Preheader = OrigLoop->getLoopPreheader(); 4181 auto *Latch = OrigLoop->getLoopLatch(); 4182 4183 // Get the initial and previous values of the scalar recurrence. 4184 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4185 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4186 4187 auto *IdxTy = Builder.getInt32Ty(); 4188 auto *One = ConstantInt::get(IdxTy, 1); 4189 4190 // Create a vector from the initial value. 4191 auto *VectorInit = ScalarInit; 4192 if (VF.isVector()) { 4193 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4194 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4195 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4196 VectorInit = Builder.CreateInsertElement( 4197 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), 4198 VectorInit, LastIdx, "vector.recur.init"); 4199 } 4200 4201 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4202 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4203 // We constructed a temporary phi node in the first phase of vectorization. 4204 // This phi node will eventually be deleted. 4205 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4206 4207 // Create a phi node for the new recurrence. The current value will either be 4208 // the initial value inserted into a vector or loop-varying vector value. 4209 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4210 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4211 4212 // Get the vectorized previous value of the last part UF - 1. It appears last 4213 // among all unrolled iterations, due to the order of their construction. 4214 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4215 4216 // Find and set the insertion point after the previous value if it is an 4217 // instruction. 4218 BasicBlock::iterator InsertPt; 4219 // Note that the previous value may have been constant-folded so it is not 4220 // guaranteed to be an instruction in the vector loop. 4221 // FIXME: Loop invariant values do not form recurrences. We should deal with 4222 // them earlier. 4223 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4224 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4225 else { 4226 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4227 if (isa<PHINode>(PreviousLastPart)) 4228 // If the previous value is a phi node, we should insert after all the phi 4229 // nodes in the block containing the PHI to avoid breaking basic block 4230 // verification. Note that the basic block may be different to 4231 // LoopVectorBody, in case we predicate the loop. 4232 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4233 else 4234 InsertPt = ++PreviousInst->getIterator(); 4235 } 4236 Builder.SetInsertPoint(&*InsertPt); 4237 4238 // The vector from which to take the initial value for the current iteration 4239 // (actual or unrolled). Initially, this is the vector phi node. 4240 Value *Incoming = VecPhi; 4241 4242 // Shuffle the current and previous vector and update the vector parts. 4243 for (unsigned Part = 0; Part < UF; ++Part) { 4244 Value *PreviousPart = State.get(PreviousDef, Part); 4245 Value *PhiPart = State.get(PhiDef, Part); 4246 auto *Shuffle = VF.isVector() 4247 ? 
Builder.CreateVectorSplice(Incoming, PreviousPart, -1) 4248 : Incoming; 4249 PhiPart->replaceAllUsesWith(Shuffle); 4250 cast<Instruction>(PhiPart)->eraseFromParent(); 4251 State.reset(PhiDef, Shuffle, Part); 4252 Incoming = PreviousPart; 4253 } 4254 4255 // Fix the latch value of the new recurrence in the vector loop. 4256 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4257 4258 // Extract the last vector element in the middle block. This will be the 4259 // initial value for the recurrence when jumping to the scalar loop. 4260 auto *ExtractForScalar = Incoming; 4261 if (VF.isVector()) { 4262 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4263 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4264 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4265 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4266 "vector.recur.extract"); 4267 } 4268 // Extract the second last element in the middle block if the 4269 // Phi is used outside the loop. We need to extract the phi itself 4270 // and not the last element (the phi update in the current iteration). This 4271 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4272 // when the scalar loop is not run at all. 4273 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4274 if (VF.isVector()) { 4275 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4276 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4277 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4278 Incoming, Idx, "vector.recur.extract.for.phi"); 4279 } else if (UF > 1) 4280 // When loop is unrolled without vectorizing, initialize 4281 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4282 // of `Incoming`. This is analogous to the vectorized case above: extracting 4283 // the second last element when VF > 1. 4284 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4285 4286 // Fix the initial value of the original recurrence in the scalar loop. 4287 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4288 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4289 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4290 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4291 Start->addIncoming(Incoming, BB); 4292 } 4293 4294 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4295 Phi->setName("scalar.recur"); 4296 4297 // Finally, fix users of the recurrence outside the loop. The users will need 4298 // either the last value of the scalar recurrence or the last value of the 4299 // vector recurrence we extracted in the middle block. Since the loop is in 4300 // LCSSA form, we just need to find all the phi nodes for the original scalar 4301 // recurrence in the exit block, and then add an edge for the middle block. 4302 // Note that LCSSA does not imply single entry when the original scalar loop 4303 // had multiple exiting edges (as we always run the last iteration in the 4304 // scalar epilogue); in that case, the exiting path through middle will be 4305 // dynamically dead and the value picked for the phi doesn't matter. 
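  // As an illustrative sketch (the phi and block names here are hypothetical),
  // an LCSSA phi for s1 in the exit block such as
  //
  //   exit:
  //     s1.lcssa = phi [s1, scalar.body]
  //
  // gains an extra incoming value from the middle block,
  //
  //   exit:
  //     s1.lcssa = phi [s1, scalar.body],
  //                    [vector.recur.extract.for.phi, middle.block]
  //
  // which is what the loop below does.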
4306   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4307     if (any_of(LCSSAPhi.incoming_values(),
4308                [Phi](Value *V) { return V == Phi; }))
4309       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4310 }
4311
4312 static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4313   return EnableStrictReductions && RdxDesc.isOrdered();
4314 }
4315
4316 void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
4317                                        VPTransformState &State) {
4318   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4319   // Get its reduction variable descriptor.
4320   assert(Legal->isReductionVariable(OrigPhi) &&
4321          "Unable to find the reduction variable");
4322   RecurrenceDescriptor RdxDesc = *PhiR->getRecurrenceDescriptor();
4323
4324   RecurKind RK = RdxDesc.getRecurrenceKind();
4325   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4326   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4327   setDebugLocFromInst(Builder, ReductionStartValue);
4328   bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi);
4329
4330   VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4331   // This is the vector-clone of the value that leaves the loop.
4332   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4333
4334   // Wrap flags are in general invalid after vectorization, so clear them.
4335   clearReductionWrapFlags(RdxDesc, State);
4336
4337   // Fix the vector-loop phi.
4338
4339   // Reductions do not have to start at zero. They can start with
4340   // any loop-invariant value.
4341   BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4342
4343   bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
4344                    useOrderedReductions(RdxDesc);
4345
4346   for (unsigned Part = 0; Part < UF; ++Part) {
4347     if (IsOrdered && Part > 0)
4348       break;
4349     Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
4350     Value *Val = State.get(PhiR->getBackedgeValue(), Part);
4351     if (IsOrdered)
4352       Val = State.get(PhiR->getBackedgeValue(), UF - 1);
4353
4354     cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
4355   }
4356
4357   // Before writing anything else, move the insertion point to right between
4358   // the PHIs and the values we are going to write.
4359   // This allows us to write both PHINodes and the extractelement
4360   // instructions.
4361   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4362
4363   setDebugLocFromInst(Builder, LoopExitInst);
4364
4365   Type *PhiTy = OrigPhi->getType();
4366   // If the tail is folded by masking, the vector value that leaves the loop
4367   // should be a Select choosing between the vectorized LoopExitInst and the
4368   // vectorized Phi, rather than the LoopExitInst alone. For an inloop
4369   // reduction the result is already predicated and needs no handling here.
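  // For example (a rough sketch assuming an add reduction with a folded tail;
  // the value names are hypothetical), the vector loop contains
  //
  //   %rdx.next = add <4 x i32> %rdx.phi, %val
  //   %rdx.sel  = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
  //
  // and the code below makes %rdx.sel, not %rdx.next, the per-part value that
  // is reduced in the middle block.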
4370 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4371 for (unsigned Part = 0; Part < UF; ++Part) { 4372 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4373 Value *Sel = nullptr; 4374 for (User *U : VecLoopExitInst->users()) { 4375 if (isa<SelectInst>(U)) { 4376 assert(!Sel && "Reduction exit feeding two selects"); 4377 Sel = U; 4378 } else 4379 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4380 } 4381 assert(Sel && "Reduction exit feeds no select"); 4382 State.reset(LoopExitInstDef, Sel, Part); 4383 4384 // If the target can create a predicated operator for the reduction at no 4385 // extra cost in the loop (for example a predicated vadd), it can be 4386 // cheaper for the select to remain in the loop than be sunk out of it, 4387 // and so use the select value for the phi instead of the old 4388 // LoopExitValue. 4389 if (PreferPredicatedReductionSelect || 4390 TTI->preferPredicatedReductionSelect( 4391 RdxDesc.getOpcode(), PhiTy, 4392 TargetTransformInfo::ReductionFlags())) { 4393 auto *VecRdxPhi = 4394 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4395 VecRdxPhi->setIncomingValueForBlock( 4396 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4397 } 4398 } 4399 } 4400 4401 // If the vector reduction can be performed in a smaller type, we truncate 4402 // then extend the loop exit value to enable InstCombine to evaluate the 4403 // entire expression in the smaller type. 4404 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4405 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4406 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4407 Builder.SetInsertPoint( 4408 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4409 VectorParts RdxParts(UF); 4410 for (unsigned Part = 0; Part < UF; ++Part) { 4411 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4412 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4413 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4414 : Builder.CreateZExt(Trunc, VecTy); 4415 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4416 UI != RdxParts[Part]->user_end();) 4417 if (*UI != Trunc) { 4418 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4419 RdxParts[Part] = Extnd; 4420 } else { 4421 ++UI; 4422 } 4423 } 4424 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4425 for (unsigned Part = 0; Part < UF; ++Part) { 4426 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4427 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4428 } 4429 } 4430 4431 // Reduce all of the unrolled parts into a single vector. 4432 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4433 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4434 4435 // The middle block terminator has already been assigned a DebugLoc here (the 4436 // OrigLoop's single latch terminator). We want the whole middle block to 4437 // appear to execute on this line because: (a) it is all compiler generated, 4438 // (b) these instructions are always executed after evaluating the latch 4439 // conditional branch, and (c) other passes may add new predecessors which 4440 // terminate on this line. This is the easiest way to ensure we don't 4441 // accidentally cause an extra step back into the loop while debugging. 
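  //
  // As a hedged sketch of the part-combining step (assuming UF = 2 and an
  // integer add reduction; the exact instructions depend on the target and on
  // createTargetReduction), the middle block first combines the unrolled
  // parts and then reduces the resulting vector to a scalar:
  //
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  //   %rdx     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)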
4442   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4443   if (IsOrdered)
4444     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4445   else {
4446     // Floating-point operations should have some FMF to enable the reduction.
4447     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4448     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4449     for (unsigned Part = 1; Part < UF; ++Part) {
4450       Value *RdxPart = State.get(LoopExitInstDef, Part);
4451       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4452         ReducedPartRdx = Builder.CreateBinOp(
4453             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4454       } else {
4455         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4456       }
4457     }
4458   }
4459
4460   // Create the reduction after the loop. Note that inloop reductions create
4461   // the target reduction in the loop using a Reduction recipe.
4462   if (VF.isVector() && !IsInLoopReductionPhi) {
4463     ReducedPartRdx =
4464         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4465     // If the reduction can be performed in a smaller type, we need to extend
4466     // the reduction to the wider type before we branch to the original loop.
4467     if (PhiTy != RdxDesc.getRecurrenceType())
4468       ReducedPartRdx = RdxDesc.isSigned()
4469                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4470                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4471   }
4472
4473   // Create a phi node that merges control-flow from the backedge-taken check
4474   // block and the middle block.
4475   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4476                                         LoopScalarPreHeader->getTerminator());
4477   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4478     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4479   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4480
4481   // Now, we need to fix the users of the reduction variable
4482   // inside and outside of the scalar remainder loop.
4483
4484   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4485   // in the exit blocks. See the comment on the analogous loop in
4486   // fixFirstOrderRecurrence for a more complete explanation of the logic.
4487   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4488     if (any_of(LCSSAPhi.incoming_values(),
4489                [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4490       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4491
4492   // Fix the scalar loop reduction variable with the incoming reduction sum
4493   // from the vector body and from the backedge value.
4494   int IncomingEdgeBlockIdx =
4495       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4496   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4497   // Pick the other block.
4498   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 4499 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4500 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4501 } 4502 4503 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4504 VPTransformState &State) { 4505 RecurKind RK = RdxDesc.getRecurrenceKind(); 4506 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4507 return; 4508 4509 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4510 assert(LoopExitInstr && "null loop exit instruction"); 4511 SmallVector<Instruction *, 8> Worklist; 4512 SmallPtrSet<Instruction *, 8> Visited; 4513 Worklist.push_back(LoopExitInstr); 4514 Visited.insert(LoopExitInstr); 4515 4516 while (!Worklist.empty()) { 4517 Instruction *Cur = Worklist.pop_back_val(); 4518 if (isa<OverflowingBinaryOperator>(Cur)) 4519 for (unsigned Part = 0; Part < UF; ++Part) { 4520 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4521 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4522 } 4523 4524 for (User *U : Cur->users()) { 4525 Instruction *UI = cast<Instruction>(U); 4526 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4527 Visited.insert(UI).second) 4528 Worklist.push_back(UI); 4529 } 4530 } 4531 } 4532 4533 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4534 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4535 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4536 // Some phis were already hand updated by the reduction and recurrence 4537 // code above, leave them alone. 4538 continue; 4539 4540 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4541 // Non-instruction incoming values will have only one value. 4542 4543 VPLane Lane = VPLane::getFirstLane(); 4544 if (isa<Instruction>(IncomingValue) && 4545 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4546 VF)) 4547 Lane = VPLane::getLastLaneForVF(VF); 4548 4549 // Can be a loop invariant incoming value or the last scalar value to be 4550 // extracted from the vectorized loop. 4551 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4552 Value *lastIncomingValue = 4553 OrigLoop->isLoopInvariant(IncomingValue) 4554 ? IncomingValue 4555 : State.get(State.Plan->getVPValue(IncomingValue), 4556 VPIteration(UF - 1, Lane)); 4557 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4558 } 4559 } 4560 4561 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4562 // The basic block and loop containing the predicated instruction. 4563 auto *PredBB = PredInst->getParent(); 4564 auto *VectorLoop = LI->getLoopFor(PredBB); 4565 4566 // Initialize a worklist with the operands of the predicated instruction. 4567 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4568 4569 // Holds instructions that we need to analyze again. An instruction may be 4570 // reanalyzed if we don't yet know if we can sink it or not. 4571 SmallVector<Instruction *, 8> InstsToReanalyze; 4572 4573 // Returns true if a given use occurs in the predicated block. Phi nodes use 4574 // their operands in their corresponding predecessor blocks. 4575 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4576 auto *I = cast<Instruction>(U.getUser()); 4577 BasicBlock *BB = I->getParent(); 4578 if (auto *Phi = dyn_cast<PHINode>(I)) 4579 BB = Phi->getIncomingBlock( 4580 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4581 return BB == PredBB; 4582 }; 4583 4584 // Iteratively sink the scalarized operands of the predicated instruction 4585 // into the block we created for it. 
When an instruction is sunk, it's 4586 // operands are then added to the worklist. The algorithm ends after one pass 4587 // through the worklist doesn't sink a single instruction. 4588 bool Changed; 4589 do { 4590 // Add the instructions that need to be reanalyzed to the worklist, and 4591 // reset the changed indicator. 4592 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4593 InstsToReanalyze.clear(); 4594 Changed = false; 4595 4596 while (!Worklist.empty()) { 4597 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4598 4599 // We can't sink an instruction if it is a phi node, is already in the 4600 // predicated block, is not in the loop, or may have side effects. 4601 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4602 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4603 continue; 4604 4605 // It's legal to sink the instruction if all its uses occur in the 4606 // predicated block. Otherwise, there's nothing to do yet, and we may 4607 // need to reanalyze the instruction. 4608 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4609 InstsToReanalyze.push_back(I); 4610 continue; 4611 } 4612 4613 // Move the instruction to the beginning of the predicated block, and add 4614 // it's operands to the worklist. 4615 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4616 Worklist.insert(I->op_begin(), I->op_end()); 4617 4618 // The sinking may have enabled other instructions to be sunk, so we will 4619 // need to iterate. 4620 Changed = true; 4621 } 4622 } while (Changed); 4623 } 4624 4625 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4626 for (PHINode *OrigPhi : OrigPHIsToFix) { 4627 VPWidenPHIRecipe *VPPhi = 4628 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4629 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4630 // Make sure the builder has a valid insert point. 4631 Builder.SetInsertPoint(NewPhi); 4632 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4633 VPValue *Inc = VPPhi->getIncomingValue(i); 4634 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4635 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4636 } 4637 } 4638 } 4639 4640 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4641 VPUser &Operands, unsigned UF, 4642 ElementCount VF, bool IsPtrLoopInvariant, 4643 SmallBitVector &IsIndexLoopInvariant, 4644 VPTransformState &State) { 4645 // Construct a vector GEP by widening the operands of the scalar GEP as 4646 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4647 // results in a vector of pointers when at least one operand of the GEP 4648 // is vector-typed. Thus, to keep the representation compact, we only use 4649 // vector-typed operands for loop-varying values. 4650 4651 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4652 // If we are vectorizing, but the GEP has only loop-invariant operands, 4653 // the GEP we build (by only using vector-typed operands for 4654 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4655 // produce a vector of pointers, we need to either arbitrarily pick an 4656 // operand to broadcast, or broadcast a clone of the original GEP. 4657 // Here, we broadcast a clone of the original. 4658 // 4659 // TODO: If at some point we decide to scalarize instructions having 4660 // loop-invariant operands, this special case will no longer be 4661 // required. 
We would add the scalarization decision to 4662 // collectLoopScalars() and teach getVectorValue() to broadcast 4663 // the lane-zero scalar value. 4664 auto *Clone = Builder.Insert(GEP->clone()); 4665 for (unsigned Part = 0; Part < UF; ++Part) { 4666 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4667 State.set(VPDef, EntryPart, Part); 4668 addMetadata(EntryPart, GEP); 4669 } 4670 } else { 4671 // If the GEP has at least one loop-varying operand, we are sure to 4672 // produce a vector of pointers. But if we are only unrolling, we want 4673 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4674 // produce with the code below will be scalar (if VF == 1) or vector 4675 // (otherwise). Note that for the unroll-only case, we still maintain 4676 // values in the vector mapping with initVector, as we do for other 4677 // instructions. 4678 for (unsigned Part = 0; Part < UF; ++Part) { 4679 // The pointer operand of the new GEP. If it's loop-invariant, we 4680 // won't broadcast it. 4681 auto *Ptr = IsPtrLoopInvariant 4682 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4683 : State.get(Operands.getOperand(0), Part); 4684 4685 // Collect all the indices for the new GEP. If any index is 4686 // loop-invariant, we won't broadcast it. 4687 SmallVector<Value *, 4> Indices; 4688 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4689 VPValue *Operand = Operands.getOperand(I); 4690 if (IsIndexLoopInvariant[I - 1]) 4691 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4692 else 4693 Indices.push_back(State.get(Operand, Part)); 4694 } 4695 4696 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4697 // but it should be a vector, otherwise. 4698 auto *NewGEP = 4699 GEP->isInBounds() 4700 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4701 Indices) 4702 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4703 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4704 "NewGEP is not a pointer vector"); 4705 State.set(VPDef, NewGEP, Part); 4706 addMetadata(NewGEP, GEP); 4707 } 4708 } 4709 } 4710 4711 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4712 RecurrenceDescriptor *RdxDesc, 4713 VPWidenPHIRecipe *PhiR, 4714 VPTransformState &State) { 4715 PHINode *P = cast<PHINode>(PN); 4716 if (EnableVPlanNativePath) { 4717 // Currently we enter here in the VPlan-native path for non-induction 4718 // PHIs where all control flow is uniform. We simply widen these PHIs. 4719 // Create a vector phi with no operands - the vector phi operands will be 4720 // set at the end of vector code generation. 4721 Type *VecTy = (State.VF.isScalar()) 4722 ? PN->getType() 4723 : VectorType::get(PN->getType(), State.VF); 4724 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4725 State.set(PhiR, VecPhi, 0); 4726 OrigPHIsToFix.push_back(P); 4727 4728 return; 4729 } 4730 4731 assert(PN->getParent() == OrigLoop->getHeader() && 4732 "Non-header phis should have been handled elsewhere"); 4733 4734 VPValue *StartVPV = PhiR->getStartValue(); 4735 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr; 4736 // In order to support recurrences we need to be able to vectorize Phi nodes. 4737 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4738 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4739 // this value when we vectorize all of the instructions that use the PHI. 
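  //
  // As a sketch (assuming VF = 4, UF = 1, and an integer add reduction with
  // start value %s; the constant-vector shorthand is illustrative, the real
  // IR builds it with an insertelement in the preheader), the code below
  // emits a header phi such as
  //
  //   %vec.phi = phi <4 x i32> [ <%s, 0, 0, 0>, %vector.ph ]
  //
  // whose backedge value is added later by fixReduction().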
4740   if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4741     Value *Iden = nullptr;
4742     bool ScalarPHI =
4743         (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4744     Type *VecTy =
4745         ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4746
4747     if (RdxDesc) {
4748       assert(Legal->isReductionVariable(P) && StartV &&
4749              "RdxDesc should only be set for reduction variables; in that case "
4750              "a StartV is also required");
4751       RecurKind RK = RdxDesc->getRecurrenceKind();
4752       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4753         // MinMax reductions have the start value as their identity.
4754         if (ScalarPHI) {
4755           Iden = StartV;
4756         } else {
4757           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4758           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4759           StartV = Iden =
4760               Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4761         }
4762       } else {
4763         Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4764             RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
4765         Iden = IdenC;
4766
4767         if (!ScalarPHI) {
4768           Iden = ConstantVector::getSplat(State.VF, IdenC);
4769           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4770           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4771           Constant *Zero = Builder.getInt32(0);
4772           StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4773         }
4774       }
4775     }
4776
4777     bool IsOrdered = State.VF.isVector() &&
4778                      Cost->isInLoopReduction(cast<PHINode>(PN)) &&
4779                      useOrderedReductions(*RdxDesc);
4780
4781     for (unsigned Part = 0; Part < State.UF; ++Part) {
4782       // This is phase one of vectorizing PHIs.
4783       if (Part > 0 && IsOrdered)
4784         return;
4785       Value *EntryPart = PHINode::Create(
4786           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4787       State.set(PhiR, EntryPart, Part);
4788       if (StartV) {
4789         // Make sure to add the reduction start value only to the
4790         // first unroll part.
4791         Value *StartVal = (Part == 0) ? StartV : Iden;
4792         cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4793       }
4794     }
4795     return;
4796   }
4797
4798   assert(!Legal->isReductionVariable(P) &&
4799          "reductions should be handled above");
4800
4801   setDebugLocFromInst(Builder, P);
4802
4803   // This PHINode must be an induction variable.
4804   // Make sure that we know about it.
4805   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4806
4807   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4808   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4809
4810   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4811   // which can be found from the original scalar operations.
4812   switch (II.getKind()) {
4813   case InductionDescriptor::IK_NoInduction:
4814     llvm_unreachable("Unknown induction");
4815   case InductionDescriptor::IK_IntInduction:
4816   case InductionDescriptor::IK_FpInduction:
4817     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4818   case InductionDescriptor::IK_PtrInduction: {
4819     // Handle the pointer induction variable case.
4820     assert(P->getType()->isPointerTy() && "Unexpected type.");
4821
4822     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4823       // This is the normalized GEP that starts counting at zero.
4824       Value *PtrInd =
4825           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4826       // Determine the number of scalars we need to generate for each unroll
4827       // iteration.
If the instruction is uniform, we only need to generate the 4828 // first lane. Otherwise, we generate all VF values. 4829 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4830 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4831 4832 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4833 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4834 if (NeedsVectorIndex) { 4835 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4836 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4837 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4838 } 4839 4840 for (unsigned Part = 0; Part < UF; ++Part) { 4841 Value *PartStart = createStepForVF( 4842 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4843 4844 if (NeedsVectorIndex) { 4845 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4846 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4847 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4848 Value *SclrGep = 4849 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4850 SclrGep->setName("next.gep"); 4851 State.set(PhiR, SclrGep, Part); 4852 // We've cached the whole vector, which means we can support the 4853 // extraction of any lane. 4854 continue; 4855 } 4856 4857 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4858 Value *Idx = Builder.CreateAdd( 4859 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4860 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4861 Value *SclrGep = 4862 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4863 SclrGep->setName("next.gep"); 4864 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4865 } 4866 } 4867 return; 4868 } 4869 assert(isa<SCEVConstant>(II.getStep()) && 4870 "Induction step not a SCEV constant!"); 4871 Type *PhiType = II.getStep()->getType(); 4872 4873 // Build a pointer phi 4874 Value *ScalarStartValue = II.getStartValue(); 4875 Type *ScStValueType = ScalarStartValue->getType(); 4876 PHINode *NewPointerPhi = 4877 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4878 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4879 4880 // A pointer induction, performed by using a gep 4881 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4882 Instruction *InductionLoc = LoopLatch->getTerminator(); 4883 const SCEV *ScalarStep = II.getStep(); 4884 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4885 Value *ScalarStepValue = 4886 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4887 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4888 Value *NumUnrolledElems = 4889 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4890 Value *InductionGEP = GetElementPtrInst::Create( 4891 ScStValueType->getPointerElementType(), NewPointerPhi, 4892 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4893 InductionLoc); 4894 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4895 4896 // Create UF many actual address geps that use the pointer 4897 // phi as base and a vectorized version of the step value 4898 // (<step*0, ..., step*N>) as offset. 4899 for (unsigned Part = 0; Part < State.UF; ++Part) { 4900 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4901 Value *StartOffsetScalar = 4902 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4903 Value *StartOffset = 4904 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4905 // Create a vector of consecutive numbers from zero to VF. 
4906 StartOffset = 4907 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4908 4909 Value *GEP = Builder.CreateGEP( 4910 ScStValueType->getPointerElementType(), NewPointerPhi, 4911 Builder.CreateMul( 4912 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4913 "vector.gep")); 4914 State.set(PhiR, GEP, Part); 4915 } 4916 } 4917 } 4918 } 4919 4920 /// A helper function for checking whether an integer division-related 4921 /// instruction may divide by zero (in which case it must be predicated if 4922 /// executed conditionally in the scalar code). 4923 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4924 /// Non-zero divisors that are non compile-time constants will not be 4925 /// converted into multiplication, so we will still end up scalarizing 4926 /// the division, but can do so w/o predication. 4927 static bool mayDivideByZero(Instruction &I) { 4928 assert((I.getOpcode() == Instruction::UDiv || 4929 I.getOpcode() == Instruction::SDiv || 4930 I.getOpcode() == Instruction::URem || 4931 I.getOpcode() == Instruction::SRem) && 4932 "Unexpected instruction"); 4933 Value *Divisor = I.getOperand(1); 4934 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4935 return !CInt || CInt->isZero(); 4936 } 4937 4938 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4939 VPUser &User, 4940 VPTransformState &State) { 4941 switch (I.getOpcode()) { 4942 case Instruction::Call: 4943 case Instruction::Br: 4944 case Instruction::PHI: 4945 case Instruction::GetElementPtr: 4946 case Instruction::Select: 4947 llvm_unreachable("This instruction is handled by a different recipe."); 4948 case Instruction::UDiv: 4949 case Instruction::SDiv: 4950 case Instruction::SRem: 4951 case Instruction::URem: 4952 case Instruction::Add: 4953 case Instruction::FAdd: 4954 case Instruction::Sub: 4955 case Instruction::FSub: 4956 case Instruction::FNeg: 4957 case Instruction::Mul: 4958 case Instruction::FMul: 4959 case Instruction::FDiv: 4960 case Instruction::FRem: 4961 case Instruction::Shl: 4962 case Instruction::LShr: 4963 case Instruction::AShr: 4964 case Instruction::And: 4965 case Instruction::Or: 4966 case Instruction::Xor: { 4967 // Just widen unops and binops. 4968 setDebugLocFromInst(Builder, &I); 4969 4970 for (unsigned Part = 0; Part < UF; ++Part) { 4971 SmallVector<Value *, 2> Ops; 4972 for (VPValue *VPOp : User.operands()) 4973 Ops.push_back(State.get(VPOp, Part)); 4974 4975 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4976 4977 if (auto *VecOp = dyn_cast<Instruction>(V)) 4978 VecOp->copyIRFlags(&I); 4979 4980 // Use this vector value for all users of the original instruction. 4981 State.set(Def, V, Part); 4982 addMetadata(V, &I); 4983 } 4984 4985 break; 4986 } 4987 case Instruction::ICmp: 4988 case Instruction::FCmp: { 4989 // Widen compares. Generate vector compares. 4990 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4991 auto *Cmp = cast<CmpInst>(&I); 4992 setDebugLocFromInst(Builder, Cmp); 4993 for (unsigned Part = 0; Part < UF; ++Part) { 4994 Value *A = State.get(User.getOperand(0), Part); 4995 Value *B = State.get(User.getOperand(1), Part); 4996 Value *C = nullptr; 4997 if (FCmp) { 4998 // Propagate fast math flags. 
4999 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 5000 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 5001 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 5002 } else { 5003 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 5004 } 5005 State.set(Def, C, Part); 5006 addMetadata(C, &I); 5007 } 5008 5009 break; 5010 } 5011 5012 case Instruction::ZExt: 5013 case Instruction::SExt: 5014 case Instruction::FPToUI: 5015 case Instruction::FPToSI: 5016 case Instruction::FPExt: 5017 case Instruction::PtrToInt: 5018 case Instruction::IntToPtr: 5019 case Instruction::SIToFP: 5020 case Instruction::UIToFP: 5021 case Instruction::Trunc: 5022 case Instruction::FPTrunc: 5023 case Instruction::BitCast: { 5024 auto *CI = cast<CastInst>(&I); 5025 setDebugLocFromInst(Builder, CI); 5026 5027 /// Vectorize casts. 5028 Type *DestTy = 5029 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 5030 5031 for (unsigned Part = 0; Part < UF; ++Part) { 5032 Value *A = State.get(User.getOperand(0), Part); 5033 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 5034 State.set(Def, Cast, Part); 5035 addMetadata(Cast, &I); 5036 } 5037 break; 5038 } 5039 default: 5040 // This instruction is not vectorized by simple widening. 5041 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 5042 llvm_unreachable("Unhandled instruction!"); 5043 } // end of switch. 5044 } 5045 5046 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 5047 VPUser &ArgOperands, 5048 VPTransformState &State) { 5049 assert(!isa<DbgInfoIntrinsic>(I) && 5050 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 5051 setDebugLocFromInst(Builder, &I); 5052 5053 Module *M = I.getParent()->getParent()->getParent(); 5054 auto *CI = cast<CallInst>(&I); 5055 5056 SmallVector<Type *, 4> Tys; 5057 for (Value *ArgOperand : CI->arg_operands()) 5058 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 5059 5060 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 5061 5062 // The flag shows whether we use Intrinsic or a usual Call for vectorized 5063 // version of the instruction. 5064 // Is it beneficial to perform intrinsic call compared to lib call? 5065 bool NeedToScalarize = false; 5066 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 5067 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 5068 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 5069 assert((UseVectorIntrinsic || !NeedToScalarize) && 5070 "Instruction should be scalarized elsewhere."); 5071 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 5072 "Either the intrinsic cost or vector call cost must be valid"); 5073 5074 for (unsigned Part = 0; Part < UF; ++Part) { 5075 SmallVector<Value *, 4> Args; 5076 for (auto &I : enumerate(ArgOperands.operands())) { 5077 // Some intrinsics have a scalar argument - don't replace it with a 5078 // vector. 5079 Value *Arg; 5080 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5081 Arg = State.get(I.value(), Part); 5082 else 5083 Arg = State.get(I.value(), VPIteration(0, 0)); 5084 Args.push_back(Arg); 5085 } 5086 5087 Function *VectorF; 5088 if (UseVectorIntrinsic) { 5089 // Use vector version of the intrinsic. 
5090 Type *TysForDecl[] = {CI->getType()}; 5091 if (VF.isVector()) 5092 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5093 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5094 assert(VectorF && "Can't retrieve vector intrinsic."); 5095 } else { 5096 // Use vector version of the function call. 5097 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5098 #ifndef NDEBUG 5099 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5100 "Can't create vector function."); 5101 #endif 5102 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5103 } 5104 SmallVector<OperandBundleDef, 1> OpBundles; 5105 CI->getOperandBundlesAsDefs(OpBundles); 5106 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5107 5108 if (isa<FPMathOperator>(V)) 5109 V->copyFastMathFlags(CI); 5110 5111 State.set(Def, V, Part); 5112 addMetadata(V, &I); 5113 } 5114 } 5115 5116 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5117 VPUser &Operands, 5118 bool InvariantCond, 5119 VPTransformState &State) { 5120 setDebugLocFromInst(Builder, &I); 5121 5122 // The condition can be loop invariant but still defined inside the 5123 // loop. This means that we can't just use the original 'cond' value. 5124 // We have to take the 'vectorized' value and pick the first lane. 5125 // Instcombine will make this a no-op. 5126 auto *InvarCond = InvariantCond 5127 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5128 : nullptr; 5129 5130 for (unsigned Part = 0; Part < UF; ++Part) { 5131 Value *Cond = 5132 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5133 Value *Op0 = State.get(Operands.getOperand(1), Part); 5134 Value *Op1 = State.get(Operands.getOperand(2), Part); 5135 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5136 State.set(VPDef, Sel, Part); 5137 addMetadata(Sel, &I); 5138 } 5139 } 5140 5141 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5142 // We should not collect Scalars more than once per VF. Right now, this 5143 // function is called from collectUniformsAndScalars(), which already does 5144 // this check. Collecting Scalars for VF=1 does not make any sense. 5145 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5146 "This function should not be visited twice for the same VF"); 5147 5148 SmallSetVector<Instruction *, 8> Worklist; 5149 5150 // These sets are used to seed the analysis with pointers used by memory 5151 // accesses that will remain scalar. 5152 SmallSetVector<Instruction *, 8> ScalarPtrs; 5153 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5154 auto *Latch = TheLoop->getLoopLatch(); 5155 5156 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5157 // The pointer operands of loads and stores will be scalar as long as the 5158 // memory access is not a gather or scatter operation. The value operand of a 5159 // store will remain scalar if the store is scalarized. 
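  //
  // For instance (a sketch; %v and %p are hypothetical): for a scalarized
  // `store i32 %v, i32* %p` the uses of both %v and %p are scalar, whereas
  // under a gather/scatter decision the use of %p is not, since the address
  // must then be a vector of pointers.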
5160 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 5161 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 5162 assert(WideningDecision != CM_Unknown && 5163 "Widening decision should be ready at this moment"); 5164 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5165 if (Ptr == Store->getValueOperand()) 5166 return WideningDecision == CM_Scalarize; 5167 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5168 "Ptr is neither a value or pointer operand"); 5169 return WideningDecision != CM_GatherScatter; 5170 }; 5171 5172 // A helper that returns true if the given value is a bitcast or 5173 // getelementptr instruction contained in the loop. 5174 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5175 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5176 isa<GetElementPtrInst>(V)) && 5177 !TheLoop->isLoopInvariant(V); 5178 }; 5179 5180 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5181 if (!isa<PHINode>(Ptr) || 5182 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5183 return false; 5184 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5185 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5186 return false; 5187 return isScalarUse(MemAccess, Ptr); 5188 }; 5189 5190 // A helper that evaluates a memory access's use of a pointer. If the 5191 // pointer is actually the pointer induction of a loop, it is being 5192 // inserted into Worklist. If the use will be a scalar use, and the 5193 // pointer is only used by memory accesses, we place the pointer in 5194 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5195 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5196 if (isScalarPtrInduction(MemAccess, Ptr)) { 5197 Worklist.insert(cast<Instruction>(Ptr)); 5198 Instruction *Update = cast<Instruction>( 5199 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5200 Worklist.insert(Update); 5201 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5202 << "\n"); 5203 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5204 << "\n"); 5205 return; 5206 } 5207 // We only care about bitcast and getelementptr instructions contained in 5208 // the loop. 5209 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5210 return; 5211 5212 // If the pointer has already been identified as scalar (e.g., if it was 5213 // also identified as uniform), there's nothing to do. 5214 auto *I = cast<Instruction>(Ptr); 5215 if (Worklist.count(I)) 5216 return; 5217 5218 // If the use of the pointer will be a scalar use, and all users of the 5219 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5220 // place the pointer in PossibleNonScalarPtrs. 5221 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5222 return isa<LoadInst>(U) || isa<StoreInst>(U); 5223 })) 5224 ScalarPtrs.insert(I); 5225 else 5226 PossibleNonScalarPtrs.insert(I); 5227 }; 5228 5229 // We seed the scalars analysis with three classes of instructions: (1) 5230 // instructions marked uniform-after-vectorization and (2) bitcast, 5231 // getelementptr and (pointer) phi instructions used by memory accesses 5232 // requiring a scalar use. 5233 // 5234 // (1) Add to the worklist all instructions that have been identified as 5235 // uniform-after-vectorization. 
5236 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5237 5238 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5239 // memory accesses requiring a scalar use. The pointer operands of loads and 5240 // stores will be scalar as long as the memory accesses is not a gather or 5241 // scatter operation. The value operand of a store will remain scalar if the 5242 // store is scalarized. 5243 for (auto *BB : TheLoop->blocks()) 5244 for (auto &I : *BB) { 5245 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5246 evaluatePtrUse(Load, Load->getPointerOperand()); 5247 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5248 evaluatePtrUse(Store, Store->getPointerOperand()); 5249 evaluatePtrUse(Store, Store->getValueOperand()); 5250 } 5251 } 5252 for (auto *I : ScalarPtrs) 5253 if (!PossibleNonScalarPtrs.count(I)) { 5254 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5255 Worklist.insert(I); 5256 } 5257 5258 // Insert the forced scalars. 5259 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5260 // induction variable when the PHI user is scalarized. 5261 auto ForcedScalar = ForcedScalars.find(VF); 5262 if (ForcedScalar != ForcedScalars.end()) 5263 for (auto *I : ForcedScalar->second) 5264 Worklist.insert(I); 5265 5266 // Expand the worklist by looking through any bitcasts and getelementptr 5267 // instructions we've already identified as scalar. This is similar to the 5268 // expansion step in collectLoopUniforms(); however, here we're only 5269 // expanding to include additional bitcasts and getelementptr instructions. 5270 unsigned Idx = 0; 5271 while (Idx != Worklist.size()) { 5272 Instruction *Dst = Worklist[Idx++]; 5273 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5274 continue; 5275 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5276 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5277 auto *J = cast<Instruction>(U); 5278 return !TheLoop->contains(J) || Worklist.count(J) || 5279 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5280 isScalarUse(J, Src)); 5281 })) { 5282 Worklist.insert(Src); 5283 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5284 } 5285 } 5286 5287 // An induction variable will remain scalar if all users of the induction 5288 // variable and induction variable update remain scalar. 5289 for (auto &Induction : Legal->getInductionVars()) { 5290 auto *Ind = Induction.first; 5291 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5292 5293 // If tail-folding is applied, the primary induction variable will be used 5294 // to feed a vector compare. 5295 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5296 continue; 5297 5298 // Determine if all users of the induction variable are scalar after 5299 // vectorization. 5300 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5301 auto *I = cast<Instruction>(U); 5302 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5303 }); 5304 if (!ScalarInd) 5305 continue; 5306 5307 // Determine if all users of the induction variable update instruction are 5308 // scalar after vectorization. 5309 auto ScalarIndUpdate = 5310 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5311 auto *I = cast<Instruction>(U); 5312 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5313 }); 5314 if (!ScalarIndUpdate) 5315 continue; 5316 5317 // The induction variable and its update instruction will remain scalar. 
5318 Worklist.insert(Ind); 5319 Worklist.insert(IndUpdate); 5320 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5321 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5322 << "\n"); 5323 } 5324 5325 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5326 } 5327 5328 bool LoopVectorizationCostModel::isScalarWithPredication( 5329 Instruction *I, ElementCount VF) const { 5330 if (!blockNeedsPredication(I->getParent())) 5331 return false; 5332 switch(I->getOpcode()) { 5333 default: 5334 break; 5335 case Instruction::Load: 5336 case Instruction::Store: { 5337 if (!Legal->isMaskRequired(I)) 5338 return false; 5339 auto *Ptr = getLoadStorePointerOperand(I); 5340 auto *Ty = getMemInstValueType(I); 5341 // We have already decided how to vectorize this instruction, get that 5342 // result. 5343 if (VF.isVector()) { 5344 InstWidening WideningDecision = getWideningDecision(I, VF); 5345 assert(WideningDecision != CM_Unknown && 5346 "Widening decision should be ready at this moment"); 5347 return WideningDecision == CM_Scalarize; 5348 } 5349 const Align Alignment = getLoadStoreAlignment(I); 5350 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5351 isLegalMaskedGather(Ty, Alignment)) 5352 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5353 isLegalMaskedScatter(Ty, Alignment)); 5354 } 5355 case Instruction::UDiv: 5356 case Instruction::SDiv: 5357 case Instruction::SRem: 5358 case Instruction::URem: 5359 return mayDivideByZero(*I); 5360 } 5361 return false; 5362 } 5363 5364 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5365 Instruction *I, ElementCount VF) { 5366 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5367 assert(getWideningDecision(I, VF) == CM_Unknown && 5368 "Decision should not be set yet."); 5369 auto *Group = getInterleavedAccessGroup(I); 5370 assert(Group && "Must have a group."); 5371 5372 // If the instruction's allocated size doesn't equal it's type size, it 5373 // requires padding and will be scalarized. 5374 auto &DL = I->getModule()->getDataLayout(); 5375 auto *ScalarTy = getMemInstValueType(I); 5376 if (hasIrregularType(ScalarTy, DL)) 5377 return false; 5378 5379 // Check if masking is required. 5380 // A Group may need masking for one of two reasons: it resides in a block that 5381 // needs predication, or it was decided to use masking to deal with gaps. 5382 bool PredicatedAccessRequiresMasking = 5383 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5384 bool AccessWithGapsRequiresMasking = 5385 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5386 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5387 return true; 5388 5389 // If masked interleaving is required, we expect that the user/target had 5390 // enabled it, because otherwise it either wouldn't have been created or 5391 // it should have been invalidated by the CostModel. 5392 assert(useMaskedInterleavedAccesses(TTI) && 5393 "Masked interleave-groups for predicated accesses are not enabled."); 5394 5395 auto *Ty = getMemInstValueType(I); 5396 const Align Alignment = getLoadStoreAlignment(I); 5397 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5398 : TTI.isLegalMaskedStore(Ty, Alignment); 5399 } 5400 5401 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5402 Instruction *I, ElementCount VF) { 5403 // Get and ensure we have a valid memory instruction. 
5404 LoadInst *LI = dyn_cast<LoadInst>(I); 5405 StoreInst *SI = dyn_cast<StoreInst>(I); 5406 assert((LI || SI) && "Invalid memory instruction"); 5407 5408 auto *Ptr = getLoadStorePointerOperand(I); 5409 5410 // In order to be widened, the pointer should be consecutive, first of all. 5411 if (!Legal->isConsecutivePtr(Ptr)) 5412 return false; 5413 5414 // If the instruction is a store located in a predicated block, it will be 5415 // scalarized. 5416 if (isScalarWithPredication(I)) 5417 return false; 5418 5419 // If the instruction's allocated size doesn't equal it's type size, it 5420 // requires padding and will be scalarized. 5421 auto &DL = I->getModule()->getDataLayout(); 5422 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5423 if (hasIrregularType(ScalarTy, DL)) 5424 return false; 5425 5426 return true; 5427 } 5428 5429 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5430 // We should not collect Uniforms more than once per VF. Right now, 5431 // this function is called from collectUniformsAndScalars(), which 5432 // already does this check. Collecting Uniforms for VF=1 does not make any 5433 // sense. 5434 5435 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5436 "This function should not be visited twice for the same VF"); 5437 5438 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5439 // not analyze again. Uniforms.count(VF) will return 1. 5440 Uniforms[VF].clear(); 5441 5442 // We now know that the loop is vectorizable! 5443 // Collect instructions inside the loop that will remain uniform after 5444 // vectorization. 5445 5446 // Global values, params and instructions outside of current loop are out of 5447 // scope. 5448 auto isOutOfScope = [&](Value *V) -> bool { 5449 Instruction *I = dyn_cast<Instruction>(V); 5450 return (!I || !TheLoop->contains(I)); 5451 }; 5452 5453 SetVector<Instruction *> Worklist; 5454 BasicBlock *Latch = TheLoop->getLoopLatch(); 5455 5456 // Instructions that are scalar with predication must not be considered 5457 // uniform after vectorization, because that would create an erroneous 5458 // replicating region where only a single instance out of VF should be formed. 5459 // TODO: optimize such seldom cases if found important, see PR40816. 5460 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5461 if (isOutOfScope(I)) { 5462 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5463 << *I << "\n"); 5464 return; 5465 } 5466 if (isScalarWithPredication(I, VF)) { 5467 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5468 << *I << "\n"); 5469 return; 5470 } 5471 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5472 Worklist.insert(I); 5473 }; 5474 5475 // Start with the conditional branch. If the branch condition is an 5476 // instruction contained in the loop that is only used by the branch, it is 5477 // uniform. 5478 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5479 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5480 addToWorklistIfAllowed(Cmp); 5481 5482 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5483 InstWidening WideningDecision = getWideningDecision(I, VF); 5484 assert(WideningDecision != CM_Unknown && 5485 "Widening decision should be ready at this moment"); 5486 5487 // A uniform memory op is itself uniform. We exclude uniform stores 5488 // here as they demand the last lane, not the first one. 
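    // For example (a sketch; @g is a hypothetical loop-invariant global), a
    // load such as `%v = load i32, i32* @g` is given a CM_Scalarize decision
    // yet every lane sees the same value, so it is treated as uniform here.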
5489 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5490 assert(WideningDecision == CM_Scalarize); 5491 return true; 5492 } 5493 5494 return (WideningDecision == CM_Widen || 5495 WideningDecision == CM_Widen_Reverse || 5496 WideningDecision == CM_Interleave); 5497 }; 5498 5499 5500 // Returns true if Ptr is the pointer operand of a memory access instruction 5501 // I, and I is known to not require scalarization. 5502 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5503 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5504 }; 5505 5506 // Holds a list of values which are known to have at least one uniform use. 5507 // Note that there may be other uses which aren't uniform. A "uniform use" 5508 // here is something which only demands lane 0 of the unrolled iterations; 5509 // it does not imply that all lanes produce the same value (e.g. this is not 5510 // the usual meaning of uniform) 5511 SetVector<Value *> HasUniformUse; 5512 5513 // Scan the loop for instructions which are either a) known to have only 5514 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5515 for (auto *BB : TheLoop->blocks()) 5516 for (auto &I : *BB) { 5517 // If there's no pointer operand, there's nothing to do. 5518 auto *Ptr = getLoadStorePointerOperand(&I); 5519 if (!Ptr) 5520 continue; 5521 5522 // A uniform memory op is itself uniform. We exclude uniform stores 5523 // here as they demand the last lane, not the first one. 5524 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5525 addToWorklistIfAllowed(&I); 5526 5527 if (isUniformDecision(&I, VF)) { 5528 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5529 HasUniformUse.insert(Ptr); 5530 } 5531 } 5532 5533 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5534 // demanding) users. Since loops are assumed to be in LCSSA form, this 5535 // disallows uses outside the loop as well. 5536 for (auto *V : HasUniformUse) { 5537 if (isOutOfScope(V)) 5538 continue; 5539 auto *I = cast<Instruction>(V); 5540 auto UsersAreMemAccesses = 5541 llvm::all_of(I->users(), [&](User *U) -> bool { 5542 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5543 }); 5544 if (UsersAreMemAccesses) 5545 addToWorklistIfAllowed(I); 5546 } 5547 5548 // Expand Worklist in topological order: whenever a new instruction 5549 // is added , its users should be already inside Worklist. It ensures 5550 // a uniform instruction will only be used by uniform instructions. 5551 unsigned idx = 0; 5552 while (idx != Worklist.size()) { 5553 Instruction *I = Worklist[idx++]; 5554 5555 for (auto OV : I->operand_values()) { 5556 // isOutOfScope operands cannot be uniform instructions. 5557 if (isOutOfScope(OV)) 5558 continue; 5559 // First order recurrence Phi's should typically be considered 5560 // non-uniform. 5561 auto *OP = dyn_cast<PHINode>(OV); 5562 if (OP && Legal->isFirstOrderRecurrence(OP)) 5563 continue; 5564 // If all the users of the operand are uniform, then add the 5565 // operand into the uniform worklist. 5566 auto *OI = cast<Instruction>(OV); 5567 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5568 auto *J = cast<Instruction>(U); 5569 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5570 })) 5571 addToWorklistIfAllowed(OI); 5572 } 5573 } 5574 5575 // For an instruction to be added into Worklist above, all its users inside 5576 // the loop should also be in Worklist. 
However, this condition cannot be 5577 // true for phi nodes that form a cyclic dependence. We must process phi 5578 // nodes separately. An induction variable will remain uniform if all users 5579 // of the induction variable and induction variable update remain uniform. 5580 // The code below handles both pointer and non-pointer induction variables. 5581 for (auto &Induction : Legal->getInductionVars()) { 5582 auto *Ind = Induction.first; 5583 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5584 5585 // Determine if all users of the induction variable are uniform after 5586 // vectorization. 5587 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5588 auto *I = cast<Instruction>(U); 5589 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5590 isVectorizedMemAccessUse(I, Ind); 5591 }); 5592 if (!UniformInd) 5593 continue; 5594 5595 // Determine if all users of the induction variable update instruction are 5596 // uniform after vectorization. 5597 auto UniformIndUpdate = 5598 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5599 auto *I = cast<Instruction>(U); 5600 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5601 isVectorizedMemAccessUse(I, IndUpdate); 5602 }); 5603 if (!UniformIndUpdate) 5604 continue; 5605 5606 // The induction variable and its update instruction will remain uniform. 5607 addToWorklistIfAllowed(Ind); 5608 addToWorklistIfAllowed(IndUpdate); 5609 } 5610 5611 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5612 } 5613 5614 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5615 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5616 5617 if (Legal->getRuntimePointerChecking()->Need) { 5618 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5619 "runtime pointer checks needed. Enable vectorization of this " 5620 "loop with '#pragma clang loop vectorize(enable)' when " 5621 "compiling with -Os/-Oz", 5622 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5623 return true; 5624 } 5625 5626 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5627 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5628 "runtime SCEV checks needed. Enable vectorization of this " 5629 "loop with '#pragma clang loop vectorize(enable)' when " 5630 "compiling with -Os/-Oz", 5631 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5632 return true; 5633 } 5634 5635 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5636 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5637 reportVectorizationFailure("Runtime stride check for small trip count", 5638 "runtime stride == 1 checks needed. Enable vectorization of " 5639 "this loop without such check by compiling with -Os/-Oz", 5640 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5641 return true; 5642 } 5643 5644 return false; 5645 } 5646 5647 ElementCount 5648 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5649 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5650 reportVectorizationInfo( 5651 "Disabling scalable vectorization, because target does not " 5652 "support scalable vectors.", 5653 "ScalableVectorsUnsupported", ORE, TheLoop); 5654 return ElementCount::getScalable(0); 5655 } 5656 5657 auto MaxScalableVF = ElementCount::getScalable( 5658 std::numeric_limits<ElementCount::ScalarTy>::max()); 5659 5660 // Disable scalable vectorization if the loop contains unsupported reductions. 
5661 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5662 // FIXME: While for scalable vectors this is currently sufficient, this should 5663 // be replaced by a more detailed mechanism that filters out specific VFs, 5664 // instead of invalidating vectorization for a whole set of VFs based on the 5665 // MaxVF. 5666 if (!canVectorizeReductions(MaxScalableVF)) { 5667 reportVectorizationInfo( 5668 "Scalable vectorization not supported for the reduction " 5669 "operations found in this loop.", 5670 "ScalableVFUnfeasible", ORE, TheLoop); 5671 return ElementCount::getScalable(0); 5672 } 5673 5674 if (Legal->isSafeForAnyVectorWidth()) 5675 return MaxScalableVF; 5676 5677 // Limit MaxScalableVF by the maximum safe dependence distance. 5678 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5679 MaxScalableVF = ElementCount::getScalable( 5680 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5681 if (!MaxScalableVF) 5682 reportVectorizationInfo( 5683 "Max legal vector width too small, scalable vectorization " 5684 "unfeasible.", 5685 "ScalableVFUnfeasible", ORE, TheLoop); 5686 5687 return MaxScalableVF; 5688 } 5689 5690 ElementCount 5691 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5692 ElementCount UserVF) { 5693 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5694 unsigned SmallestType, WidestType; 5695 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5696 5697 // Get the maximum safe dependence distance in bits computed by LAA. 5698 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5699 // the memory accesses that is most restrictive (involved in the smallest 5700 // dependence distance). 5701 unsigned MaxSafeElements = 5702 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5703 5704 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5705 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5706 5707 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5708 << ".\n"); 5709 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5710 << ".\n"); 5711 5712 // First analyze the UserVF, fall back if the UserVF should be ignored. 5713 if (UserVF) { 5714 auto MaxSafeUserVF = 5715 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5716 5717 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) 5718 return UserVF; 5719 5720 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5721 5722 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5723 // is better to ignore the hint and let the compiler choose a suitable VF. 5724 if (!UserVF.isScalable()) { 5725 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5726 << " is unsafe, clamping to max safe VF=" 5727 << MaxSafeFixedVF << ".\n"); 5728 ORE->emit([&]() { 5729 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5730 TheLoop->getStartLoc(), 5731 TheLoop->getHeader()) 5732 << "User-specified vectorization factor " 5733 << ore::NV("UserVectorizationFactor", UserVF) 5734 << " is unsafe, clamping to maximum safe vectorization factor " 5735 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5736 }); 5737 return MaxSafeFixedVF; 5738 } 5739 5740 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5741 << " is unsafe. 
Ignoring scalable UserVF.\n"); 5742 ORE->emit([&]() { 5743 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5744 TheLoop->getStartLoc(), 5745 TheLoop->getHeader()) 5746 << "User-specified vectorization factor " 5747 << ore::NV("UserVectorizationFactor", UserVF) 5748 << " is unsafe. Ignoring the hint to let the compiler pick a " 5749 "suitable VF."; 5750 }); 5751 } 5752 5753 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5754 << " / " << WidestType << " bits.\n"); 5755 5756 ElementCount MaxFixedVF = ElementCount::getFixed(1); 5757 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5758 WidestType, MaxSafeFixedVF)) 5759 MaxFixedVF = MaxVF; 5760 5761 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5762 WidestType, MaxSafeScalableVF)) 5763 // FIXME: Return scalable VF as well (to be added in future patch). 5764 if (MaxVF.isScalable()) 5765 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5766 << "\n"); 5767 5768 return MaxFixedVF; 5769 } 5770 5771 Optional<ElementCount> 5772 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5773 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5774 // TODO: It may by useful to do since it's still likely to be dynamically 5775 // uniform if the target can skip. 5776 reportVectorizationFailure( 5777 "Not inserting runtime ptr check for divergent target", 5778 "runtime pointer checks needed. Not enabled for divergent target", 5779 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5780 return None; 5781 } 5782 5783 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5784 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5785 if (TC == 1) { 5786 reportVectorizationFailure("Single iteration (non) loop", 5787 "loop trip count is one, irrelevant for vectorization", 5788 "SingleIterationLoop", ORE, TheLoop); 5789 return None; 5790 } 5791 5792 switch (ScalarEpilogueStatus) { 5793 case CM_ScalarEpilogueAllowed: 5794 return computeFeasibleMaxVF(TC, UserVF); 5795 case CM_ScalarEpilogueNotAllowedUsePredicate: 5796 LLVM_FALLTHROUGH; 5797 case CM_ScalarEpilogueNotNeededUsePredicate: 5798 LLVM_DEBUG( 5799 dbgs() << "LV: vector predicate hint/switch found.\n" 5800 << "LV: Not allowing scalar epilogue, creating predicated " 5801 << "vector loop.\n"); 5802 break; 5803 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5804 // fallthrough as a special case of OptForSize 5805 case CM_ScalarEpilogueNotAllowedOptSize: 5806 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5807 LLVM_DEBUG( 5808 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5809 else 5810 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5811 << "count.\n"); 5812 5813 // Bail if runtime checks are required, which are not good when optimising 5814 // for size. 5815 if (runtimeChecksRequired()) 5816 return None; 5817 5818 break; 5819 } 5820 5821 // The only loops we can vectorize without a scalar epilogue, are loops with 5822 // a bottom-test and a single exiting block. We'd have to handle the fact 5823 // that not every instruction executes on the last iteration. This will 5824 // require a lane mask which varies through the vector loop body. (TODO) 5825 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5826 // If there was a tail-folding hint/switch, but we can't fold the tail by 5827 // masking, fallback to a vectorization with a scalar epilogue. 
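    // For illustration (hypothetical numbers): with a trip count of 10 and
    // VF = 4 the vector body covers 8 iterations and leaves a 2-iteration
    // tail, which must either be handled by masking inside the vector loop or
    // by a scalar epilogue loop.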
5828 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5829 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5830 "scalar epilogue instead.\n"); 5831 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5832 return computeFeasibleMaxVF(TC, UserVF); 5833 } 5834 return None; 5835 } 5836 5837 // Now try the tail folding 5838 5839 // Invalidate interleave groups that require an epilogue if we can't mask 5840 // the interleave-group. 5841 if (!useMaskedInterleavedAccesses(TTI)) { 5842 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && 5843 "No decisions should have been taken at this point"); 5844 // Note: There is no need to invalidate any cost modeling decisions here, as 5845 // non where taken so far. 5846 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 5847 } 5848 5849 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF); 5850 assert(!MaxVF.isScalable() && 5851 "Scalable vectors do not yet support tail folding"); 5852 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) && 5853 "MaxVF must be a power of 2"); 5854 unsigned MaxVFtimesIC = 5855 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue(); 5856 // Avoid tail folding if the trip count is known to be a multiple of any VF we 5857 // chose. 5858 ScalarEvolution *SE = PSE.getSE(); 5859 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5860 const SCEV *ExitCount = SE->getAddExpr( 5861 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5862 const SCEV *Rem = SE->getURemExpr( 5863 SE->applyLoopGuards(ExitCount, TheLoop), 5864 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5865 if (Rem->isZero()) { 5866 // Accept MaxVF if we do not have a tail. 5867 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5868 return MaxVF; 5869 } 5870 5871 // If we don't know the precise trip count, or if the trip count that we 5872 // found modulo the vectorization factor is not zero, try to fold the tail 5873 // by masking. 5874 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5875 if (Legal->prepareToFoldTailByMasking()) { 5876 FoldTailByMasking = true; 5877 return MaxVF; 5878 } 5879 5880 // If there was a tail-folding hint/switch, but we can't fold the tail by 5881 // masking, fallback to a vectorization with a scalar epilogue. 5882 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5883 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5884 "scalar epilogue instead.\n"); 5885 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5886 return MaxVF; 5887 } 5888 5889 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5890 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5891 return None; 5892 } 5893 5894 if (TC == 0) { 5895 reportVectorizationFailure( 5896 "Unable to calculate the loop count due to complex control flow", 5897 "unable to calculate the loop count due to complex control flow", 5898 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5899 return None; 5900 } 5901 5902 reportVectorizationFailure( 5903 "Cannot optimize for size and vectorize at the same time.", 5904 "cannot optimize for size and vectorize at the same time. 
" 5905 "Enable vectorization of this loop with '#pragma clang loop " 5906 "vectorize(enable)' when compiling with -Os/-Oz", 5907 "NoTailLoopWithOptForSize", ORE, TheLoop); 5908 return None; 5909 } 5910 5911 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5912 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5913 const ElementCount &MaxSafeVF) { 5914 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5915 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5916 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5917 : TargetTransformInfo::RGK_FixedWidthVector); 5918 5919 // Convenience function to return the minimum of two ElementCounts. 5920 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5921 assert((LHS.isScalable() == RHS.isScalable()) && 5922 "Scalable flags must match"); 5923 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5924 }; 5925 5926 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5927 // Note that both WidestRegister and WidestType may not be a powers of 2. 5928 auto MaxVectorElementCount = ElementCount::get( 5929 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5930 ComputeScalableMaxVF); 5931 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5932 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5933 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5934 5935 if (!MaxVectorElementCount) { 5936 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5937 return ElementCount::getFixed(1); 5938 } 5939 5940 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5941 if (ConstTripCount && 5942 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5943 isPowerOf2_32(ConstTripCount)) { 5944 // We need to clamp the VF to be the ConstTripCount. There is no point in 5945 // choosing a higher viable VF as done in the loop below. If 5946 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5947 // the TC is less than or equal to the known number of lanes. 5948 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5949 << ConstTripCount << "\n"); 5950 return TripCountEC; 5951 } 5952 5953 ElementCount MaxVF = MaxVectorElementCount; 5954 if (TTI.shouldMaximizeVectorBandwidth() || 5955 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5956 auto MaxVectorElementCountMaxBW = ElementCount::get( 5957 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5958 ComputeScalableMaxVF); 5959 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5960 5961 // Collect all viable vectorization factors larger than the default MaxVF 5962 // (i.e. MaxVectorElementCount). 5963 SmallVector<ElementCount, 8> VFs; 5964 for (ElementCount VS = MaxVectorElementCount * 2; 5965 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5966 VFs.push_back(VS); 5967 5968 // For each VF calculate its register usage. 5969 auto RUs = calculateRegisterUsage(VFs); 5970 5971 // Select the largest VF which doesn't require more registers than existing 5972 // ones. 
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    if (ElementCount MinVF =
            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}

bool LoopVectorizationCostModel::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {
  InstructionCost::CostType CostA = *A.Cost.getValue();
  InstructionCost::CostType CostB = *B.Cost.getValue();

  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);

  if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
      MaxTripCount) {
    // If we are folding the tail and the trip count is a known (possibly small)
    // constant, the trip count will be rounded up to an integer number of
    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
    // which we compare directly. When not folding the tail, the total cost will
    // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the trip
    // count as here.
    int64_t RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
    int64_t RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
    return RTCostA < RTCostB;
  }

  // To avoid the need for FP division:
  //      (CostA / A.Width) < (CostB / B.Width)
  // <=>  (CostA * B.Width) < (CostB * A.Width)
  return (CostA * B.Width.getKnownMinValue()) <
         (CostB * A.Width.getKnownMinValue());
}

VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
  // FIXME: This can be fixed for scalable vectors later, because at this stage
  // the LoopVectorizer will only consider vectorizing a loop with scalable
  // vectors when the loop has a hint to enable vectorization for a given VF.
  assert(!MaxVF.isScalable() && "scalable vectors not yet supported");

  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF.isVector()) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
  }

  for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
       i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
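    // E.g. (hypothetical costs): a VF = 4 body costing 20 compares against a
    // scalar body costing 8 as 20 / 4 = 5 per lane versus 8, so the VF = 4
    // candidate would be considered more profitable than the scalar loop.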
6052 VectorizationCostTy C = expectedCost(i); 6053 6054 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 6055 VectorizationFactor Candidate(i, C.first); 6056 LLVM_DEBUG( 6057 dbgs() << "LV: Vector loop of width " << i << " costs: " 6058 << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue()) 6059 << ".\n"); 6060 6061 if (!C.second && !ForceVectorization) { 6062 LLVM_DEBUG( 6063 dbgs() << "LV: Not considering vector loop of width " << i 6064 << " because it will not generate any vector instructions.\n"); 6065 continue; 6066 } 6067 6068 // If profitable add it to ProfitableVF list. 6069 if (isMoreProfitable(Candidate, ScalarCost)) 6070 ProfitableVFs.push_back(Candidate); 6071 6072 if (isMoreProfitable(Candidate, ChosenFactor)) 6073 ChosenFactor = Candidate; 6074 } 6075 6076 if (!EnableCondStoresVectorization && NumPredStores) { 6077 reportVectorizationFailure("There are conditional stores.", 6078 "store that is conditionally executed prevents vectorization", 6079 "ConditionalStore", ORE, TheLoop); 6080 ChosenFactor = ScalarCost; 6081 } 6082 6083 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6084 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue()) 6085 dbgs() 6086 << "LV: Vectorization seems to be not beneficial, " 6087 << "but was forced by a user.\n"); 6088 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6089 return ChosenFactor; 6090 } 6091 6092 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6093 const Loop &L, ElementCount VF) const { 6094 // Cross iteration phis such as reductions need special handling and are 6095 // currently unsupported. 6096 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6097 return Legal->isFirstOrderRecurrence(&Phi) || 6098 Legal->isReductionVariable(&Phi); 6099 })) 6100 return false; 6101 6102 // Phis with uses outside of the loop require special handling and are 6103 // currently unsupported. 6104 for (auto &Entry : Legal->getInductionVars()) { 6105 // Look for uses of the value of the induction at the last iteration. 6106 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6107 for (User *U : PostInc->users()) 6108 if (!L.contains(cast<Instruction>(U))) 6109 return false; 6110 // Look for uses of penultimate value of the induction. 6111 for (User *U : Entry.first->users()) 6112 if (!L.contains(cast<Instruction>(U))) 6113 return false; 6114 } 6115 6116 // Induction variables that are widened require special handling that is 6117 // currently not supported. 6118 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6119 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6120 this->isProfitableToScalarize(Entry.first, VF)); 6121 })) 6122 return false; 6123 6124 return true; 6125 } 6126 6127 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6128 const ElementCount VF) const { 6129 // FIXME: We need a much better cost-model to take different parameters such 6130 // as register pressure, code size increase and cost of extra branches into 6131 // account. For now we apply a very crude heuristic and only consider loops 6132 // with vectorization factors larger than a certain value. 6133 // We also consider epilogue vectorization unprofitable for targets that don't 6134 // consider interleaving beneficial (eg. MVE). 
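  // As a rough sketch (hypothetical numbers): a main loop vectorized with
  // VF = 16 can be followed by a narrower vector epilogue (say VF = 8) that
  // consumes most of the remaining iterations; for small main-loop VFs the
  // extra loop and branches are unlikely to pay for themselves, hence the
  // minimum-VF threshold below.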
6135 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6136 return false; 6137 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6138 return true; 6139 return false; 6140 } 6141 6142 VectorizationFactor 6143 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6144 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6145 VectorizationFactor Result = VectorizationFactor::Disabled(); 6146 if (!EnableEpilogueVectorization) { 6147 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6148 return Result; 6149 } 6150 6151 if (!isScalarEpilogueAllowed()) { 6152 LLVM_DEBUG( 6153 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6154 "allowed.\n";); 6155 return Result; 6156 } 6157 6158 // FIXME: This can be fixed for scalable vectors later, because at this stage 6159 // the LoopVectorizer will only consider vectorizing a loop with scalable 6160 // vectors when the loop has a hint to enable vectorization for a given VF. 6161 if (MainLoopVF.isScalable()) { 6162 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6163 "yet supported.\n"); 6164 return Result; 6165 } 6166 6167 // Not really a cost consideration, but check for unsupported cases here to 6168 // simplify the logic. 6169 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6170 LLVM_DEBUG( 6171 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6172 "not a supported candidate.\n";); 6173 return Result; 6174 } 6175 6176 if (EpilogueVectorizationForceVF > 1) { 6177 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6178 if (LVP.hasPlanWithVFs( 6179 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6180 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6181 else { 6182 LLVM_DEBUG( 6183 dbgs() 6184 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6185 return Result; 6186 } 6187 } 6188 6189 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6190 TheLoop->getHeader()->getParent()->hasMinSize()) { 6191 LLVM_DEBUG( 6192 dbgs() 6193 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6194 return Result; 6195 } 6196 6197 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6198 return Result; 6199 6200 for (auto &NextVF : ProfitableVFs) 6201 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6202 (Result.Width.getFixedValue() == 1 || 6203 isMoreProfitable(NextVF, Result)) && 6204 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6205 Result = NextVF; 6206 6207 if (Result != VectorizationFactor::Disabled()) 6208 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6209 << Result.Width.getFixedValue() << "\n";); 6210 return Result; 6211 } 6212 6213 std::pair<unsigned, unsigned> 6214 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6215 unsigned MinWidth = -1U; 6216 unsigned MaxWidth = 8; 6217 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6218 6219 // For each block. 6220 for (BasicBlock *BB : TheLoop->blocks()) { 6221 // For each instruction in the loop. 6222 for (Instruction &I : BB->instructionsWithoutDebug()) { 6223 Type *T = I.getType(); 6224 6225 // Skip ignored values. 6226 if (ValuesToIgnore.count(&I)) 6227 continue; 6228 6229 // Only examine Loads, Stores and PHINodes. 6230 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6231 continue; 6232 6233 // Examine PHI nodes that are reduction variables. Update the type to 6234 // account for the recurrence type. 
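      // For instance (a hypothetical case), a sum of i8 values accumulated
      // through an i32 phi may be recognized with an i8 recurrence type;
      // using that narrower type here keeps the smallest/widest-type
      // calculation at 8 bits rather than 32.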
6235 if (auto *PN = dyn_cast<PHINode>(&I)) { 6236 if (!Legal->isReductionVariable(PN)) 6237 continue; 6238 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6239 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6240 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6241 RdxDesc.getRecurrenceType(), 6242 TargetTransformInfo::ReductionFlags())) 6243 continue; 6244 T = RdxDesc.getRecurrenceType(); 6245 } 6246 6247 // Examine the stored values. 6248 if (auto *ST = dyn_cast<StoreInst>(&I)) 6249 T = ST->getValueOperand()->getType(); 6250 6251 // Ignore loaded pointer types and stored pointer types that are not 6252 // vectorizable. 6253 // 6254 // FIXME: The check here attempts to predict whether a load or store will 6255 // be vectorized. We only know this for certain after a VF has 6256 // been selected. Here, we assume that if an access can be 6257 // vectorized, it will be. We should also look at extending this 6258 // optimization to non-pointer types. 6259 // 6260 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6261 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6262 continue; 6263 6264 MinWidth = std::min(MinWidth, 6265 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6266 MaxWidth = std::max(MaxWidth, 6267 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6268 } 6269 } 6270 6271 return {MinWidth, MaxWidth}; 6272 } 6273 6274 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6275 unsigned LoopCost) { 6276 // -- The interleave heuristics -- 6277 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6278 // There are many micro-architectural considerations that we can't predict 6279 // at this level. For example, frontend pressure (on decode or fetch) due to 6280 // code size, or the number and capabilities of the execution ports. 6281 // 6282 // We use the following heuristics to select the interleave count: 6283 // 1. If the code has reductions, then we interleave to break the cross 6284 // iteration dependency. 6285 // 2. If the loop is really small, then we interleave to reduce the loop 6286 // overhead. 6287 // 3. We don't interleave if we think that we will spill registers to memory 6288 // due to the increased register pressure. 6289 6290 if (!isScalarEpilogueAllowed()) 6291 return 1; 6292 6293 // We used the distance for the interleave count. 6294 if (Legal->getMaxSafeDepDistBytes() != -1U) 6295 return 1; 6296 6297 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6298 const bool HasReductions = !Legal->getReductionVars().empty(); 6299 // Do not interleave loops with a relatively small known or estimated trip 6300 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6301 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6302 // because with the above conditions interleaving can expose ILP and break 6303 // cross iteration dependences for reductions. 6304 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6305 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6306 return 1; 6307 6308 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6309 // We divide by these constants so assume that we have at least one 6310 // instruction that uses at least one register. 6311 for (auto& pair : R.MaxLocalUsers) { 6312 pair.second = std::max(pair.second, 1U); 6313 } 6314 6315 // We calculate the interleave count using the following formula. 
6316 // Subtract the number of loop invariants from the number of available 6317 // registers. These registers are used by all of the interleaved instances. 6318 // Next, divide the remaining registers by the number of registers that is 6319 // required by the loop, in order to estimate how many parallel instances 6320 // fit without causing spills. All of this is rounded down if necessary to be 6321 // a power of two. We want power of two interleave count to simplify any 6322 // addressing operations or alignment considerations. 6323 // We also want power of two interleave counts to ensure that the induction 6324 // variable of the vector loop wraps to zero, when tail is folded by masking; 6325 // this currently happens when OptForSize, in which case IC is set to 1 above. 6326 unsigned IC = UINT_MAX; 6327 6328 for (auto& pair : R.MaxLocalUsers) { 6329 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6330 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6331 << " registers of " 6332 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6333 if (VF.isScalar()) { 6334 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6335 TargetNumRegisters = ForceTargetNumScalarRegs; 6336 } else { 6337 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6338 TargetNumRegisters = ForceTargetNumVectorRegs; 6339 } 6340 unsigned MaxLocalUsers = pair.second; 6341 unsigned LoopInvariantRegs = 0; 6342 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6343 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6344 6345 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6346 // Don't count the induction variable as interleaved. 6347 if (EnableIndVarRegisterHeur) { 6348 TmpIC = 6349 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6350 std::max(1U, (MaxLocalUsers - 1))); 6351 } 6352 6353 IC = std::min(IC, TmpIC); 6354 } 6355 6356 // Clamp the interleave ranges to reasonable counts. 6357 unsigned MaxInterleaveCount = 6358 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6359 6360 // Check if the user has overridden the max. 6361 if (VF.isScalar()) { 6362 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6363 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6364 } else { 6365 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6366 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6367 } 6368 6369 // If trip count is known or estimated compile time constant, limit the 6370 // interleave count to be less than the trip count divided by VF, provided it 6371 // is at least 1. 6372 // 6373 // For scalable vectors we can't know if interleaving is beneficial. It may 6374 // not be beneficial for small loops if none of the lanes in the second vector 6375 // iterations is enabled. However, for larger loops, there is likely to be a 6376 // similar benefit as for fixed-width vectors. For now, we choose to leave 6377 // the InterleaveCount as if vscale is '1', although if some information about 6378 // the vector is known (e.g. min vector size), we can make a better decision. 6379 if (BestKnownTC) { 6380 MaxInterleaveCount = 6381 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6382 // Make sure MaxInterleaveCount is greater than 0. 
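    // E.g. (hypothetical numbers): with BestKnownTC = 20 and VF = 4 the
    // interleave count is clamped to 20 / 4 = 5 above; the std::max below
    // keeps the result at least 1 in case the estimated trip count is smaller
    // than VF.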
6383 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6384 } 6385 6386 assert(MaxInterleaveCount > 0 && 6387 "Maximum interleave count must be greater than 0"); 6388 6389 // Clamp the calculated IC to be between the 1 and the max interleave count 6390 // that the target and trip count allows. 6391 if (IC > MaxInterleaveCount) 6392 IC = MaxInterleaveCount; 6393 else 6394 // Make sure IC is greater than 0. 6395 IC = std::max(1u, IC); 6396 6397 assert(IC > 0 && "Interleave count must be greater than 0."); 6398 6399 // If we did not calculate the cost for VF (because the user selected the VF) 6400 // then we calculate the cost of VF here. 6401 if (LoopCost == 0) { 6402 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6403 LoopCost = *expectedCost(VF).first.getValue(); 6404 } 6405 6406 assert(LoopCost && "Non-zero loop cost expected"); 6407 6408 // Interleave if we vectorized this loop and there is a reduction that could 6409 // benefit from interleaving. 6410 if (VF.isVector() && HasReductions) { 6411 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6412 return IC; 6413 } 6414 6415 // Note that if we've already vectorized the loop we will have done the 6416 // runtime check and so interleaving won't require further checks. 6417 bool InterleavingRequiresRuntimePointerCheck = 6418 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6419 6420 // We want to interleave small loops in order to reduce the loop overhead and 6421 // potentially expose ILP opportunities. 6422 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6423 << "LV: IC is " << IC << '\n' 6424 << "LV: VF is " << VF << '\n'); 6425 const bool AggressivelyInterleaveReductions = 6426 TTI.enableAggressiveInterleaving(HasReductions); 6427 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6428 // We assume that the cost overhead is 1 and we use the cost model 6429 // to estimate the cost of the loop and interleave until the cost of the 6430 // loop overhead is about 5% of the cost of the loop. 6431 unsigned SmallIC = 6432 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6433 6434 // Interleave until store/load ports (estimated by max interleave count) are 6435 // saturated. 6436 unsigned NumStores = Legal->getNumStores(); 6437 unsigned NumLoads = Legal->getNumLoads(); 6438 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6439 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6440 6441 // If we have a scalar reduction (vector reductions are already dealt with 6442 // by this point), we can increase the critical path length if the loop 6443 // we're interleaving is inside another loop. Limit, by default to 2, so the 6444 // critical path only gets increased by one reduction operation. 6445 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6446 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6447 SmallIC = std::min(SmallIC, F); 6448 StoresIC = std::min(StoresIC, F); 6449 LoadsIC = std::min(LoadsIC, F); 6450 } 6451 6452 if (EnableLoadStoreRuntimeInterleave && 6453 std::max(StoresIC, LoadsIC) > SmallIC) { 6454 LLVM_DEBUG( 6455 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6456 return std::max(StoresIC, LoadsIC); 6457 } 6458 6459 // If there are scalar reductions and TTI has enabled aggressive 6460 // interleaving for reductions, we will interleave to expose ILP. 
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instructions that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
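        // For example (hypothetical), a base pointer or loop bound computed
        // before the loop is recorded in LoopInvariants and counted once;
        // unlike in-loop values, it does not need an additional register per
        // interleaved copy of the loop body.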
6536 if (!TheLoop->contains(Instr)) { 6537 LoopInvariants.insert(Instr); 6538 continue; 6539 } 6540 6541 // Overwrite previous end points. 6542 EndPoint[Instr] = IdxToInstr.size(); 6543 Ends.insert(Instr); 6544 } 6545 } 6546 } 6547 6548 // Saves the list of intervals that end with the index in 'key'. 6549 using InstrList = SmallVector<Instruction *, 2>; 6550 DenseMap<unsigned, InstrList> TransposeEnds; 6551 6552 // Transpose the EndPoints to a list of values that end at each index. 6553 for (auto &Interval : EndPoint) 6554 TransposeEnds[Interval.second].push_back(Interval.first); 6555 6556 SmallPtrSet<Instruction *, 8> OpenIntervals; 6557 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6558 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6559 6560 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6561 6562 // A lambda that gets the register usage for the given type and VF. 6563 const auto &TTICapture = TTI; 6564 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6565 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6566 return 0U; 6567 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6568 }; 6569 6570 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6571 Instruction *I = IdxToInstr[i]; 6572 6573 // Remove all of the instructions that end at this location. 6574 InstrList &List = TransposeEnds[i]; 6575 for (Instruction *ToRemove : List) 6576 OpenIntervals.erase(ToRemove); 6577 6578 // Ignore instructions that are never used within the loop. 6579 if (!Ends.count(I)) 6580 continue; 6581 6582 // Skip ignored values. 6583 if (ValuesToIgnore.count(I)) 6584 continue; 6585 6586 // For each VF find the maximum usage of registers. 6587 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6588 // Count the number of live intervals. 6589 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6590 6591 if (VFs[j].isScalar()) { 6592 for (auto Inst : OpenIntervals) { 6593 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6594 if (RegUsage.find(ClassID) == RegUsage.end()) 6595 RegUsage[ClassID] = 1; 6596 else 6597 RegUsage[ClassID] += 1; 6598 } 6599 } else { 6600 collectUniformsAndScalars(VFs[j]); 6601 for (auto Inst : OpenIntervals) { 6602 // Skip ignored values for VF > 1. 6603 if (VecValuesToIgnore.count(Inst)) 6604 continue; 6605 if (isScalarAfterVectorization(Inst, VFs[j])) { 6606 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6607 if (RegUsage.find(ClassID) == RegUsage.end()) 6608 RegUsage[ClassID] = 1; 6609 else 6610 RegUsage[ClassID] += 1; 6611 } else { 6612 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6613 if (RegUsage.find(ClassID) == RegUsage.end()) 6614 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6615 else 6616 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6617 } 6618 } 6619 } 6620 6621 for (auto& pair : RegUsage) { 6622 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6623 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6624 else 6625 MaxUsages[j][pair.first] = pair.second; 6626 } 6627 } 6628 6629 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6630 << OpenIntervals.size() << '\n'); 6631 6632 // Add the current instruction to the list of open intervals. 
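    // For instance (hypothetical): if three i32 values are live across this
    // point and VFs[j] is 4, the estimate counts three open intervals and
    // typically charges one <4 x i32> register apiece to the vector register
    // class at this location.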
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I, ElementCount::getFixed(1)) &&
         "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
6717 if (!useEmulatedMaskMemRefHack(&I) && 6718 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6719 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6720 // Remember that BB will remain after vectorization. 6721 PredicatedBBsAfterVectorization.insert(BB); 6722 } 6723 } 6724 } 6725 6726 int LoopVectorizationCostModel::computePredInstDiscount( 6727 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6728 assert(!isUniformAfterVectorization(PredInst, VF) && 6729 "Instruction marked uniform-after-vectorization will be predicated"); 6730 6731 // Initialize the discount to zero, meaning that the scalar version and the 6732 // vector version cost the same. 6733 InstructionCost Discount = 0; 6734 6735 // Holds instructions to analyze. The instructions we visit are mapped in 6736 // ScalarCosts. Those instructions are the ones that would be scalarized if 6737 // we find that the scalar version costs less. 6738 SmallVector<Instruction *, 8> Worklist; 6739 6740 // Returns true if the given instruction can be scalarized. 6741 auto canBeScalarized = [&](Instruction *I) -> bool { 6742 // We only attempt to scalarize instructions forming a single-use chain 6743 // from the original predicated block that would otherwise be vectorized. 6744 // Although not strictly necessary, we give up on instructions we know will 6745 // already be scalar to avoid traversing chains that are unlikely to be 6746 // beneficial. 6747 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6748 isScalarAfterVectorization(I, VF)) 6749 return false; 6750 6751 // If the instruction is scalar with predication, it will be analyzed 6752 // separately. We ignore it within the context of PredInst. 6753 if (isScalarWithPredication(I)) 6754 return false; 6755 6756 // If any of the instruction's operands are uniform after vectorization, 6757 // the instruction cannot be scalarized. This prevents, for example, a 6758 // masked load from being scalarized. 6759 // 6760 // We assume we will only emit a value for lane zero of an instruction 6761 // marked uniform after vectorization, rather than VF identical values. 6762 // Thus, if we scalarize an instruction that uses a uniform, we would 6763 // create uses of values corresponding to the lanes we aren't emitting code 6764 // for. This behavior can be changed by allowing getScalarValue to clone 6765 // the lane zero values for uniforms rather than asserting. 6766 for (Use &U : I->operands()) 6767 if (auto *J = dyn_cast<Instruction>(U.get())) 6768 if (isUniformAfterVectorization(J, VF)) 6769 return false; 6770 6771 // Otherwise, we can scalarize the instruction. 6772 return true; 6773 }; 6774 6775 // Compute the expected cost discount from scalarizing the entire expression 6776 // feeding the predicated instruction. We currently only consider expressions 6777 // that are single-use instruction chains. 6778 Worklist.push_back(PredInst); 6779 while (!Worklist.empty()) { 6780 Instruction *I = Worklist.pop_back_val(); 6781 6782 // If we've already analyzed the instruction, there's nothing to do. 6783 if (ScalarCosts.find(I) != ScalarCosts.end()) 6784 continue; 6785 6786 // Compute the cost of the vector instruction. Note that this cost already 6787 // includes the scalarization overhead of the predicated instruction. 6788 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6789 6790 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6791 // the instruction as if it wasn't if-converted and instead remained in the 6792 // predicated block. We will scale this cost by block probability after 6793 // computing the scalarization overhead. 6794 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6795 InstructionCost ScalarCost = 6796 VF.getKnownMinValue() * 6797 getInstructionCost(I, ElementCount::getFixed(1)).first; 6798 6799 // Compute the scalarization overhead of needed insertelement instructions 6800 // and phi nodes. 6801 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6802 ScalarCost += TTI.getScalarizationOverhead( 6803 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6804 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6805 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6806 ScalarCost += 6807 VF.getKnownMinValue() * 6808 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6809 } 6810 6811 // Compute the scalarization overhead of needed extractelement 6812 // instructions. For each of the instruction's operands, if the operand can 6813 // be scalarized, add it to the worklist; otherwise, account for the 6814 // overhead. 6815 for (Use &U : I->operands()) 6816 if (auto *J = dyn_cast<Instruction>(U.get())) { 6817 assert(VectorType::isValidElementType(J->getType()) && 6818 "Instruction has non-scalar type"); 6819 if (canBeScalarized(J)) 6820 Worklist.push_back(J); 6821 else if (needsExtract(J, VF)) { 6822 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6823 ScalarCost += TTI.getScalarizationOverhead( 6824 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6825 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6826 } 6827 } 6828 6829 // Scale the total scalar cost by block probability. 6830 ScalarCost /= getReciprocalPredBlockProb(); 6831 6832 // Compute the discount. A non-negative discount means the vector version 6833 // of the instruction costs more, and scalarizing would be beneficial. 6834 Discount += VectorCost - ScalarCost; 6835 ScalarCosts[I] = ScalarCost; 6836 } 6837 6838 return *Discount.getValue(); 6839 } 6840 6841 LoopVectorizationCostModel::VectorizationCostTy 6842 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6843 VectorizationCostTy Cost; 6844 6845 // For each block. 6846 for (BasicBlock *BB : TheLoop->blocks()) { 6847 VectorizationCostTy BlockCost; 6848 6849 // For each instruction in the old loop. 6850 for (Instruction &I : BB->instructionsWithoutDebug()) { 6851 // Skip ignored values. 6852 if (ValuesToIgnore.count(&I) || 6853 (VF.isVector() && VecValuesToIgnore.count(&I))) 6854 continue; 6855 6856 VectorizationCostTy C = getInstructionCost(&I, VF); 6857 6858 // Check if we should override the cost. 6859 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6860 C.first = InstructionCost(ForceTargetInstructionCost); 6861 6862 BlockCost.first += C.first; 6863 BlockCost.second |= C.second; 6864 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6865 << " for VF " << VF << " For instruction: " << I 6866 << '\n'); 6867 } 6868 6869 // If we are vectorizing a predicated block, it will have been 6870 // if-converted. This means that the block's instructions (aside from 6871 // stores and instructions that may divide by zero) will now be 6872 // unconditionally executed. For the scalar case, we may not always execute 6873 // the predicated block, if it is an if-else block. Thus, scale the block's 6874 // cost by the probability of executing it. 
blockNeedsPredication from 6875 // Legal is used so as to not include all blocks in tail folded loops. 6876 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6877 BlockCost.first /= getReciprocalPredBlockProb(); 6878 6879 Cost.first += BlockCost.first; 6880 Cost.second |= BlockCost.second; 6881 } 6882 6883 return Cost; 6884 } 6885 6886 /// Gets Address Access SCEV after verifying that the access pattern 6887 /// is loop invariant except the induction variable dependence. 6888 /// 6889 /// This SCEV can be sent to the Target in order to estimate the address 6890 /// calculation cost. 6891 static const SCEV *getAddressAccessSCEV( 6892 Value *Ptr, 6893 LoopVectorizationLegality *Legal, 6894 PredicatedScalarEvolution &PSE, 6895 const Loop *TheLoop) { 6896 6897 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6898 if (!Gep) 6899 return nullptr; 6900 6901 // We are looking for a gep with all loop invariant indices except for one 6902 // which should be an induction variable. 6903 auto SE = PSE.getSE(); 6904 unsigned NumOperands = Gep->getNumOperands(); 6905 for (unsigned i = 1; i < NumOperands; ++i) { 6906 Value *Opd = Gep->getOperand(i); 6907 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6908 !Legal->isInductionVariable(Opd)) 6909 return nullptr; 6910 } 6911 6912 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6913 return PSE.getSCEV(Ptr); 6914 } 6915 6916 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6917 return Legal->hasStride(I->getOperand(0)) || 6918 Legal->hasStride(I->getOperand(1)); 6919 } 6920 6921 InstructionCost 6922 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6923 ElementCount VF) { 6924 assert(VF.isVector() && 6925 "Scalarization cost of instruction implies vectorization."); 6926 if (VF.isScalable()) 6927 return InstructionCost::getInvalid(); 6928 6929 Type *ValTy = getMemInstValueType(I); 6930 auto SE = PSE.getSE(); 6931 6932 unsigned AS = getLoadStoreAddressSpace(I); 6933 Value *Ptr = getLoadStorePointerOperand(I); 6934 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6935 6936 // Figure out whether the access is strided and get the stride value 6937 // if it's known in compile time 6938 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6939 6940 // Get the cost of the scalar memory instruction and address computation. 6941 InstructionCost Cost = 6942 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6943 6944 // Don't pass *I here, since it is scalar but will actually be part of a 6945 // vectorized loop where the user of it is a vectorized instruction. 6946 const Align Alignment = getLoadStoreAlignment(I); 6947 Cost += VF.getKnownMinValue() * 6948 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6949 AS, TTI::TCK_RecipThroughput); 6950 6951 // Get the overhead of the extractelement and insertelement instructions 6952 // we might create due to scalarization. 6953 Cost += getScalarizationOverhead(I, VF); 6954 6955 // If we have a predicated load/store, it will need extra i1 extracts and 6956 // conditional branches, but may not be executed for each vector lane. Scale 6957 // the cost by the probability of executing the predicated block. 
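// For example, with getReciprocalPredBlockProb() currently returning 2 (the
// predicated block is assumed to execute on roughly every other iteration),
// a scalarized predicated load whose unconditional per-lane cost is C is
// accounted as about C/2 per lane, plus the i1 extract and branch overhead
// added below.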
6958 if (isPredicatedInst(I, ElementCount::getFixed(1))) { 6959 Cost /= getReciprocalPredBlockProb(); 6960 6961 // Add the cost of an i1 extract and a branch 6962 auto *Vec_i1Ty = 6963 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6964 Cost += TTI.getScalarizationOverhead( 6965 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6966 /*Insert=*/false, /*Extract=*/true); 6967 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6968 6969 if (useEmulatedMaskMemRefHack(I)) 6970 // Artificially setting to a high enough value to practically disable 6971 // vectorization with such operations. 6972 Cost = 3000000; 6973 } 6974 6975 return Cost; 6976 } 6977 6978 InstructionCost 6979 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6980 ElementCount VF) { 6981 Type *ValTy = getMemInstValueType(I); 6982 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6983 Value *Ptr = getLoadStorePointerOperand(I); 6984 unsigned AS = getLoadStoreAddressSpace(I); 6985 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6986 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6987 6988 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6989 "Stride should be 1 or -1 for consecutive memory access"); 6990 const Align Alignment = getLoadStoreAlignment(I); 6991 InstructionCost Cost = 0; 6992 if (Legal->isMaskRequired(I)) 6993 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6994 CostKind); 6995 else 6996 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6997 CostKind, I); 6998 6999 bool Reverse = ConsecutiveStride < 0; 7000 if (Reverse) 7001 Cost += 7002 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7003 return Cost; 7004 } 7005 7006 InstructionCost 7007 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 7008 ElementCount VF) { 7009 assert(Legal->isUniformMemOp(*I)); 7010 7011 Type *ValTy = getMemInstValueType(I); 7012 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7013 const Align Alignment = getLoadStoreAlignment(I); 7014 unsigned AS = getLoadStoreAddressSpace(I); 7015 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7016 if (isa<LoadInst>(I)) { 7017 return TTI.getAddressComputationCost(ValTy) + 7018 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7019 CostKind) + 7020 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7021 } 7022 StoreInst *SI = cast<StoreInst>(I); 7023 7024 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7025 return TTI.getAddressComputationCost(ValTy) + 7026 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7027 CostKind) + 7028 (isLoopInvariantStoreValue 7029 ? 
0 7030 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7031 VF.getKnownMinValue() - 1)); 7032 } 7033 7034 InstructionCost 7035 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7036 ElementCount VF) { 7037 Type *ValTy = getMemInstValueType(I); 7038 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7039 const Align Alignment = getLoadStoreAlignment(I); 7040 const Value *Ptr = getLoadStorePointerOperand(I); 7041 7042 return TTI.getAddressComputationCost(VectorTy) + 7043 TTI.getGatherScatterOpCost( 7044 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7045 TargetTransformInfo::TCK_RecipThroughput, I); 7046 } 7047 7048 InstructionCost 7049 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7050 ElementCount VF) { 7051 // TODO: Once we have support for interleaving with scalable vectors 7052 // we can calculate the cost properly here. 7053 if (VF.isScalable()) 7054 return InstructionCost::getInvalid(); 7055 7056 Type *ValTy = getMemInstValueType(I); 7057 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7058 unsigned AS = getLoadStoreAddressSpace(I); 7059 7060 auto Group = getInterleavedAccessGroup(I); 7061 assert(Group && "Fail to get an interleaved access group."); 7062 7063 unsigned InterleaveFactor = Group->getFactor(); 7064 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7065 7066 // Holds the indices of existing members in an interleaved load group. 7067 // An interleaved store group doesn't need this as it doesn't allow gaps. 7068 SmallVector<unsigned, 4> Indices; 7069 if (isa<LoadInst>(I)) { 7070 for (unsigned i = 0; i < InterleaveFactor; i++) 7071 if (Group->getMember(i)) 7072 Indices.push_back(i); 7073 } 7074 7075 // Calculate the cost of the whole interleaved group. 7076 bool UseMaskForGaps = 7077 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 7078 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7079 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7080 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7081 7082 if (Group->isReverse()) { 7083 // TODO: Add support for reversed masked interleaved access. 7084 assert(!Legal->isMaskRequired(I) && 7085 "Reverse masked interleaved access not supported."); 7086 Cost += 7087 Group->getNumMembers() * 7088 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7089 } 7090 return Cost; 7091 } 7092 7093 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 7094 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7095 // Early exit for no inloop reductions 7096 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7097 return InstructionCost::getInvalid(); 7098 auto *VectorTy = cast<VectorType>(Ty); 7099 7100 // We are looking for a pattern of, and finding the minimal acceptable cost: 7101 // reduce(mul(ext(A), ext(B))) or 7102 // reduce(mul(A, B)) or 7103 // reduce(ext(A)) or 7104 // reduce(A). 7105 // The basic idea is that we walk down the tree to do that, finding the root 7106 // reduction instruction in InLoopReductionImmediateChains. From there we find 7107 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7108 // of the components. If the reduction cost is lower, then we return it for the 7109 // reduction instruction and 0 for the other instructions in the pattern. If 7110 // it is not, we return an invalid cost specifying the original cost method 7111 // should be used.
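// Illustrative scalar IR for the reduce(mul(ext(A), ext(B))) shape matched
// below, assuming an in-loop integer add reduction:
//   %a.ext = sext i8 %a to i32
//   %b.ext = sext i8 %b to i32
//   %mul   = mul nsw i32 %a.ext, %b.ext
//   %acc   = add i32 %acc.phi, %mul   ; reduction root in the chain
// If TTI reports a cheaper extended-add reduction for this shape, the whole
// chain is costed once at the reduction root and the ext/mul instructions
// are given a cost of 0 below.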
7112 Instruction *RetI = I; 7113 if ((RetI->getOpcode() == Instruction::SExt || 7114 RetI->getOpcode() == Instruction::ZExt)) { 7115 if (!RetI->hasOneUser()) 7116 return InstructionCost::getInvalid(); 7117 RetI = RetI->user_back(); 7118 } 7119 if (RetI->getOpcode() == Instruction::Mul && 7120 RetI->user_back()->getOpcode() == Instruction::Add) { 7121 if (!RetI->hasOneUser()) 7122 return InstructionCost::getInvalid(); 7123 RetI = RetI->user_back(); 7124 } 7125 7126 // Test if the found instruction is a reduction, and if not return an invalid 7127 // cost specifying the parent to use the original cost modelling. 7128 if (!InLoopReductionImmediateChains.count(RetI)) 7129 return InstructionCost::getInvalid(); 7130 7131 // Find the reduction this chain is a part of and calculate the basic cost of 7132 // the reduction on its own. 7133 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7134 Instruction *ReductionPhi = LastChain; 7135 while (!isa<PHINode>(ReductionPhi)) 7136 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7137 7138 RecurrenceDescriptor RdxDesc = 7139 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7140 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7141 RdxDesc.getOpcode(), VectorTy, false, CostKind); 7142 7143 // Get the operand that was not the reduction chain and match it to one of the 7144 // patterns, returning the better cost if it is found. 7145 Instruction *RedOp = RetI->getOperand(1) == LastChain 7146 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7147 : dyn_cast<Instruction>(RetI->getOperand(1)); 7148 7149 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7150 7151 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 7152 !TheLoop->isLoopInvariant(RedOp)) { 7153 bool IsUnsigned = isa<ZExtInst>(RedOp); 7154 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7155 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7156 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7157 CostKind); 7158 7159 InstructionCost ExtCost = 7160 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7161 TTI::CastContextHint::None, CostKind, RedOp); 7162 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7163 return I == RetI ? *RedCost.getValue() : 0; 7164 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7165 Instruction *Mul = RedOp; 7166 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7167 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7168 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7169 Op0->getOpcode() == Op1->getOpcode() && 7170 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7171 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7172 bool IsUnsigned = isa<ZExtInst>(Op0); 7173 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7174 // reduce(mul(ext, ext)) 7175 InstructionCost ExtCost = 7176 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7177 TTI::CastContextHint::None, CostKind, Op0); 7178 InstructionCost MulCost = 7179 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7180 7181 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7182 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7183 CostKind); 7184 7185 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7186 return I == RetI ? 
*RedCost.getValue() : 0; 7187 } else { 7188 InstructionCost MulCost = 7189 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7190 7191 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7192 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7193 CostKind); 7194 7195 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7196 return I == RetI ? *RedCost.getValue() : 0; 7197 } 7198 } 7199 7200 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7201 } 7202 7203 InstructionCost 7204 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7205 ElementCount VF) { 7206 // Calculate scalar cost only. Vectorization cost should be ready at this 7207 // moment. 7208 if (VF.isScalar()) { 7209 Type *ValTy = getMemInstValueType(I); 7210 const Align Alignment = getLoadStoreAlignment(I); 7211 unsigned AS = getLoadStoreAddressSpace(I); 7212 7213 return TTI.getAddressComputationCost(ValTy) + 7214 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7215 TTI::TCK_RecipThroughput, I); 7216 } 7217 return getWideningCost(I, VF); 7218 } 7219 7220 LoopVectorizationCostModel::VectorizationCostTy 7221 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7222 ElementCount VF) { 7223 // If we know that this instruction will remain uniform, check the cost of 7224 // the scalar version. 7225 if (isUniformAfterVectorization(I, VF)) 7226 VF = ElementCount::getFixed(1); 7227 7228 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7229 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7230 7231 // Forced scalars do not have any scalarization overhead. 7232 auto ForcedScalar = ForcedScalars.find(VF); 7233 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7234 auto InstSet = ForcedScalar->second; 7235 if (InstSet.count(I)) 7236 return VectorizationCostTy( 7237 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7238 VF.getKnownMinValue()), 7239 false); 7240 } 7241 7242 Type *VectorTy; 7243 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7244 7245 bool TypeNotScalarized = 7246 VF.isVector() && VectorTy->isVectorTy() && 7247 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7248 return VectorizationCostTy(C, TypeNotScalarized); 7249 } 7250 7251 InstructionCost 7252 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7253 ElementCount VF) const { 7254 7255 if (VF.isScalable()) 7256 return InstructionCost::getInvalid(); 7257 7258 if (VF.isScalar()) 7259 return 0; 7260 7261 InstructionCost Cost = 0; 7262 Type *RetTy = ToVectorTy(I->getType(), VF); 7263 if (!RetTy->isVoidTy() && 7264 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7265 Cost += TTI.getScalarizationOverhead( 7266 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7267 true, false); 7268 7269 // Some targets keep addresses scalar. 7270 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7271 return Cost; 7272 7273 // Some targets support efficient element stores. 7274 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7275 return Cost; 7276 7277 // Collect operands to consider. 7278 CallInst *CI = dyn_cast<CallInst>(I); 7279 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7280 7281 // Skip operands that do not require extraction/scalarization and do not incur 7282 // any overhead. 
7283 SmallVector<Type *> Tys; 7284 for (auto *V : filterExtractingOperands(Ops, VF)) 7285 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7286 return Cost + TTI.getOperandsScalarizationOverhead( 7287 filterExtractingOperands(Ops, VF), Tys); 7288 } 7289 7290 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7291 if (VF.isScalar()) 7292 return; 7293 NumPredStores = 0; 7294 for (BasicBlock *BB : TheLoop->blocks()) { 7295 // For each instruction in the old loop. 7296 for (Instruction &I : *BB) { 7297 Value *Ptr = getLoadStorePointerOperand(&I); 7298 if (!Ptr) 7299 continue; 7300 7301 // TODO: We should generate better code and update the cost model for 7302 // predicated uniform stores. Today they are treated as any other 7303 // predicated store (see added test cases in 7304 // invariant-store-vectorization.ll). 7305 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7306 NumPredStores++; 7307 7308 if (Legal->isUniformMemOp(I)) { 7309 // TODO: Avoid replicating loads and stores instead of 7310 // relying on instcombine to remove them. 7311 // Load: Scalar load + broadcast 7312 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7313 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7314 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7315 continue; 7316 } 7317 7318 // We assume that widening is the best solution when possible. 7319 if (memoryInstructionCanBeWidened(&I, VF)) { 7320 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7321 int ConsecutiveStride = 7322 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7323 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7324 "Expected consecutive stride."); 7325 InstWidening Decision = 7326 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7327 setWideningDecision(&I, VF, Decision, Cost); 7328 continue; 7329 } 7330 7331 // Choose between Interleaving, Gather/Scatter or Scalarization. 7332 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7333 unsigned NumAccesses = 1; 7334 if (isAccessInterleaved(&I)) { 7335 auto Group = getInterleavedAccessGroup(&I); 7336 assert(Group && "Fail to get an interleaved access group."); 7337 7338 // Make one decision for the whole group. 7339 if (getWideningDecision(&I, VF) != CM_Unknown) 7340 continue; 7341 7342 NumAccesses = Group->getNumMembers(); 7343 if (interleavedAccessCanBeWidened(&I, VF)) 7344 InterleaveCost = getInterleaveGroupCost(&I, VF); 7345 } 7346 7347 InstructionCost GatherScatterCost = 7348 isLegalGatherOrScatter(&I) 7349 ? getGatherScatterCost(&I, VF) * NumAccesses 7350 : InstructionCost::getInvalid(); 7351 7352 InstructionCost ScalarizationCost = 7353 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7354 7355 // Choose better solution for the current VF, 7356 // write down this decision and use it during vectorization. 7357 InstructionCost Cost; 7358 InstWidening Decision; 7359 if (InterleaveCost <= GatherScatterCost && 7360 InterleaveCost < ScalarizationCost) { 7361 Decision = CM_Interleave; 7362 Cost = InterleaveCost; 7363 } else if (GatherScatterCost < ScalarizationCost) { 7364 Decision = CM_GatherScatter; 7365 Cost = GatherScatterCost; 7366 } else { 7367 assert(!VF.isScalable() && 7368 "We cannot yet scalarise for scalable vectors"); 7369 Decision = CM_Scalarize; 7370 Cost = ScalarizationCost; 7371 } 7372 // If the instructions belongs to an interleave group, the whole group 7373 // receives the same decision. 
The whole group receives the cost, but 7374 // the cost will actually be assigned to one instruction. 7375 if (auto Group = getInterleavedAccessGroup(&I)) 7376 setWideningDecision(Group, VF, Decision, Cost); 7377 else 7378 setWideningDecision(&I, VF, Decision, Cost); 7379 } 7380 } 7381 7382 // Make sure that any load of address and any other address computation 7383 // remains scalar unless there is gather/scatter support. This avoids 7384 // inevitable extracts into address registers, and also has the benefit of 7385 // activating LSR more, since that pass can't optimize vectorized 7386 // addresses. 7387 if (TTI.prefersVectorizedAddressing()) 7388 return; 7389 7390 // Start with all scalar pointer uses. 7391 SmallPtrSet<Instruction *, 8> AddrDefs; 7392 for (BasicBlock *BB : TheLoop->blocks()) 7393 for (Instruction &I : *BB) { 7394 Instruction *PtrDef = 7395 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7396 if (PtrDef && TheLoop->contains(PtrDef) && 7397 getWideningDecision(&I, VF) != CM_GatherScatter) 7398 AddrDefs.insert(PtrDef); 7399 } 7400 7401 // Add all instructions used to generate the addresses. 7402 SmallVector<Instruction *, 4> Worklist; 7403 append_range(Worklist, AddrDefs); 7404 while (!Worklist.empty()) { 7405 Instruction *I = Worklist.pop_back_val(); 7406 for (auto &Op : I->operands()) 7407 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7408 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7409 AddrDefs.insert(InstOp).second) 7410 Worklist.push_back(InstOp); 7411 } 7412 7413 for (auto *I : AddrDefs) { 7414 if (isa<LoadInst>(I)) { 7415 // Setting the desired widening decision should ideally be handled in 7416 // by cost functions, but since this involves the task of finding out 7417 // if the loaded register is involved in an address computation, it is 7418 // instead changed here when we know this is the case. 7419 InstWidening Decision = getWideningDecision(I, VF); 7420 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7421 // Scalarize a widened load of address. 7422 setWideningDecision( 7423 I, VF, CM_Scalarize, 7424 (VF.getKnownMinValue() * 7425 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7426 else if (auto Group = getInterleavedAccessGroup(I)) { 7427 // Scalarize an interleave group of address loads. 7428 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7429 if (Instruction *Member = Group->getMember(I)) 7430 setWideningDecision( 7431 Member, VF, CM_Scalarize, 7432 (VF.getKnownMinValue() * 7433 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7434 } 7435 } 7436 } else 7437 // Make sure I gets scalarized and a cost estimate without 7438 // scalarization overhead. 
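// Instructions forced scalar here are costed later in getInstructionCost as
// VF copies of the scalar instruction (the VF=1 cost multiplied by VF), with
// no insert/extract overhead added.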
7439 ForcedScalars[VF].insert(I); 7440 } 7441 } 7442 7443 InstructionCost 7444 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7445 Type *&VectorTy) { 7446 Type *RetTy = I->getType(); 7447 if (canTruncateToMinimalBitwidth(I, VF)) 7448 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7449 auto SE = PSE.getSE(); 7450 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7451 7452 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7453 ElementCount VF) -> bool { 7454 if (VF.isScalar()) 7455 return true; 7456 7457 auto Scalarized = InstsToScalarize.find(VF); 7458 assert(Scalarized != InstsToScalarize.end() && 7459 "VF not yet analyzed for scalarization profitability"); 7460 return !Scalarized->second.count(I) && 7461 llvm::all_of(I->users(), [&](User *U) { 7462 auto *UI = cast<Instruction>(U); 7463 return !Scalarized->second.count(UI); 7464 }); 7465 }; 7466 (void) hasSingleCopyAfterVectorization; 7467 7468 if (isScalarAfterVectorization(I, VF)) { 7469 // With the exception of GEPs and PHIs, after scalarization there should 7470 // only be one copy of the instruction generated in the loop. This is 7471 // because the VF is either 1, or any instructions that need scalarizing 7472 // have already been dealt with by the the time we get here. As a result, 7473 // it means we don't have to multiply the instruction cost by VF. 7474 assert(I->getOpcode() == Instruction::GetElementPtr || 7475 I->getOpcode() == Instruction::PHI || 7476 (I->getOpcode() == Instruction::BitCast && 7477 I->getType()->isPointerTy()) || 7478 hasSingleCopyAfterVectorization(I, VF)); 7479 VectorTy = RetTy; 7480 } else 7481 VectorTy = ToVectorTy(RetTy, VF); 7482 7483 // TODO: We need to estimate the cost of intrinsic calls. 7484 switch (I->getOpcode()) { 7485 case Instruction::GetElementPtr: 7486 // We mark this instruction as zero-cost because the cost of GEPs in 7487 // vectorized code depends on whether the corresponding memory instruction 7488 // is scalarized or not. Therefore, we handle GEPs with the memory 7489 // instruction cost. 7490 return 0; 7491 case Instruction::Br: { 7492 // In cases of scalarized and predicated instructions, there will be VF 7493 // predicated blocks in the vectorized loop. Each branch around these 7494 // blocks requires also an extract of its vector compare i1 element. 7495 bool ScalarPredicatedBB = false; 7496 BranchInst *BI = cast<BranchInst>(I); 7497 if (VF.isVector() && BI->isConditional() && 7498 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7499 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7500 ScalarPredicatedBB = true; 7501 7502 if (ScalarPredicatedBB) { 7503 // Return cost for branches around scalarized and predicated blocks. 7504 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7505 auto *Vec_i1Ty = 7506 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7507 return (TTI.getScalarizationOverhead( 7508 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7509 false, true) + 7510 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7511 VF.getKnownMinValue())); 7512 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7513 // The back-edge branch will remain, as will all scalar branches. 7514 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7515 else 7516 // This branch will be eliminated by if-conversion. 
7517 return 0; 7518 // Note: We currently assume zero cost for an unconditional branch inside 7519 // a predicated block since it will become a fall-through, although we 7520 // may decide in the future to call TTI for all branches. 7521 } 7522 case Instruction::PHI: { 7523 auto *Phi = cast<PHINode>(I); 7524 7525 // First-order recurrences are replaced by vector shuffles inside the loop. 7526 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7527 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7528 return TTI.getShuffleCost( 7529 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7530 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7531 7532 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7533 // converted into select instructions. We require N - 1 selects per phi 7534 // node, where N is the number of incoming values. 7535 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7536 return (Phi->getNumIncomingValues() - 1) * 7537 TTI.getCmpSelInstrCost( 7538 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7539 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7540 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7541 7542 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7543 } 7544 case Instruction::UDiv: 7545 case Instruction::SDiv: 7546 case Instruction::URem: 7547 case Instruction::SRem: 7548 // If we have a predicated instruction, it may not be executed for each 7549 // vector lane. Get the scalarization cost and scale this amount by the 7550 // probability of executing the predicated block. If the instruction is not 7551 // predicated, we fall through to the next case. 7552 if (VF.isVector() && isScalarWithPredication(I)) { 7553 InstructionCost Cost = 0; 7554 7555 // These instructions have a non-void type, so account for the phi nodes 7556 // that we will create. This cost is likely to be zero. The phi node 7557 // cost, if any, should be scaled by the block probability because it 7558 // models a copy at the end of each predicated block. 7559 Cost += VF.getKnownMinValue() * 7560 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7561 7562 // The cost of the non-predicated instruction. 7563 Cost += VF.getKnownMinValue() * 7564 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7565 7566 // The cost of insertelement and extractelement instructions needed for 7567 // scalarization. 7568 Cost += getScalarizationOverhead(I, VF); 7569 7570 // Scale the cost by the probability of executing the predicated blocks. 7571 // This assumes the predicated block for each vector lane is equally 7572 // likely. 7573 return Cost / getReciprocalPredBlockProb(); 7574 } 7575 LLVM_FALLTHROUGH; 7576 case Instruction::Add: 7577 case Instruction::FAdd: 7578 case Instruction::Sub: 7579 case Instruction::FSub: 7580 case Instruction::Mul: 7581 case Instruction::FMul: 7582 case Instruction::FDiv: 7583 case Instruction::FRem: 7584 case Instruction::Shl: 7585 case Instruction::LShr: 7586 case Instruction::AShr: 7587 case Instruction::And: 7588 case Instruction::Or: 7589 case Instruction::Xor: { 7590 // Since we will replace the stride by 1 the multiplication should go away. 
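// For example, for an access like A[i * Stride] where Stride is one of the
// symbolic strides that loop versioning will specialize to 1, the 'mul'
// feeding the address computation disappears in the versioned loop, so it
// is treated as free here.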
7591 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7592 return 0; 7593 7594 // Detect reduction patterns 7595 InstructionCost RedCost; 7596 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7597 .isValid()) 7598 return RedCost; 7599 7600 // Certain instructions can be cheaper to vectorize if they have a constant 7601 // second vector operand. One example of this are shifts on x86. 7602 Value *Op2 = I->getOperand(1); 7603 TargetTransformInfo::OperandValueProperties Op2VP; 7604 TargetTransformInfo::OperandValueKind Op2VK = 7605 TTI.getOperandInfo(Op2, Op2VP); 7606 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7607 Op2VK = TargetTransformInfo::OK_UniformValue; 7608 7609 SmallVector<const Value *, 4> Operands(I->operand_values()); 7610 return TTI.getArithmeticInstrCost( 7611 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7612 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7613 } 7614 case Instruction::FNeg: { 7615 return TTI.getArithmeticInstrCost( 7616 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7617 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7618 TargetTransformInfo::OP_None, I->getOperand(0), I); 7619 } 7620 case Instruction::Select: { 7621 SelectInst *SI = cast<SelectInst>(I); 7622 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7623 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7624 7625 const Value *Op0, *Op1; 7626 using namespace llvm::PatternMatch; 7627 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7628 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7629 // select x, y, false --> x & y 7630 // select x, true, y --> x | y 7631 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7632 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7633 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7634 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7635 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7636 Op1->getType()->getScalarSizeInBits() == 1); 7637 7638 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7639 return TTI.getArithmeticInstrCost( 7640 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7641 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7642 } 7643 7644 Type *CondTy = SI->getCondition()->getType(); 7645 if (!ScalarCond) 7646 CondTy = VectorType::get(CondTy, VF); 7647 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7648 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7649 } 7650 case Instruction::ICmp: 7651 case Instruction::FCmp: { 7652 Type *ValTy = I->getOperand(0)->getType(); 7653 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7654 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7655 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7656 VectorTy = ToVectorTy(ValTy, VF); 7657 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7658 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7659 } 7660 case Instruction::Store: 7661 case Instruction::Load: { 7662 ElementCount Width = VF; 7663 if (Width.isVector()) { 7664 InstWidening Decision = getWideningDecision(I, Width); 7665 assert(Decision != CM_Unknown && 7666 "CM decision should be taken at this point"); 7667 if (Decision == CM_Scalarize) 7668 Width = ElementCount::getFixed(1); 7669 } 7670 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7671 return getMemoryInstructionCost(I, VF); 7672 } 7673 case Instruction::BitCast: 7674 if (I->getType()->isPointerTy()) 7675 return 0; 7676 LLVM_FALLTHROUGH; 7677 case Instruction::ZExt: 7678 case Instruction::SExt: 7679 case Instruction::FPToUI: 7680 case Instruction::FPToSI: 7681 case Instruction::FPExt: 7682 case Instruction::PtrToInt: 7683 case Instruction::IntToPtr: 7684 case Instruction::SIToFP: 7685 case Instruction::UIToFP: 7686 case Instruction::Trunc: 7687 case Instruction::FPTrunc: { 7688 // Computes the CastContextHint from a Load/Store instruction. 7689 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7690 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7691 "Expected a load or a store!"); 7692 7693 if (VF.isScalar() || !TheLoop->contains(I)) 7694 return TTI::CastContextHint::Normal; 7695 7696 switch (getWideningDecision(I, VF)) { 7697 case LoopVectorizationCostModel::CM_GatherScatter: 7698 return TTI::CastContextHint::GatherScatter; 7699 case LoopVectorizationCostModel::CM_Interleave: 7700 return TTI::CastContextHint::Interleave; 7701 case LoopVectorizationCostModel::CM_Scalarize: 7702 case LoopVectorizationCostModel::CM_Widen: 7703 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7704 : TTI::CastContextHint::Normal; 7705 case LoopVectorizationCostModel::CM_Widen_Reverse: 7706 return TTI::CastContextHint::Reversed; 7707 case LoopVectorizationCostModel::CM_Unknown: 7708 llvm_unreachable("Instr did not go through cost modelling?"); 7709 } 7710 7711 llvm_unreachable("Unhandled case!"); 7712 }; 7713 7714 unsigned Opcode = I->getOpcode(); 7715 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7716 // For Trunc, the context is the only user, which must be a StoreInst. 7717 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7718 if (I->hasOneUse()) 7719 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7720 CCH = ComputeCCH(Store); 7721 } 7722 // For Z/Sext, the context is the operand, which must be a LoadInst. 
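// For example, a zext fed by a widened load receives a hint describing how
// that load will be emitted (normal, masked, reversed, gather or
// interleaved), which lets targets that can fold the extension into an
// extending load cost the cast accordingly, often as free.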
7723 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7724 Opcode == Instruction::FPExt) { 7725 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7726 CCH = ComputeCCH(Load); 7727 } 7728 7729 // We optimize the truncation of induction variables having constant 7730 // integer steps. The cost of these truncations is the same as the scalar 7731 // operation. 7732 if (isOptimizableIVTruncate(I, VF)) { 7733 auto *Trunc = cast<TruncInst>(I); 7734 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7735 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7736 } 7737 7738 // Detect reduction patterns 7739 InstructionCost RedCost; 7740 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7741 .isValid()) 7742 return RedCost; 7743 7744 Type *SrcScalarTy = I->getOperand(0)->getType(); 7745 Type *SrcVecTy = 7746 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7747 if (canTruncateToMinimalBitwidth(I, VF)) { 7748 // This cast is going to be shrunk. This may remove the cast or it might 7749 // turn it into slightly different cast. For example, if MinBW == 16, 7750 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7751 // 7752 // Calculate the modified src and dest types. 7753 Type *MinVecTy = VectorTy; 7754 if (Opcode == Instruction::Trunc) { 7755 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7756 VectorTy = 7757 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7758 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7759 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7760 VectorTy = 7761 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7762 } 7763 } 7764 7765 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7766 } 7767 case Instruction::Call: { 7768 bool NeedToScalarize; 7769 CallInst *CI = cast<CallInst>(I); 7770 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7771 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7772 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7773 return std::min(CallCost, IntrinsicCost); 7774 } 7775 return CallCost; 7776 } 7777 case Instruction::ExtractValue: 7778 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7779 default: 7780 // This opcode is unknown. Assume that it is the same as 'mul'. 7781 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7782 } // end of switch. 
7783 } 7784 7785 char LoopVectorize::ID = 0; 7786 7787 static const char lv_name[] = "Loop Vectorization"; 7788 7789 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7790 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7791 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7792 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7793 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7794 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7795 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7796 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7797 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7798 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7799 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7800 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7801 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7802 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7803 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7804 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7805 7806 namespace llvm { 7807 7808 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7809 7810 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7811 bool VectorizeOnlyWhenForced) { 7812 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7813 } 7814 7815 } // end namespace llvm 7816 7817 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7818 // Check if the pointer operand of a load or store instruction is 7819 // consecutive. 7820 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7821 return Legal->isConsecutivePtr(Ptr); 7822 return false; 7823 } 7824 7825 void LoopVectorizationCostModel::collectValuesToIgnore() { 7826 // Ignore ephemeral values. 7827 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7828 7829 // Ignore type-promoting instructions we identified during reduction 7830 // detection. 7831 for (auto &Reduction : Legal->getReductionVars()) { 7832 RecurrenceDescriptor &RedDes = Reduction.second; 7833 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7834 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7835 } 7836 // Ignore type-casting instructions we identified during induction 7837 // detection. 7838 for (auto &Induction : Legal->getInductionVars()) { 7839 InductionDescriptor &IndDes = Induction.second; 7840 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7841 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7842 } 7843 } 7844 7845 void LoopVectorizationCostModel::collectInLoopReductions() { 7846 for (auto &Reduction : Legal->getReductionVars()) { 7847 PHINode *Phi = Reduction.first; 7848 RecurrenceDescriptor &RdxDesc = Reduction.second; 7849 7850 // We don't collect reductions that are type promoted (yet). 7851 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7852 continue; 7853 7854 // If the target would prefer this reduction to happen "in-loop", then we 7855 // want to record it as such. 7856 unsigned Opcode = RdxDesc.getOpcode(); 7857 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7858 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7859 TargetTransformInfo::ReductionFlags())) 7860 continue; 7861 7862 // Check that we can correctly put the reductions into the loop, by 7863 // finding the chain of operations that leads from the phi to the loop 7864 // exit value. 
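// For a simple in-loop integer add reduction such as:
//   %acc.phi  = phi i32 [ 0, %preheader ], [ %acc.next, %latch ]
//   %acc.next = add i32 %acc.phi, %val
// getReductionOpChain returns { %acc.next }. If no such simple chain from
// the phi to the exit value can be found, it returns an empty list and the
// reduction is left as an ordinary out-of-loop reduction.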
7865 SmallVector<Instruction *, 4> ReductionOperations = 7866 RdxDesc.getReductionOpChain(Phi, TheLoop); 7867 bool InLoop = !ReductionOperations.empty(); 7868 if (InLoop) { 7869 InLoopReductionChains[Phi] = ReductionOperations; 7870 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7871 Instruction *LastChain = Phi; 7872 for (auto *I : ReductionOperations) { 7873 InLoopReductionImmediateChains[I] = LastChain; 7874 LastChain = I; 7875 } 7876 } 7877 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7878 << " reduction for phi: " << *Phi << "\n"); 7879 } 7880 } 7881 7882 // TODO: we could return a pair of values that specify the max VF and 7883 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7884 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7885 // doesn't have a cost model that can choose which plan to execute if 7886 // more than one is generated. 7887 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7888 LoopVectorizationCostModel &CM) { 7889 unsigned WidestType; 7890 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7891 return WidestVectorRegBits / WidestType; 7892 } 7893 7894 VectorizationFactor 7895 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7896 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7897 ElementCount VF = UserVF; 7898 // Outer loop handling: They may require CFG and instruction level 7899 // transformations before even evaluating whether vectorization is profitable. 7900 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7901 // the vectorization pipeline. 7902 if (!OrigLoop->isInnermost()) { 7903 // If the user doesn't provide a vectorization factor, determine a 7904 // reasonable one. 7905 if (UserVF.isZero()) { 7906 VF = ElementCount::getFixed(determineVPlanVF( 7907 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7908 .getFixedSize(), 7909 CM)); 7910 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7911 7912 // Make sure we have a VF > 1 for stress testing. 7913 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7914 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7915 << "overriding computed VF.\n"); 7916 VF = ElementCount::getFixed(4); 7917 } 7918 } 7919 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7920 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7921 "VF needs to be a power of two"); 7922 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7923 << "VF " << VF << " to build VPlans.\n"); 7924 buildVPlans(VF, VF); 7925 7926 // For VPlan build stress testing, we bail out after VPlan construction. 7927 if (VPlanBuildStressTest) 7928 return VectorizationFactor::Disabled(); 7929 7930 return {VF, 0 /*Cost*/}; 7931 } 7932 7933 LLVM_DEBUG( 7934 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7935 "VPlan-native path.\n"); 7936 return VectorizationFactor::Disabled(); 7937 } 7938 7939 Optional<VectorizationFactor> 7940 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7941 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7942 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7943 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7944 return None; 7945 7946 // Invalidate interleave groups if all blocks of loop will be predicated. 
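// Folding the tail by masking predicates every block, including the header,
// so interleave groups can only be kept if the target supports masked
// interleaved memory operations.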
7947 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7948 !useMaskedInterleavedAccesses(*TTI)) { 7949 LLVM_DEBUG( 7950 dbgs() 7951 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7952 "which requires masked-interleaved support.\n"); 7953 if (CM.InterleaveInfo.invalidateGroups()) 7954 // Invalidating interleave groups also requires invalidating all decisions 7955 // based on them, which includes widening decisions and uniform and scalar 7956 // values. 7957 CM.invalidateCostModelingDecisions(); 7958 } 7959 7960 ElementCount MaxVF = MaybeMaxVF.getValue(); 7961 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7962 7963 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7964 if (!UserVF.isZero() && 7965 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7966 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7967 // VFs here, this should be reverted to only use legal UserVFs once the 7968 // loop below supports scalable VFs. 7969 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7970 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7971 << " VF " << VF << ".\n"); 7972 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7973 "VF needs to be a power of two"); 7974 // Collect the instructions (and their associated costs) that will be more 7975 // profitable to scalarize. 7976 CM.selectUserVectorizationFactor(VF); 7977 CM.collectInLoopReductions(); 7978 buildVPlansWithVPRecipes(VF, VF); 7979 LLVM_DEBUG(printPlans(dbgs())); 7980 return {{VF, 0}}; 7981 } 7982 7983 assert(!MaxVF.isScalable() && 7984 "Scalable vectors not yet supported beyond this point"); 7985 7986 for (ElementCount VF = ElementCount::getFixed(1); 7987 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7988 // Collect Uniform and Scalar instructions after vectorization with VF. 7989 CM.collectUniformsAndScalars(VF); 7990 7991 // Collect the instructions (and their associated costs) that will be more 7992 // profitable to scalarize. 7993 if (VF.isVector()) 7994 CM.collectInstsToScalarize(VF); 7995 } 7996 7997 CM.collectInLoopReductions(); 7998 7999 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 8000 LLVM_DEBUG(printPlans(dbgs())); 8001 if (MaxVF.isScalar()) 8002 return VectorizationFactor::Disabled(); 8003 8004 // Select the optimal vectorization factor. 8005 auto SelectedVF = CM.selectVectorizationFactor(MaxVF); 8006 8007 // Check if it is profitable to vectorize with runtime checks. 
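// Exceeding the pragma threshold always aborts vectorization here; exceeding
// the default runtime memory-check threshold aborts only when the hints do
// not explicitly allow reordering of memory operations.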
8008 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 8009 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 8010 bool PragmaThresholdReached = 8011 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8012 bool ThresholdReached = 8013 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8014 if ((ThresholdReached && !Hints.allowReordering()) || 8015 PragmaThresholdReached) { 8016 ORE->emit([&]() { 8017 return OptimizationRemarkAnalysisAliasing( 8018 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8019 OrigLoop->getHeader()) 8020 << "loop not vectorized: cannot prove it is safe to reorder " 8021 "memory operations"; 8022 }); 8023 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8024 Hints.emitRemarkWithHints(); 8025 return VectorizationFactor::Disabled(); 8026 } 8027 } 8028 return SelectedVF; 8029 } 8030 8031 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 8032 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 8033 << '\n'); 8034 BestVF = VF; 8035 BestUF = UF; 8036 8037 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 8038 return !Plan->hasVF(VF); 8039 }); 8040 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 8041 } 8042 8043 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 8044 DominatorTree *DT) { 8045 // Perform the actual loop transformation. 8046 8047 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8048 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 8049 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 8050 8051 VPTransformState State{ 8052 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 8053 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8054 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8055 State.CanonicalIV = ILV.Induction; 8056 8057 ILV.printDebugTracesAtStart(); 8058 8059 //===------------------------------------------------===// 8060 // 8061 // Notice: any optimization or new instruction that go 8062 // into the code below should also be implemented in 8063 // the cost-model. 8064 // 8065 //===------------------------------------------------===// 8066 8067 // 2. Copy and widen instructions from the old loop into the new loop. 8068 VPlans.front()->execute(&State); 8069 8070 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8071 // predication, updating analyses. 
8072 ILV.fixVectorizedLoop(State); 8073 8074 ILV.printDebugTracesAtEnd(); 8075 } 8076 8077 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8078 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8079 for (const auto &Plan : VPlans) 8080 if (PrintVPlansInDotFormat) 8081 Plan->printDOT(O); 8082 else 8083 Plan->print(O); 8084 } 8085 #endif 8086 8087 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8088 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8089 8090 // We create new control-flow for the vectorized loop, so the original exit 8091 // conditions will be dead after vectorization if it's only used by the 8092 // terminator 8093 SmallVector<BasicBlock*> ExitingBlocks; 8094 OrigLoop->getExitingBlocks(ExitingBlocks); 8095 for (auto *BB : ExitingBlocks) { 8096 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8097 if (!Cmp || !Cmp->hasOneUse()) 8098 continue; 8099 8100 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8101 if (!DeadInstructions.insert(Cmp).second) 8102 continue; 8103 8104 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8105 // TODO: can recurse through operands in general 8106 for (Value *Op : Cmp->operands()) { 8107 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8108 DeadInstructions.insert(cast<Instruction>(Op)); 8109 } 8110 } 8111 8112 // We create new "steps" for induction variable updates to which the original 8113 // induction variables map. An original update instruction will be dead if 8114 // all its users except the induction variable are dead. 8115 auto *Latch = OrigLoop->getLoopLatch(); 8116 for (auto &Induction : Legal->getInductionVars()) { 8117 PHINode *Ind = Induction.first; 8118 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8119 8120 // If the tail is to be folded by masking, the primary induction variable, 8121 // if exists, isn't dead: it will be used for masking. Don't kill it. 8122 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8123 continue; 8124 8125 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8126 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8127 })) 8128 DeadInstructions.insert(IndUpdate); 8129 8130 // We record as "Dead" also the type-casting instructions we had identified 8131 // during induction analysis. We don't need any handling for them in the 8132 // vectorized loop because we have proven that, under a proper runtime 8133 // test guarding the vectorized loop, the value of the phi, and the casted 8134 // value of the phi, are the same. The last instruction in this casting chain 8135 // will get its scalar/vector/widened def from the scalar/vector/widened def 8136 // of the respective phi node. Any other casts in the induction def-use chain 8137 // have no other uses outside the phi update chain, and will be ignored. 8138 InductionDescriptor &IndDes = Induction.second; 8139 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8140 DeadInstructions.insert(Casts.begin(), Casts.end()); 8141 } 8142 } 8143 8144 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8145 8146 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8147 8148 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 8149 Instruction::BinaryOps BinOp) { 8150 // When unrolling and the VF is 1, we only need to add a simple scalar. 
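// For example, for unroll part 2 with induction value %iv and step %s this
// simply emits '%iv + 2 * %s' (or the floating-point equivalent below),
// rather than the <VF x ...> step vector built on the widened path.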
8151 Type *Ty = Val->getType(); 8152 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8153 8154 if (Ty->isFloatingPointTy()) { 8155 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8156 8157 // Floating-point operations inherit FMF via the builder's flags. 8158 Value *MulOp = Builder.CreateFMul(C, Step); 8159 return Builder.CreateBinOp(BinOp, Val, MulOp); 8160 } 8161 Constant *C = ConstantInt::get(Ty, StartIdx); 8162 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8163 } 8164 8165 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8166 SmallVector<Metadata *, 4> MDs; 8167 // Reserve first location for self reference to the LoopID metadata node. 8168 MDs.push_back(nullptr); 8169 bool IsUnrollMetadata = false; 8170 MDNode *LoopID = L->getLoopID(); 8171 if (LoopID) { 8172 // First find existing loop unrolling disable metadata. 8173 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8174 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8175 if (MD) { 8176 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8177 IsUnrollMetadata = 8178 S && S->getString().startswith("llvm.loop.unroll.disable"); 8179 } 8180 MDs.push_back(LoopID->getOperand(i)); 8181 } 8182 } 8183 8184 if (!IsUnrollMetadata) { 8185 // Add runtime unroll disable metadata. 8186 LLVMContext &Context = L->getHeader()->getContext(); 8187 SmallVector<Metadata *, 1> DisableOperands; 8188 DisableOperands.push_back( 8189 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8190 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8191 MDs.push_back(DisableNode); 8192 MDNode *NewLoopID = MDNode::get(Context, MDs); 8193 // Set operand 0 to refer to the loop id itself. 8194 NewLoopID->replaceOperandWith(0, NewLoopID); 8195 L->setLoopID(NewLoopID); 8196 } 8197 } 8198 8199 //===--------------------------------------------------------------------===// 8200 // EpilogueVectorizerMainLoop 8201 //===--------------------------------------------------------------------===// 8202 8203 /// This function is partially responsible for generating the control flow 8204 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8205 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8206 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8207 Loop *Lp = createVectorLoopSkeleton(""); 8208 8209 // Generate the code to check the minimum iteration count of the vector 8210 // epilogue (see below). 8211 EPI.EpilogueIterationCountCheck = 8212 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8213 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8214 8215 // Generate the code to check any assumptions that we've made for SCEV 8216 // expressions. 8217 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8218 8219 // Generate the code that checks at runtime if arrays overlap. We put the 8220 // checks into a separate block to make the more common case of few elements 8221 // faster. 8222 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8223 8224 // Generate the iteration count check for the main loop, *after* the check 8225 // for the epilogue loop, so that the path-length is shorter for the case 8226 // that goes directly through the vector epilogue. The longer-path length for 8227 // the main loop is compensated for, by the gain from vectorizing the larger 8228 // trip count. Note: the branch will get updated later on when we vectorize 8229 // the epilogue. 
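// The resulting order of checks on entry is therefore: the epilogue
// min-iteration check ("iter.check"), the SCEV and memory runtime checks,
// and finally this main-loop min-iteration check
// ("vector.main.loop.iter.check") guarding the main vector loop.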
8230 EPI.MainLoopIterationCountCheck = 8231 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8232 8233 // Generate the induction variable. 8234 OldInduction = Legal->getPrimaryInduction(); 8235 Type *IdxTy = Legal->getWidestInductionType(); 8236 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8237 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8238 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8239 EPI.VectorTripCount = CountRoundDown; 8240 Induction = 8241 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8242 getDebugLocFromInstOrOperands(OldInduction)); 8243 8244 // Skip induction resume value creation here because they will be created in 8245 // the second pass. If we created them here, they wouldn't be used anyway, 8246 // because the vplan in the second pass still contains the inductions from the 8247 // original loop. 8248 8249 return completeLoopSkeleton(Lp, OrigLoopID); 8250 } 8251 8252 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8253 LLVM_DEBUG({ 8254 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8255 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8256 << ", Main Loop UF:" << EPI.MainLoopUF 8257 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8258 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8259 }); 8260 } 8261 8262 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8263 DEBUG_WITH_TYPE(VerboseDebug, { 8264 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8265 }); 8266 } 8267 8268 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8269 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8270 assert(L && "Expected valid Loop."); 8271 assert(Bypass && "Expected valid bypass basic block."); 8272 unsigned VFactor = 8273 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8274 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8275 Value *Count = getOrCreateTripCount(L); 8276 // Reuse existing vector loop preheader for TC checks. 8277 // Note that new preheader block is generated for vector loop. 8278 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8279 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8280 8281 // Generate code to check if the loop's trip count is less than VF * UF of the 8282 // main vector loop. 8283 auto P = 8284 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8285 8286 Value *CheckMinIters = Builder.CreateICmp( 8287 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8288 "min.iters.check"); 8289 8290 if (!ForEpilogue) 8291 TCCheckBlock->setName("vector.main.loop.iter.check"); 8292 8293 // Create new preheader for vector loop. 8294 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8295 DT, LI, nullptr, "vector.ph"); 8296 8297 if (ForEpilogue) { 8298 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8299 DT->getNode(Bypass)->getIDom()) && 8300 "TC check is expected to dominate Bypass"); 8301 8302 // Update dominator for Bypass & LoopExit. 8303 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8304 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8305 8306 LoopBypassBlocks.push_back(TCCheckBlock); 8307 8308 // Save the trip count so we don't have to regenerate it in the 8309 // vec.epilog.iter.check. This is safe to do because the trip count 8310 // generated here dominates the vector epilog iter check. 
8311 EPI.TripCount = Count; 8312 } 8313 8314 ReplaceInstWithInst( 8315 TCCheckBlock->getTerminator(), 8316 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8317 8318 return TCCheckBlock; 8319 } 8320 8321 //===--------------------------------------------------------------------===// 8322 // EpilogueVectorizerEpilogueLoop 8323 //===--------------------------------------------------------------------===// 8324 8325 /// This function is partially responsible for generating the control flow 8326 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8327 BasicBlock * 8328 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8329 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8330 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8331 8332 // Now, compare the remaining count and if there aren't enough iterations to 8333 // execute the vectorized epilogue skip to the scalar part. 8334 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8335 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8336 LoopVectorPreHeader = 8337 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8338 LI, nullptr, "vec.epilog.ph"); 8339 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8340 VecEpilogueIterationCountCheck); 8341 8342 // Adjust the control flow taking the state info from the main loop 8343 // vectorization into account. 8344 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8345 "expected this to be saved from the previous pass."); 8346 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8347 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8348 8349 DT->changeImmediateDominator(LoopVectorPreHeader, 8350 EPI.MainLoopIterationCountCheck); 8351 8352 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8353 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8354 8355 if (EPI.SCEVSafetyCheck) 8356 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8357 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8358 if (EPI.MemSafetyCheck) 8359 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8360 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8361 8362 DT->changeImmediateDominator( 8363 VecEpilogueIterationCountCheck, 8364 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8365 8366 DT->changeImmediateDominator(LoopScalarPreHeader, 8367 EPI.EpilogueIterationCountCheck); 8368 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8369 8370 // Keep track of bypass blocks, as they feed start values to the induction 8371 // phis in the scalar loop preheader. 8372 if (EPI.SCEVSafetyCheck) 8373 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8374 if (EPI.MemSafetyCheck) 8375 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8376 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8377 8378 // Generate a resume induction for the vector epilogue and put it in the 8379 // vector epilogue preheader 8380 Type *IdxTy = Legal->getWidestInductionType(); 8381 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8382 LoopVectorPreHeader->getFirstNonPHI()); 8383 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8384 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8385 EPI.MainLoopIterationCountCheck); 8386 8387 // Generate the induction variable. 
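// The induction starts at EPResumeVal: the main vector loop's trip count when
// the main loop was executed, or zero when it was bypassed (see the incoming
// values added above).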
8388 OldInduction = Legal->getPrimaryInduction();
8389 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8390 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8391 Value *StartIdx = EPResumeVal;
8392 Induction =
8393 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8394 getDebugLocFromInstOrOperands(OldInduction));
8395
8396 // Generate induction resume values. These variables save the new starting
8397 // indexes for the scalar loop. They are used to test if there are any tail
8398 // iterations left once the vector loop has completed.
8399 // Note that when the vectorized epilogue is skipped due to the iteration
8400 // count check, then the resume value for the induction variable comes from
8401 // the trip count of the main vector loop, hence passing the AdditionalBypass
8402 // argument.
8403 createInductionResumeValues(Lp, CountRoundDown,
8404 {VecEpilogueIterationCountCheck,
8405 EPI.VectorTripCount} /* AdditionalBypass */);
8406
8407 AddRuntimeUnrollDisableMetaData(Lp);
8408 return completeLoopSkeleton(Lp, OrigLoopID);
8409 }
8410
8411 BasicBlock *
8412 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8413 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8414
8415 assert(EPI.TripCount &&
8416 "Expected trip count to have been saved in the first pass.");
8417 assert(
8418 (!isa<Instruction>(EPI.TripCount) ||
8419 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8420 "saved trip count does not dominate insertion point.");
8421 Value *TC = EPI.TripCount;
8422 IRBuilder<> Builder(Insert->getTerminator());
8423 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8424
8425 // Generate code to check if the loop's trip count is less than VF * UF of the
8426 // vector epilogue loop.
8427 auto P =
8428 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8429 8430 Value *CheckMinIters = Builder.CreateICmp( 8431 P, Count, 8432 ConstantInt::get(Count->getType(), 8433 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8434 "min.epilog.iters.check"); 8435 8436 ReplaceInstWithInst( 8437 Insert->getTerminator(), 8438 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8439 8440 LoopBypassBlocks.push_back(Insert); 8441 return Insert; 8442 } 8443 8444 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8445 LLVM_DEBUG({ 8446 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8447 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8448 << ", Main Loop UF:" << EPI.MainLoopUF 8449 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8450 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8451 }); 8452 } 8453 8454 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8455 DEBUG_WITH_TYPE(VerboseDebug, { 8456 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8457 }); 8458 } 8459 8460 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8461 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8462 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8463 bool PredicateAtRangeStart = Predicate(Range.Start); 8464 8465 for (ElementCount TmpVF = Range.Start * 2; 8466 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8467 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8468 Range.End = TmpVF; 8469 break; 8470 } 8471 8472 return PredicateAtRangeStart; 8473 } 8474 8475 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8476 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8477 /// of VF's starting at a given VF and extending it as much as possible. Each 8478 /// vectorization decision can potentially shorten this sub-range during 8479 /// buildVPlan(). 8480 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8481 ElementCount MaxVF) { 8482 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8483 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8484 VFRange SubRange = {VF, MaxVFPlusOne}; 8485 VPlans.push_back(buildVPlan(SubRange)); 8486 VF = SubRange.End; 8487 } 8488 } 8489 8490 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8491 VPlanPtr &Plan) { 8492 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8493 8494 // Look for cached value. 8495 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8496 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8497 if (ECEntryIt != EdgeMaskCache.end()) 8498 return ECEntryIt->second; 8499 8500 VPValue *SrcMask = createBlockInMask(Src, Plan); 8501 8502 // The terminator has to be a branch inst! 8503 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8504 assert(BI && "Unexpected terminator found"); 8505 8506 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8507 return EdgeMaskCache[Edge] = SrcMask; 8508 8509 // If source is an exiting block, we know the exit edge is dynamically dead 8510 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8511 // adding uses of an otherwise potentially dead instruction. 
8512 if (OrigLoop->isLoopExiting(Src)) 8513 return EdgeMaskCache[Edge] = SrcMask; 8514 8515 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8516 assert(EdgeMask && "No Edge Mask found for condition"); 8517 8518 if (BI->getSuccessor(0) != Dst) 8519 EdgeMask = Builder.createNot(EdgeMask); 8520 8521 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8522 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8523 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8524 // The select version does not introduce new UB if SrcMask is false and 8525 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8526 VPValue *False = Plan->getOrAddVPValue( 8527 ConstantInt::getFalse(BI->getCondition()->getType())); 8528 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8529 } 8530 8531 return EdgeMaskCache[Edge] = EdgeMask; 8532 } 8533 8534 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8535 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8536 8537 // Look for cached value. 8538 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8539 if (BCEntryIt != BlockMaskCache.end()) 8540 return BCEntryIt->second; 8541 8542 // All-one mask is modelled as no-mask following the convention for masked 8543 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8544 VPValue *BlockMask = nullptr; 8545 8546 if (OrigLoop->getHeader() == BB) { 8547 if (!CM.blockNeedsPredication(BB)) 8548 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8549 8550 // Create the block in mask as the first non-phi instruction in the block. 8551 VPBuilder::InsertPointGuard Guard(Builder); 8552 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8553 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8554 8555 // Introduce the early-exit compare IV <= BTC to form header block mask. 8556 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8557 // Start by constructing the desired canonical IV. 8558 VPValue *IV = nullptr; 8559 if (Legal->getPrimaryInduction()) 8560 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8561 else { 8562 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8563 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8564 IV = IVRecipe->getVPSingleValue(); 8565 } 8566 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8567 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8568 8569 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8570 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8571 // as a second argument, we only pass the IV here and extract the 8572 // tripcount from the transform state where codegen of the VP instructions 8573 // happen. 8574 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8575 } else { 8576 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8577 } 8578 return BlockMaskCache[BB] = BlockMask; 8579 } 8580 8581 // This is the block mask. We OR all incoming edges. 8582 for (auto *Predecessor : predecessors(BB)) { 8583 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8584 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8585 return BlockMaskCache[BB] = EdgeMask; 8586 8587 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
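// The first edge mask seen becomes the initial block mask; further edge
// masks are OR'ed into it below.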
8588 BlockMask = EdgeMask; 8589 continue; 8590 } 8591 8592 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8593 } 8594 8595 return BlockMaskCache[BB] = BlockMask; 8596 } 8597 8598 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8599 ArrayRef<VPValue *> Operands, 8600 VFRange &Range, 8601 VPlanPtr &Plan) { 8602 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8603 "Must be called with either a load or store"); 8604 8605 auto willWiden = [&](ElementCount VF) -> bool { 8606 if (VF.isScalar()) 8607 return false; 8608 LoopVectorizationCostModel::InstWidening Decision = 8609 CM.getWideningDecision(I, VF); 8610 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8611 "CM decision should be taken at this point."); 8612 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8613 return true; 8614 if (CM.isScalarAfterVectorization(I, VF) || 8615 CM.isProfitableToScalarize(I, VF)) 8616 return false; 8617 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8618 }; 8619 8620 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8621 return nullptr; 8622 8623 VPValue *Mask = nullptr; 8624 if (Legal->isMaskRequired(I)) 8625 Mask = createBlockInMask(I->getParent(), Plan); 8626 8627 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8628 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8629 8630 StoreInst *Store = cast<StoreInst>(I); 8631 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8632 Mask); 8633 } 8634 8635 VPWidenIntOrFpInductionRecipe * 8636 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8637 ArrayRef<VPValue *> Operands) const { 8638 // Check if this is an integer or fp induction. If so, build the recipe that 8639 // produces its scalar and vector values. 8640 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8641 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8642 II.getKind() == InductionDescriptor::IK_FpInduction) { 8643 assert(II.getStartValue() == 8644 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8645 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8646 return new VPWidenIntOrFpInductionRecipe( 8647 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8648 } 8649 8650 return nullptr; 8651 } 8652 8653 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8654 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8655 VPlan &Plan) const { 8656 // Optimize the special case where the source is a constant integer 8657 // induction variable. Notice that we can only optimize the 'trunc' case 8658 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8659 // (c) other casts depend on pointer size. 8660 8661 // Determine whether \p K is a truncation based on an induction variable that 8662 // can be optimized. 
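// (For instance, a `trunc i64 %iv to i32` of the primary induction can be
// modelled as an induction generated directly in the narrower type; this is
// only an illustrative example, the actual legality check is performed by
// CM.isOptimizableIVTruncate below.)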
8663 auto isOptimizableIVTruncate = 8664 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8665 return [=](ElementCount VF) -> bool { 8666 return CM.isOptimizableIVTruncate(K, VF); 8667 }; 8668 }; 8669 8670 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8671 isOptimizableIVTruncate(I), Range)) { 8672 8673 InductionDescriptor II = 8674 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8675 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8676 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8677 Start, nullptr, I); 8678 } 8679 return nullptr; 8680 } 8681 8682 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8683 ArrayRef<VPValue *> Operands, 8684 VPlanPtr &Plan) { 8685 // If all incoming values are equal, the incoming VPValue can be used directly 8686 // instead of creating a new VPBlendRecipe. 8687 VPValue *FirstIncoming = Operands[0]; 8688 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8689 return FirstIncoming == Inc; 8690 })) { 8691 return Operands[0]; 8692 } 8693 8694 // We know that all PHIs in non-header blocks are converted into selects, so 8695 // we don't have to worry about the insertion order and we can just use the 8696 // builder. At this point we generate the predication tree. There may be 8697 // duplications since this is a simple recursive scan, but future 8698 // optimizations will clean it up. 8699 SmallVector<VPValue *, 2> OperandsWithMask; 8700 unsigned NumIncoming = Phi->getNumIncomingValues(); 8701 8702 for (unsigned In = 0; In < NumIncoming; In++) { 8703 VPValue *EdgeMask = 8704 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8705 assert((EdgeMask || NumIncoming == 1) && 8706 "Multiple predecessors with one having a full mask"); 8707 OperandsWithMask.push_back(Operands[In]); 8708 if (EdgeMask) 8709 OperandsWithMask.push_back(EdgeMask); 8710 } 8711 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8712 } 8713 8714 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8715 ArrayRef<VPValue *> Operands, 8716 VFRange &Range) const { 8717 8718 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8719 [this, CI](ElementCount VF) { 8720 return CM.isScalarWithPredication(CI, VF); 8721 }, 8722 Range); 8723 8724 if (IsPredicated) 8725 return nullptr; 8726 8727 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8728 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8729 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8730 ID == Intrinsic::pseudoprobe || 8731 ID == Intrinsic::experimental_noalias_scope_decl)) 8732 return nullptr; 8733 8734 auto willWiden = [&](ElementCount VF) -> bool { 8735 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8736 // The following case may be scalarized depending on the VF. 8737 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8738 // version of the instruction. 8739 // Is it beneficial to perform intrinsic call compared to lib call? 8740 bool NeedToScalarize = false; 8741 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8742 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8743 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8744 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 8745 "Either the intrinsic cost or vector call cost must be valid"); 8746 return UseVectorIntrinsic || !NeedToScalarize; 8747 }; 8748 8749 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8750 return nullptr; 8751 8752 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8753 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8754 } 8755 8756 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8757 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8758 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8759 // Instruction should be widened, unless it is scalar after vectorization, 8760 // scalarization is profitable or it is predicated. 8761 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8762 return CM.isScalarAfterVectorization(I, VF) || 8763 CM.isProfitableToScalarize(I, VF) || 8764 CM.isScalarWithPredication(I, VF); 8765 }; 8766 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8767 Range); 8768 } 8769 8770 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8771 ArrayRef<VPValue *> Operands) const { 8772 auto IsVectorizableOpcode = [](unsigned Opcode) { 8773 switch (Opcode) { 8774 case Instruction::Add: 8775 case Instruction::And: 8776 case Instruction::AShr: 8777 case Instruction::BitCast: 8778 case Instruction::FAdd: 8779 case Instruction::FCmp: 8780 case Instruction::FDiv: 8781 case Instruction::FMul: 8782 case Instruction::FNeg: 8783 case Instruction::FPExt: 8784 case Instruction::FPToSI: 8785 case Instruction::FPToUI: 8786 case Instruction::FPTrunc: 8787 case Instruction::FRem: 8788 case Instruction::FSub: 8789 case Instruction::ICmp: 8790 case Instruction::IntToPtr: 8791 case Instruction::LShr: 8792 case Instruction::Mul: 8793 case Instruction::Or: 8794 case Instruction::PtrToInt: 8795 case Instruction::SDiv: 8796 case Instruction::Select: 8797 case Instruction::SExt: 8798 case Instruction::Shl: 8799 case Instruction::SIToFP: 8800 case Instruction::SRem: 8801 case Instruction::Sub: 8802 case Instruction::Trunc: 8803 case Instruction::UDiv: 8804 case Instruction::UIToFP: 8805 case Instruction::URem: 8806 case Instruction::Xor: 8807 case Instruction::ZExt: 8808 return true; 8809 } 8810 return false; 8811 }; 8812 8813 if (!IsVectorizableOpcode(I->getOpcode())) 8814 return nullptr; 8815 8816 // Success: widen this instruction. 
8817 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8818 } 8819 8820 void VPRecipeBuilder::fixHeaderPhis() { 8821 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8822 for (VPWidenPHIRecipe *R : PhisToFix) { 8823 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8824 VPRecipeBase *IncR = 8825 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8826 R->addOperand(IncR->getVPSingleValue()); 8827 } 8828 } 8829 8830 VPBasicBlock *VPRecipeBuilder::handleReplication( 8831 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8832 VPlanPtr &Plan) { 8833 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8834 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8835 Range); 8836 8837 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8838 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); }, Range); 8839 8840 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8841 IsUniform, IsPredicated); 8842 setRecipe(I, Recipe); 8843 Plan->addVPValue(I, Recipe); 8844 8845 // Find if I uses a predicated instruction. If so, it will use its scalar 8846 // value. Avoid hoisting the insert-element which packs the scalar value into 8847 // a vector value, as that happens iff all users use the vector value. 8848 for (VPValue *Op : Recipe->operands()) { 8849 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8850 if (!PredR) 8851 continue; 8852 auto *RepR = 8853 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8854 assert(RepR->isPredicated() && 8855 "expected Replicate recipe to be predicated"); 8856 RepR->setAlsoPack(false); 8857 } 8858 8859 // Finalize the recipe for Instr, first if it is not predicated. 8860 if (!IsPredicated) { 8861 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8862 VPBB->appendRecipe(Recipe); 8863 return VPBB; 8864 } 8865 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8866 assert(VPBB->getSuccessors().empty() && 8867 "VPBB has successors when handling predicated replication."); 8868 // Record predicated instructions for above packing optimizations. 8869 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8870 VPBlockUtils::insertBlockAfter(Region, VPBB); 8871 auto *RegSucc = new VPBasicBlock(); 8872 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8873 return RegSucc; 8874 } 8875 8876 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8877 VPRecipeBase *PredRecipe, 8878 VPlanPtr &Plan) { 8879 // Instructions marked for predication are replicated and placed under an 8880 // if-then construct to prevent side-effects. 8881 8882 // Generate recipes to compute the block mask for this region. 8883 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8884 8885 // Build the triangular if-then region. 8886 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8887 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8888 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8889 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8890 auto *PHIRecipe = Instr->getType()->isVoidTy() 8891 ? 
nullptr 8892 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8893 if (PHIRecipe) { 8894 Plan->removeVPValueFor(Instr); 8895 Plan->addVPValue(Instr, PHIRecipe); 8896 } 8897 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8898 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8899 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8900 8901 // Note: first set Entry as region entry and then connect successors starting 8902 // from it in order, to propagate the "parent" of each VPBasicBlock. 8903 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8904 VPBlockUtils::connectBlocks(Pred, Exit); 8905 8906 return Region; 8907 } 8908 8909 VPRecipeOrVPValueTy 8910 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8911 ArrayRef<VPValue *> Operands, 8912 VFRange &Range, VPlanPtr &Plan) { 8913 // First, check for specific widening recipes that deal with calls, memory 8914 // operations, inductions and Phi nodes. 8915 if (auto *CI = dyn_cast<CallInst>(Instr)) 8916 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8917 8918 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8919 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8920 8921 VPRecipeBase *Recipe; 8922 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8923 if (Phi->getParent() != OrigLoop->getHeader()) 8924 return tryToBlend(Phi, Operands, Plan); 8925 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8926 return toVPRecipeResult(Recipe); 8927 8928 if (Legal->isReductionVariable(Phi)) { 8929 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8930 assert(RdxDesc.getRecurrenceStartValue() == 8931 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8932 VPValue *StartV = Operands[0]; 8933 8934 auto *PhiRecipe = new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); 8935 PhisToFix.push_back(PhiRecipe); 8936 // Record the incoming value from the backedge, so we can add the incoming 8937 // value from the backedge after all recipes have been created. 8938 recordRecipeOf(cast<Instruction>( 8939 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8940 return toVPRecipeResult(PhiRecipe); 8941 } 8942 8943 return toVPRecipeResult(new VPWidenPHIRecipe(Phi)); 8944 } 8945 8946 if (isa<TruncInst>(Instr) && 8947 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8948 Range, *Plan))) 8949 return toVPRecipeResult(Recipe); 8950 8951 if (!shouldWiden(Instr, Range)) 8952 return nullptr; 8953 8954 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8955 return toVPRecipeResult(new VPWidenGEPRecipe( 8956 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8957 8958 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8959 bool InvariantCond = 8960 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8961 return toVPRecipeResult(new VPWidenSelectRecipe( 8962 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8963 } 8964 8965 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8966 } 8967 8968 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8969 ElementCount MaxVF) { 8970 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8971 8972 // Collect instructions from the original loop that will become trivially dead 8973 // in the vectorized loop. We don't need to vectorize these instructions. 
For
8974 // example, original induction update instructions can become dead because we
8975 // separately emit induction "steps" when generating code for the new loop.
8976 // Similarly, we create a new latch condition when setting up the structure
8977 // of the new loop, so the old one can become dead.
8978 SmallPtrSet<Instruction *, 4> DeadInstructions;
8979 collectTriviallyDeadInstructions(DeadInstructions);
8980
8981 // Add assume instructions we need to drop to DeadInstructions, to prevent
8982 // them from being added to the VPlan.
8983 // TODO: We only need to drop assumes in blocks that get flattened. If the
8984 // control flow is preserved, we should keep them.
8985 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8986 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8987
8988 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8989 // Dead instructions do not need sinking. Remove them from SinkAfter.
8990 for (Instruction *I : DeadInstructions)
8991 SinkAfter.erase(I);
8992
8993 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8994 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8995 VFRange SubRange = {VF, MaxVFPlusOne};
8996 VPlans.push_back(
8997 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8998 VF = SubRange.End;
8999 }
9000 }
9001
9002 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
9003 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
9004 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
9005
9006 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9007
9008 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
9009
9010 // ---------------------------------------------------------------------------
9011 // Pre-construction: record ingredients whose recipes we'll need to further
9012 // process after constructing the initial VPlan.
9013 // ---------------------------------------------------------------------------
9014
9015 // Mark instructions we'll need to sink later and their targets as
9016 // ingredients whose recipe we'll need to record.
9017 for (auto &Entry : SinkAfter) {
9018 RecipeBuilder.recordRecipeOf(Entry.first);
9019 RecipeBuilder.recordRecipeOf(Entry.second);
9020 }
9021 for (auto &Reduction : CM.getInLoopReductionChains()) {
9022 PHINode *Phi = Reduction.first;
9023 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
9024 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9025
9026 RecipeBuilder.recordRecipeOf(Phi);
9027 for (auto &R : ReductionOperations) {
9028 RecipeBuilder.recordRecipeOf(R);
9029 // For min/max reductions, where we have a pair of icmp/select, we also
9030 // need to record the ICmp recipe, so it can be removed later.
9031 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9032 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9033 }
9034 }
9035
9036 // For each interleave group which is relevant for this (possibly trimmed)
9037 // Range, add it to the set of groups to be later applied to the VPlan and add
9038 // placeholders for its members' Recipes which we'll be replacing with a
9039 // single VPInterleaveRecipe.
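// (As an illustrative example: for a factor-2 group such as accesses to
// A[2*i] and A[2*i+1], both members are later emitted as one wide load/store
// plus shuffles by that single VPInterleaveRecipe.)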
9040 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9041 auto applyIG = [IG, this](ElementCount VF) -> bool {
9042 return (VF.isVector() && // Query is illegal for VF == 1
9043 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9044 LoopVectorizationCostModel::CM_Interleave);
9045 };
9046 if (!getDecisionAndClampRange(applyIG, Range))
9047 continue;
9048 InterleaveGroups.insert(IG);
9049 for (unsigned i = 0; i < IG->getFactor(); i++)
9050 if (Instruction *Member = IG->getMember(i))
9051 RecipeBuilder.recordRecipeOf(Member);
9052 }
9053
9054 // ---------------------------------------------------------------------------
9055 // Build initial VPlan: Scan the body of the loop in a topological order to
9056 // visit each basic block after having visited its predecessor basic blocks.
9057 // ---------------------------------------------------------------------------
9058
9059 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
9060 auto Plan = std::make_unique<VPlan>();
9061 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
9062 Plan->setEntry(VPBB);
9063
9064 // Scan the body of the loop in a topological order to visit each basic block
9065 // after having visited its predecessor basic blocks.
9066 LoopBlocksDFS DFS(OrigLoop);
9067 DFS.perform(LI);
9068
9069 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9070 // Relevant instructions from basic block BB will be grouped into VPRecipe
9071 // ingredients and fill a new VPBasicBlock.
9072 unsigned VPBBsForBB = 0;
9073 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9074 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9075 VPBB = FirstVPBBForBB;
9076 Builder.setInsertPoint(VPBB);
9077
9078 // Introduce each ingredient into VPlan.
9079 // TODO: Model and preserve debug intrinsics in VPlan.
9080 for (Instruction &I : BB->instructionsWithoutDebug()) {
9081 Instruction *Instr = &I;
9082
9083 // First filter out irrelevant instructions, to ensure no recipes are
9084 // built for them.
9085 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9086 continue;
9087
9088 SmallVector<VPValue *, 4> Operands;
9089 auto *Phi = dyn_cast<PHINode>(Instr);
9090 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9091 Operands.push_back(Plan->getOrAddVPValue(
9092 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9093 } else {
9094 auto OpRange = Plan->mapToVPValues(Instr->operands());
9095 Operands = {OpRange.begin(), OpRange.end()};
9096 }
9097 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9098 Instr, Operands, Range, Plan)) {
9099 // If Instr can be simplified to an existing VPValue, use it.
9100 if (RecipeOrValue.is<VPValue *>()) {
9101 auto *VPV = RecipeOrValue.get<VPValue *>();
9102 Plan->addVPValue(Instr, VPV);
9103 // If the re-used value is a recipe, register the recipe for the
9104 // instruction, in case the recipe for Instr needs to be recorded.
9105 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9106 RecipeBuilder.setRecipe(Instr, R);
9107 continue;
9108 }
9109 // Otherwise, add the new recipe.
9110 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9111 for (auto *Def : Recipe->definedValues()) {
9112 auto *UV = Def->getUnderlyingValue();
9113 Plan->addVPValue(UV, Def);
9114 }
9115
9116 RecipeBuilder.setRecipe(Instr, Recipe);
9117 VPBB->appendRecipe(Recipe);
9118 continue;
9119 }
9120
9121 // Otherwise, if all widening options failed, Instruction is to be
9122 // replicated. This may create a successor for VPBB.
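// (A new successor is needed when the replicated instruction is predicated:
// handleReplication then wraps it in an if-then replicate region, and recipes
// that follow must go into a fresh block after that region.)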
9123 VPBasicBlock *NextVPBB = 9124 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9125 if (NextVPBB != VPBB) { 9126 VPBB = NextVPBB; 9127 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9128 : ""); 9129 } 9130 } 9131 } 9132 9133 RecipeBuilder.fixHeaderPhis(); 9134 9135 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9136 // may also be empty, such as the last one VPBB, reflecting original 9137 // basic-blocks with no recipes. 9138 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9139 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9140 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9141 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9142 delete PreEntry; 9143 9144 // --------------------------------------------------------------------------- 9145 // Transform initial VPlan: Apply previously taken decisions, in order, to 9146 // bring the VPlan to its final state. 9147 // --------------------------------------------------------------------------- 9148 9149 // Apply Sink-After legal constraints. 9150 for (auto &Entry : SinkAfter) { 9151 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9152 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9153 9154 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9155 auto *Region = 9156 dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9157 if (Region && Region->isReplicator()) 9158 return Region; 9159 return nullptr; 9160 }; 9161 9162 // If the target is in a replication region, make sure to move Sink to the 9163 // block after it, not into the replication region itself. 9164 if (auto *TargetRegion = GetReplicateRegion(Target)) { 9165 assert(TargetRegion->getNumSuccessors() == 1 && "Expected SESE region!"); 9166 assert(!GetReplicateRegion(Sink) && 9167 "cannot sink a region into another region yet"); 9168 VPBasicBlock *NextBlock = 9169 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9170 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9171 continue; 9172 } 9173 9174 auto *SinkRegion = GetReplicateRegion(Sink); 9175 // Unless the sink source is in a replicate region, sink the recipe 9176 // directly. 9177 if (!SinkRegion) { 9178 Sink->moveAfter(Target); 9179 continue; 9180 } 9181 9182 // If the sink source is in a replicate region, we need to move the whole 9183 // replicate region, which should only contain a single recipe in the main 9184 // block. 9185 assert(Sink->getParent()->size() == 1 && 9186 "parent must be a replicator with a single recipe"); 9187 auto *SplitBlock = 9188 Target->getParent()->splitAt(std::next(Target->getIterator())); 9189 9190 auto *Pred = SinkRegion->getSinglePredecessor(); 9191 auto *Succ = SinkRegion->getSingleSuccessor(); 9192 VPBlockUtils::disconnectBlocks(Pred, SinkRegion); 9193 VPBlockUtils::disconnectBlocks(SinkRegion, Succ); 9194 VPBlockUtils::connectBlocks(Pred, Succ); 9195 9196 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9197 9198 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9199 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9200 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9201 if (VPBB == SplitPred) 9202 VPBB = SplitBlock; 9203 } 9204 9205 // Interleave memory: for each Interleave Group we marked earlier as relevant 9206 // for this VPlan, replace the Recipes widening its memory instructions with a 9207 // single VPInterleaveRecipe at its insertion point. 
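// The other members' recipes are erased below; any values they defined are
// rewired to the corresponding results of the new VPInterleaveRecipe.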
9208 for (auto IG : InterleaveGroups) { 9209 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9210 RecipeBuilder.getRecipe(IG->getInsertPos())); 9211 SmallVector<VPValue *, 4> StoredValues; 9212 for (unsigned i = 0; i < IG->getFactor(); ++i) 9213 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 9214 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 9215 9216 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9217 Recipe->getMask()); 9218 VPIG->insertBefore(Recipe); 9219 unsigned J = 0; 9220 for (unsigned i = 0; i < IG->getFactor(); ++i) 9221 if (Instruction *Member = IG->getMember(i)) { 9222 if (!Member->getType()->isVoidTy()) { 9223 VPValue *OriginalV = Plan->getVPValue(Member); 9224 Plan->removeVPValueFor(Member); 9225 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9226 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9227 J++; 9228 } 9229 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9230 } 9231 } 9232 9233 // Adjust the recipes for any inloop reductions. 9234 if (Range.Start.isVector()) 9235 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 9236 9237 // Finally, if tail is folded by masking, introduce selects between the phi 9238 // and the live-out instruction of each reduction, at the end of the latch. 9239 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 9240 Builder.setInsertPoint(VPBB); 9241 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9242 for (auto &Reduction : Legal->getReductionVars()) { 9243 if (CM.isInLoopReduction(Reduction.first)) 9244 continue; 9245 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 9246 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 9247 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 9248 } 9249 } 9250 9251 std::string PlanName; 9252 raw_string_ostream RSO(PlanName); 9253 ElementCount VF = Range.Start; 9254 Plan->addVF(VF); 9255 RSO << "Initial VPlan for VF={" << VF; 9256 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9257 Plan->addVF(VF); 9258 RSO << "," << VF; 9259 } 9260 RSO << "},UF>=1"; 9261 RSO.flush(); 9262 Plan->setName(PlanName); 9263 9264 return Plan; 9265 } 9266 9267 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9268 // Outer loop handling: They may require CFG and instruction level 9269 // transformations before even evaluating whether vectorization is profitable. 9270 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9271 // the vectorization pipeline. 9272 assert(!OrigLoop->isInnermost()); 9273 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9274 9275 // Create new empty VPlan 9276 auto Plan = std::make_unique<VPlan>(); 9277 9278 // Build hierarchical CFG 9279 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9280 HCFGBuilder.buildHierarchicalCFG(); 9281 9282 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9283 VF *= 2) 9284 Plan->addVF(VF); 9285 9286 if (EnableVPlanPredication) { 9287 VPlanPredicator VPP(*Plan); 9288 VPP.predicate(); 9289 9290 // Avoid running transformation to recipes until masked code generation in 9291 // VPlan-native path is in place. 9292 return Plan; 9293 } 9294 9295 SmallPtrSet<Instruction *, 1> DeadInstructions; 9296 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9297 Legal->getInductionVars(), 9298 DeadInstructions, *PSE.getSE()); 9299 return Plan; 9300 } 9301 9302 // Adjust the recipes for any inloop reductions. 
The chain of instructions
9303 // leading from the loop exit instr to the phi needs to be converted to
9304 // reductions, with one operand being vector and the other being the scalar
9305 // reduction chain.
9306 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
9307 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
9308 for (auto &Reduction : CM.getInLoopReductionChains()) {
9309 PHINode *Phi = Reduction.first;
9310 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9311 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9312
9313 // ReductionOperations are ordered top-down from the phi's use to the
9314 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9315 // which of the two operands will remain scalar and which will be reduced.
9316 // For minmax the chain will be the select instructions.
9317 Instruction *Chain = Phi;
9318 for (Instruction *R : ReductionOperations) {
9319 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9320 RecurKind Kind = RdxDesc.getRecurrenceKind();
9321
9322 VPValue *ChainOp = Plan->getVPValue(Chain);
9323 unsigned FirstOpId;
9324 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9325 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9326 "Expected to replace a VPWidenSelectSC");
9327 FirstOpId = 1;
9328 } else {
9329 assert(isa<VPWidenRecipe>(WidenRecipe) &&
9330 "Expected to replace a VPWidenSC");
9331 FirstOpId = 0;
9332 }
9333 unsigned VecOpId =
9334 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9335 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9336
9337 auto *CondOp = CM.foldTailByMasking()
9338 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9339 : nullptr;
9340 VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9341 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9342 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9343 Plan->removeVPValueFor(R);
9344 Plan->addVPValue(R, RedRecipe);
9345 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9346 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9347 WidenRecipe->eraseFromParent();
9348
9349 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9350 VPRecipeBase *CompareRecipe =
9351 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9352 assert(isa<VPWidenRecipe>(CompareRecipe) &&
9353 "Expected to replace a VPWidenSC");
9354 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9355 "Expected no remaining users");
9356 CompareRecipe->eraseFromParent();
9357 }
9358 Chain = R;
9359 }
9360 }
9361 }
9362
9363 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9364 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9365 VPSlotTracker &SlotTracker) const {
9366 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9367 IG->getInsertPos()->printAsOperand(O, false);
9368 O << ", ";
9369 getAddr()->printAsOperand(O, SlotTracker);
9370 VPValue *Mask = getMask();
9371 if (Mask) {
9372 O << ", ";
9373 Mask->printAsOperand(O, SlotTracker);
9374 }
9375 for (unsigned i = 0; i < IG->getFactor(); ++i)
9376 if (Instruction *I = IG->getMember(i))
9377 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i;
9378 }
9379 #endif
9380
9381 void VPWidenCallRecipe::execute(VPTransformState &State) {
9382 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9383 *this, State);
9384 }
9385
9386 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9387
State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9388 this, *this, InvariantCond, State); 9389 } 9390 9391 void VPWidenRecipe::execute(VPTransformState &State) { 9392 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9393 } 9394 9395 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9396 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9397 *this, State.UF, State.VF, IsPtrLoopInvariant, 9398 IsIndexLoopInvariant, State); 9399 } 9400 9401 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9402 assert(!State.Instance && "Int or FP induction being replicated."); 9403 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9404 getTruncInst(), getVPValue(0), 9405 getCastValue(), State); 9406 } 9407 9408 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9409 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9410 this, State); 9411 } 9412 9413 void VPBlendRecipe::execute(VPTransformState &State) { 9414 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9415 // We know that all PHIs in non-header blocks are converted into 9416 // selects, so we don't have to worry about the insertion order and we 9417 // can just use the builder. 9418 // At this point we generate the predication tree. There may be 9419 // duplications since this is a simple recursive scan, but future 9420 // optimizations will clean it up. 9421 9422 unsigned NumIncoming = getNumIncomingValues(); 9423 9424 // Generate a sequence of selects of the form: 9425 // SELECT(Mask3, In3, 9426 // SELECT(Mask2, In2, 9427 // SELECT(Mask1, In1, 9428 // In0))) 9429 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9430 // are essentially undef are taken from In0. 9431 InnerLoopVectorizer::VectorParts Entry(State.UF); 9432 for (unsigned In = 0; In < NumIncoming; ++In) { 9433 for (unsigned Part = 0; Part < State.UF; ++Part) { 9434 // We might have single edge PHIs (blocks) - use an identity 9435 // 'select' for the first PHI operand. 9436 Value *In0 = State.get(getIncomingValue(In), Part); 9437 if (In == 0) 9438 Entry[Part] = In0; // Initialize with the first incoming value. 9439 else { 9440 // Select between the current value and the previous incoming edge 9441 // based on the incoming mask. 
9442 Value *Cond = State.get(getMask(In), Part); 9443 Entry[Part] = 9444 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9445 } 9446 } 9447 } 9448 for (unsigned Part = 0; Part < State.UF; ++Part) 9449 State.set(this, Entry[Part], Part); 9450 } 9451 9452 void VPInterleaveRecipe::execute(VPTransformState &State) { 9453 assert(!State.Instance && "Interleave group being replicated."); 9454 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9455 getStoredValues(), getMask()); 9456 } 9457 9458 void VPReductionRecipe::execute(VPTransformState &State) { 9459 assert(!State.Instance && "Reduction being replicated."); 9460 Value *PrevInChain = State.get(getChainOp(), 0); 9461 for (unsigned Part = 0; Part < State.UF; ++Part) { 9462 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9463 bool IsOrdered = useOrderedReductions(*RdxDesc); 9464 Value *NewVecOp = State.get(getVecOp(), Part); 9465 if (VPValue *Cond = getCondOp()) { 9466 Value *NewCond = State.get(Cond, Part); 9467 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9468 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9469 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9470 Constant *IdenVec = 9471 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9472 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9473 NewVecOp = Select; 9474 } 9475 Value *NewRed; 9476 Value *NextInChain; 9477 if (IsOrdered) { 9478 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9479 PrevInChain); 9480 PrevInChain = NewRed; 9481 } else { 9482 PrevInChain = State.get(getChainOp(), Part); 9483 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9484 } 9485 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9486 NextInChain = 9487 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9488 NewRed, PrevInChain); 9489 } else if (IsOrdered) 9490 NextInChain = NewRed; 9491 else { 9492 NextInChain = State.Builder.CreateBinOp( 9493 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9494 PrevInChain); 9495 } 9496 State.set(this, NextInChain, Part); 9497 } 9498 } 9499 9500 void VPReplicateRecipe::execute(VPTransformState &State) { 9501 if (State.Instance) { // Generate a single instance. 9502 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9503 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9504 *State.Instance, IsPredicated, State); 9505 // Insert scalar instance packing it into a vector. 9506 if (AlsoPack && State.VF.isVector()) { 9507 // If we're constructing lane 0, initialize to start from poison. 9508 if (State.Instance->Lane.isFirstLane()) { 9509 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9510 Value *Poison = PoisonValue::get( 9511 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9512 State.set(this, Poison, State.Instance->Part); 9513 } 9514 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9515 } 9516 return; 9517 } 9518 9519 // Generate scalar instances for all VF lanes of all UF parts, unless the 9520 // instruction is uniform inwhich case generate only the first lane for each 9521 // of the UF parts. 9522 unsigned EndLane = IsUniform ? 
1 : State.VF.getKnownMinValue(); 9523 assert((!State.VF.isScalable() || IsUniform) && 9524 "Can't scalarize a scalable vector"); 9525 for (unsigned Part = 0; Part < State.UF; ++Part) 9526 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9527 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9528 VPIteration(Part, Lane), IsPredicated, 9529 State); 9530 } 9531 9532 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9533 assert(State.Instance && "Branch on Mask works only on single instance."); 9534 9535 unsigned Part = State.Instance->Part; 9536 unsigned Lane = State.Instance->Lane.getKnownLane(); 9537 9538 Value *ConditionBit = nullptr; 9539 VPValue *BlockInMask = getMask(); 9540 if (BlockInMask) { 9541 ConditionBit = State.get(BlockInMask, Part); 9542 if (ConditionBit->getType()->isVectorTy()) 9543 ConditionBit = State.Builder.CreateExtractElement( 9544 ConditionBit, State.Builder.getInt32(Lane)); 9545 } else // Block in mask is all-one. 9546 ConditionBit = State.Builder.getTrue(); 9547 9548 // Replace the temporary unreachable terminator with a new conditional branch, 9549 // whose two destinations will be set later when they are created. 9550 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9551 assert(isa<UnreachableInst>(CurrentTerminator) && 9552 "Expected to replace unreachable terminator with conditional branch."); 9553 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9554 CondBr->setSuccessor(0, nullptr); 9555 ReplaceInstWithInst(CurrentTerminator, CondBr); 9556 } 9557 9558 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9559 assert(State.Instance && "Predicated instruction PHI works per instance."); 9560 Instruction *ScalarPredInst = 9561 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9562 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9563 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9564 assert(PredicatingBB && "Predicated block has no single predecessor."); 9565 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9566 "operand must be VPReplicateRecipe"); 9567 9568 // By current pack/unpack logic we need to generate only a single phi node: if 9569 // a vector value for the predicated instruction exists at this point it means 9570 // the instruction has vector users only, and a phi for the vector value is 9571 // needed. In this case the recipe of the predicated instruction is marked to 9572 // also do that packing, thereby "hoisting" the insert-element sequence. 9573 // Otherwise, a phi node for the scalar value is needed. 9574 unsigned Part = State.Instance->Part; 9575 if (State.hasVectorValue(getOperand(0), Part)) { 9576 Value *VectorValue = State.get(getOperand(0), Part); 9577 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9578 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9579 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9580 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9581 if (State.hasVectorValue(this, Part)) 9582 State.reset(this, VPhi, Part); 9583 else 9584 State.set(this, VPhi, Part); 9585 // NOTE: Currently we need to update the value of the operand, so the next 9586 // predicated iteration inserts its generated value in the correct vector. 
9587 State.reset(getOperand(0), VPhi, Part); 9588 } else { 9589 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9590 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9591 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9592 PredicatingBB); 9593 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9594 if (State.hasScalarValue(this, *State.Instance)) 9595 State.reset(this, Phi, *State.Instance); 9596 else 9597 State.set(this, Phi, *State.Instance); 9598 // NOTE: Currently we need to update the value of the operand, so the next 9599 // predicated iteration inserts its generated value in the correct vector. 9600 State.reset(getOperand(0), Phi, *State.Instance); 9601 } 9602 } 9603 9604 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9605 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9606 State.ILV->vectorizeMemoryInstruction( 9607 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9608 StoredValue, getMask()); 9609 } 9610 9611 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9612 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9613 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9614 // for predication. 9615 static ScalarEpilogueLowering getScalarEpilogueLowering( 9616 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9617 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9618 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9619 LoopVectorizationLegality &LVL) { 9620 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9621 // don't look at hints or options, and don't request a scalar epilogue. 9622 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9623 // LoopAccessInfo (due to code dependency and not being able to reliably get 9624 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9625 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9626 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9627 // back to the old way and vectorize with versioning when forced. See D81345.) 9628 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9629 PGSOQueryType::IRPass) && 9630 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9631 return CM_ScalarEpilogueNotAllowedOptSize; 9632 9633 // 2) If set, obey the directives 9634 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9635 switch (PreferPredicateOverEpilogue) { 9636 case PreferPredicateTy::ScalarEpilogue: 9637 return CM_ScalarEpilogueAllowed; 9638 case PreferPredicateTy::PredicateElseScalarEpilogue: 9639 return CM_ScalarEpilogueNotNeededUsePredicate; 9640 case PreferPredicateTy::PredicateOrDontVectorize: 9641 return CM_ScalarEpilogueNotAllowedUsePredicate; 9642 }; 9643 } 9644 9645 // 3) If set, obey the hints 9646 switch (Hints.getPredicate()) { 9647 case LoopVectorizeHints::FK_Enabled: 9648 return CM_ScalarEpilogueNotNeededUsePredicate; 9649 case LoopVectorizeHints::FK_Disabled: 9650 return CM_ScalarEpilogueAllowed; 9651 }; 9652 9653 // 4) if the TTI hook indicates this is profitable, request predication. 
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));

  // Set the insert point after the last scalarized instruction. This
  // ensures the insertelement sequence will directly follow the scalar
  // definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP = std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from undef.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Undef, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

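// Illustrative sketch (not generated verbatim): with a fixed VF of 4, a
// non-uniform replicated definition %d whose lanes are %d.0 .. %d.3 is packed
// on its first vector use roughly as
//   %p0 = insertelement <4 x i32> poison, i32 %d.0, i32 0
//   %p1 = insertelement <4 x i32> %p0, i32 %d.1, i32 1
//   ...
// (the i32 element type is only an example), whereas a uniform definition is
// simply broadcast from its lane-0 scalar.
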
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with double precision there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
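  // (Illustrative pattern: a chain such as
  //    %e = fpext float %x to double
  //    %d = fdiv double %e, %y
  //    %t = fptrunc double %d to float
  //    store float %t, float* %p
  //  makes the vectorized loop mix <N x float> and <N x double> operations.)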
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
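  // (This covers the optsize/minsize function attributes as well as
  // profile-guided size optimization of cold code based on PSI/BFI.)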
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!Requirements.canVectorizeFPMath(Hints)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
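  // (Illustrative: the two strided accesses in a loop body such as
  //    Sum += A[2 * i] + A[2 * i + 1];
  //  form an interleave group with factor 2 that can be loaded as one wide
  //  load plus shuffles.)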
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
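    // (These are emitted as missed remarks, which is what e.g. clang's
    // -Rpass-missed=loop-vectorize reports to users.)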
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
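        // (For example, a main loop vectorized at VF=8 may be followed by an
        // epilogue loop vectorized at VF=4 instead of a purely scalar
        // remainder.)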
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
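  // (Simplified form means each loop has a preheader, a single backedge, and
  // dedicated exit blocks; see LoopSimplify.)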
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}