//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
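//
// As a concrete illustration of the 'wide' iteration described above (a
// conceptual sketch only, not code produced or consumed by this pass): for a
// vector width of 4, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten as
//
//   for (int i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];   // one 'wide' SIMD iteration
//
// with any remaining iterations handled by a scalar (or vectorized) epilogue
// loop.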
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

cl::opt<bool> EnableStrictReductions(
    "enable-strict-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
1356 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1357 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1358 !isProfitableToScalarize(I, VF) &&
1359 !isScalarAfterVectorization(I, VF);
1360 }
1361
1362 /// Decision that was taken during cost calculation for memory instruction.
1363 enum InstWidening {
1364 CM_Unknown,
1365 CM_Widen, // For consecutive accesses with stride +1.
1366 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1367 CM_Interleave,
1368 CM_GatherScatter,
1369 CM_Scalarize
1370 };
1371
1372 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1373 /// instruction \p I and vector width \p VF.
1374 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1375 InstructionCost Cost) {
1376 assert(VF.isVector() && "Expected VF >=2");
1377 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1378 }
1379
1380 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1381 /// interleaving group \p Grp and vector width \p VF.
1382 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1383 ElementCount VF, InstWidening W,
1384 InstructionCost Cost) {
1385 assert(VF.isVector() && "Expected VF >=2");
1386 /// Broadcast this decision to all instructions inside the group.
1387 /// But the cost will be assigned to one instruction only.
1388 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1389 if (auto *I = Grp->getMember(i)) {
1390 if (Grp->getInsertPos() == I)
1391 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1392 else
1393 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1394 }
1395 }
1396 }
1397
1398 /// Return the cost model decision for the given instruction \p I and vector
1399 /// width \p VF. Return CM_Unknown if this instruction did not pass
1400 /// through the cost modeling.
1401 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1402 assert(VF.isVector() && "Expected VF to be a vector VF");
1403 // Cost model is not run in the VPlan-native path - return conservative
1404 // result until this changes.
1405 if (EnableVPlanNativePath)
1406 return CM_GatherScatter;
1407
1408 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1409 auto Itr = WideningDecisions.find(InstOnVF);
1410 if (Itr == WideningDecisions.end())
1411 return CM_Unknown;
1412 return Itr->second.first;
1413 }
1414
1415 /// Return the vectorization cost for the given instruction \p I and vector
1416 /// width \p VF.
1417 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1418 assert(VF.isVector() && "Expected VF >=2");
1419 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1420 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1421 "The cost is not calculated");
1422 return WideningDecisions[InstOnVF].second;
1423 }
1424
1425 /// Return True if instruction \p I is an optimizable truncate whose operand
1426 /// is an induction variable. Such a truncate will be removed by adding a new
1427 /// induction variable with the destination type.
1428 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1429 // If the instruction is not a truncate, return false.
1430 auto *Trunc = dyn_cast<TruncInst>(I);
1431 if (!Trunc)
1432 return false;
1433
1434 // Get the source and destination types of the truncate.
1435 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1436 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1437 1438 // If the truncate is free for the given types, return false. Replacing a 1439 // free truncate with an induction variable would add an induction variable 1440 // update instruction to each iteration of the loop. We exclude from this 1441 // check the primary induction variable since it will need an update 1442 // instruction regardless. 1443 Value *Op = Trunc->getOperand(0); 1444 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1445 return false; 1446 1447 // If the truncated value is not an induction variable, return false. 1448 return Legal->isInductionPhi(Op); 1449 } 1450 1451 /// Collects the instructions to scalarize for each predicated instruction in 1452 /// the loop. 1453 void collectInstsToScalarize(ElementCount VF); 1454 1455 /// Collect Uniform and Scalar values for the given \p VF. 1456 /// The sets depend on CM decision for Load/Store instructions 1457 /// that may be vectorized as interleave, gather-scatter or scalarized. 1458 void collectUniformsAndScalars(ElementCount VF) { 1459 // Do the analysis once. 1460 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1461 return; 1462 setCostBasedWideningDecision(VF); 1463 collectLoopUniforms(VF); 1464 collectLoopScalars(VF); 1465 } 1466 1467 /// Returns true if the target machine supports masked store operation 1468 /// for the given \p DataType and kind of access to \p Ptr. 1469 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1470 return Legal->isConsecutivePtr(Ptr) && 1471 TTI.isLegalMaskedStore(DataType, Alignment); 1472 } 1473 1474 /// Returns true if the target machine supports masked load operation 1475 /// for the given \p DataType and kind of access to \p Ptr. 1476 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1477 return Legal->isConsecutivePtr(Ptr) && 1478 TTI.isLegalMaskedLoad(DataType, Alignment); 1479 } 1480 1481 /// Returns true if the target machine supports masked scatter operation 1482 /// for the given \p DataType. 1483 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { 1484 return TTI.isLegalMaskedScatter(DataType, Alignment); 1485 } 1486 1487 /// Returns true if the target machine supports masked gather operation 1488 /// for the given \p DataType. 1489 bool isLegalMaskedGather(Type *DataType, Align Alignment) const { 1490 return TTI.isLegalMaskedGather(DataType, Alignment); 1491 } 1492 1493 /// Returns true if the target machine can represent \p V as a masked gather 1494 /// or scatter operation. 1495 bool isLegalGatherOrScatter(Value *V) { 1496 bool LI = isa<LoadInst>(V); 1497 bool SI = isa<StoreInst>(V); 1498 if (!LI && !SI) 1499 return false; 1500 auto *Ty = getMemInstValueType(V); 1501 Align Align = getLoadStoreAlignment(V); 1502 return (LI && isLegalMaskedGather(Ty, Align)) || 1503 (SI && isLegalMaskedScatter(Ty, Align)); 1504 } 1505 1506 /// Returns true if the target machine supports all of the reduction 1507 /// variables found for the given VF. 1508 bool canVectorizeReductions(ElementCount VF) { 1509 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1510 RecurrenceDescriptor RdxDesc = Reduction.second; 1511 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1512 })); 1513 } 1514 1515 /// Returns true if \p I is an instruction that will be scalarized with 1516 /// predication. 
Such instructions include conditional stores and
1517 /// instructions that may divide by zero.
1518 /// If a non-zero VF has been calculated, we check if I will be scalarized
1519 /// with predication for that VF.
1520 bool isScalarWithPredication(Instruction *I) const;
1521
1522 // Returns true if \p I is an instruction that will be predicated either
1523 // through scalar predication or masked load/store or masked gather/scatter.
1524 // Superset of instructions that return true for isScalarWithPredication.
1525 bool isPredicatedInst(Instruction *I) {
1526 if (!blockNeedsPredication(I->getParent()))
1527 return false;
1528 // Loads and stores that need some form of masked operation are predicated
1529 // instructions.
1530 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1531 return Legal->isMaskRequired(I);
1532 return isScalarWithPredication(I);
1533 }
1534
1535 /// Returns true if \p I is a memory instruction with consecutive memory
1536 /// access that can be widened.
1537 bool
1538 memoryInstructionCanBeWidened(Instruction *I,
1539 ElementCount VF = ElementCount::getFixed(1));
1540
1541 /// Returns true if \p I is a memory instruction in an interleaved-group
1542 /// of memory accesses that can be vectorized with wide vector loads/stores
1543 /// and shuffles.
1544 bool
1545 interleavedAccessCanBeWidened(Instruction *I,
1546 ElementCount VF = ElementCount::getFixed(1));
1547
1548 /// Check if \p Instr belongs to any interleaved access group.
1549 bool isAccessInterleaved(Instruction *Instr) {
1550 return InterleaveInfo.isInterleaved(Instr);
1551 }
1552
1553 /// Get the interleaved access group that \p Instr belongs to.
1554 const InterleaveGroup<Instruction> *
1555 getInterleavedAccessGroup(Instruction *Instr) {
1556 return InterleaveInfo.getInterleaveGroup(Instr);
1557 }
1558
1559 /// Returns true if we're required to use a scalar epilogue for at least
1560 /// the final iteration of the original loop.
1561 bool requiresScalarEpilogue() const {
1562 if (!isScalarEpilogueAllowed())
1563 return false;
1564 // If we might exit from anywhere but the latch, we must run the exiting
1565 // iteration in scalar form.
1566 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1567 return true;
1568 return InterleaveInfo.requiresScalarEpilogue();
1569 }
1570
1571 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1572 /// disallowed due to optsize or a loop hint annotation.
1573 bool isScalarEpilogueAllowed() const {
1574 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1575 }
1576
1577 /// Returns true if all loop blocks should be masked to fold the loop tail.
1578 bool foldTailByMasking() const { return FoldTailByMasking; }
1579
1580 bool blockNeedsPredication(BasicBlock *BB) const {
1581 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1582 }
1583
1584 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1585 /// nodes to the chain of instructions representing the reductions. Uses a
1586 /// MapVector to ensure deterministic iteration order.
1587 using ReductionChainMap =
1588 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1589
1590 /// Return the chain of instructions representing an inloop reduction.
1591 const ReductionChainMap &getInLoopReductionChains() const {
1592 return InLoopReductionChains;
1593 }
1594
1595 /// Returns true if the Phi is part of an inloop reduction.
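/// Illustrative sketch: for an in-loop integer sum such as s += a[i] + b[i],
/// the chain recorded for the reduction phi holds, in program order, the phi
/// itself followed by the add(s) that produce the value fed back into it.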
1596 bool isInLoopReduction(PHINode *Phi) const {
1597 return InLoopReductionChains.count(Phi);
1598 }
1599
1600 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1601 /// with factor VF. Return the cost of the instruction, including
1602 /// scalarization overhead if it's needed.
1603 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1604
1605 /// Estimate cost of a call instruction CI if it were vectorized with factor
1606 /// VF. Return the cost of the instruction, including scalarization overhead
1607 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1608 /// scalarized,
1609 /// i.e. either a vector version isn't available or it is too expensive.
1610 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1611 bool &NeedToScalarize) const;
1612
1613 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1614 /// that of B.
1615 bool isMoreProfitable(const VectorizationFactor &A,
1616 const VectorizationFactor &B) const;
1617
1618 /// Invalidates decisions already taken by the cost model.
1619 void invalidateCostModelingDecisions() {
1620 WideningDecisions.clear();
1621 Uniforms.clear();
1622 Scalars.clear();
1623 }
1624
1625 private:
1626 unsigned NumPredStores = 0;
1627
1628 /// \return An upper bound for the vectorization factor, a power-of-2 larger
1629 /// than zero. One is returned if vectorization should best be avoided due
1630 /// to cost.
1631 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1632 ElementCount UserVF);
1633
1634 /// \return the maximized element count based on the target's vector
1635 /// registers and the loop trip-count, but limited to a maximum safe VF.
1636 /// This is a helper function of computeFeasibleMaxVF.
1637 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1638 /// issue that occurred on one of the buildbots which cannot be reproduced
1639 /// without having access to the proprietary compiler (see comments on
1640 /// D98509). The issue is currently under investigation and this workaround
1641 /// will be removed as soon as possible.
1642 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1643 unsigned SmallestType,
1644 unsigned WidestType,
1645 const ElementCount &MaxSafeVF);
1646
1647 /// \return the maximum legal scalable VF, based on the safe max number
1648 /// of elements.
1649 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1650
1651 /// The vectorization cost is a combination of the cost itself and a boolean
1652 /// indicating whether any of the contributing operations will actually
1653 /// operate on
1654 /// vector values after type legalization in the backend. If this latter value
1655 /// is
1656 /// false, then all operations will be scalarized (i.e. no vectorization has
1657 /// actually taken place).
1658 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1659
1660 /// Returns the expected execution cost. The unit of the cost does
1661 /// not matter because we use the 'cost' units to compare different
1662 /// vector widths. The cost that is returned is *not* normalized by
1663 /// the factor width.
1664 VectorizationCostTy expectedCost(ElementCount VF);
1665
1666 /// Returns the execution time cost of an instruction for a given vector
1667 /// width. Vector width of one means scalar.
1668 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1669
1670 /// The cost-computation logic from getInstructionCost which provides
1671 /// the vector type as an output parameter.
1672 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1673 Type *&VectorTy);
1674
1675 /// Return the cost of instructions in an inloop reduction pattern, if I is
1676 /// part of that pattern.
1677 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1678 Type *VectorTy,
1679 TTI::TargetCostKind CostKind);
1680
1681 /// Calculate vectorization cost of memory instruction \p I.
1682 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1683
1684 /// The cost computation for a scalarized memory instruction.
1685 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1686
1687 /// The cost computation for an interleaving group of memory instructions.
1688 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1689
1690 /// The cost computation for a Gather/Scatter instruction.
1691 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1692
1693 /// The cost computation for widening instruction \p I with consecutive
1694 /// memory access.
1695 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1696
1697 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1698 /// Load: scalar load + broadcast.
1699 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1700 /// element)
1701 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1702
1703 /// Estimate the overhead of scalarizing an instruction. This is a
1704 /// convenience wrapper for the type-based getScalarizationOverhead API.
1705 InstructionCost getScalarizationOverhead(Instruction *I,
1706 ElementCount VF) const;
1707
1708 /// Returns whether the instruction is a load or store and will be emitted
1709 /// as a vector operation.
1710 bool isConsecutiveLoadOrStore(Instruction *I);
1711
1712 /// Returns true if an artificially high cost for emulated masked memrefs
1713 /// should be used.
1714 bool useEmulatedMaskMemRefHack(Instruction *I);
1715
1716 /// Map of scalar integer values to the smallest bitwidth they can be legally
1717 /// represented as. The vector equivalents of these values should be truncated
1718 /// to this type.
1719 MapVector<Instruction *, uint64_t> MinBWs;
1720
1721 /// A type representing the costs for instructions if they were to be
1722 /// scalarized rather than vectorized. The entries are Instruction-Cost
1723 /// pairs.
1724 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1725
1726 /// A set containing all BasicBlocks that are known to be present after
1727 /// vectorization as predicated blocks.
1728 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1729
1730 /// Records whether it is allowed to have the original scalar loop execute at
1731 /// least once. This may be needed as a fallback loop in case runtime
1732 /// aliasing/dependence checks fail, or to handle the tail/remainder
1733 /// iterations when the trip count is unknown or doesn't divide by the VF,
1734 /// or as a peel-loop to handle gaps in interleave-groups.
1735 /// Under optsize and when the trip count is very small we don't allow any
1736 /// iterations to execute in the scalar loop.
1737 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1738 1739 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1740 bool FoldTailByMasking = false; 1741 1742 /// A map holding scalar costs for different vectorization factors. The 1743 /// presence of a cost for an instruction in the mapping indicates that the 1744 /// instruction will be scalarized when vectorizing with the associated 1745 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1746 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1747 1748 /// Holds the instructions known to be uniform after vectorization. 1749 /// The data is collected per VF. 1750 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1751 1752 /// Holds the instructions known to be scalar after vectorization. 1753 /// The data is collected per VF. 1754 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1755 1756 /// Holds the instructions (address computations) that are forced to be 1757 /// scalarized. 1758 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1759 1760 /// PHINodes of the reductions that should be expanded in-loop along with 1761 /// their associated chains of reduction operations, in program order from top 1762 /// (PHI) to bottom 1763 ReductionChainMap InLoopReductionChains; 1764 1765 /// A Map of inloop reduction operations and their immediate chain operand. 1766 /// FIXME: This can be removed once reductions can be costed correctly in 1767 /// vplan. This was added to allow quick lookup to the inloop operations, 1768 /// without having to loop through InLoopReductionChains. 1769 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1770 1771 /// Returns the expected difference in cost from scalarizing the expression 1772 /// feeding a predicated instruction \p PredInst. The instructions to 1773 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1774 /// non-negative return value implies the expression will be scalarized. 1775 /// Currently, only single-use chains are considered for scalarization. 1776 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1777 ElementCount VF); 1778 1779 /// Collect the instructions that are uniform after vectorization. An 1780 /// instruction is uniform if we represent it with a single scalar value in 1781 /// the vectorized loop corresponding to each vector iteration. Examples of 1782 /// uniform instructions include pointer operands of consecutive or 1783 /// interleaved memory accesses. Note that although uniformity implies an 1784 /// instruction will be scalar, the reverse is not true. In general, a 1785 /// scalarized instruction will be represented by VF scalar values in the 1786 /// vectorized loop, each corresponding to an iteration of the original 1787 /// scalar loop. 1788 void collectLoopUniforms(ElementCount VF); 1789 1790 /// Collect the instructions that are scalar after vectorization. An 1791 /// instruction is scalar if it is known to be uniform or will be scalarized 1792 /// during vectorization. Non-uniform scalarized instructions will be 1793 /// represented by VF values in the vectorized loop, each corresponding to an 1794 /// iteration of the original scalar loop. 1795 void collectLoopScalars(ElementCount VF); 1796 1797 /// Keeps cost model vectorization decision and cost for instructions. 1798 /// Right now it is used for memory instructions only. 
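/// Conceptually, each entry maps an (Instruction, VF) pair to the chosen
/// InstWidening kind together with its estimated cost, e.g. (illustrative)
/// a consecutive load at VF 4 mapping to (CM_Widen, 1).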
1799 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1800 std::pair<InstWidening, InstructionCost>>; 1801 1802 DecisionList WideningDecisions; 1803 1804 /// Returns true if \p V is expected to be vectorized and it needs to be 1805 /// extracted. 1806 bool needsExtract(Value *V, ElementCount VF) const { 1807 Instruction *I = dyn_cast<Instruction>(V); 1808 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1809 TheLoop->isLoopInvariant(I)) 1810 return false; 1811 1812 // Assume we can vectorize V (and hence we need extraction) if the 1813 // scalars are not computed yet. This can happen, because it is called 1814 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1815 // the scalars are collected. That should be a safe assumption in most 1816 // cases, because we check if the operands have vectorizable types 1817 // beforehand in LoopVectorizationLegality. 1818 return Scalars.find(VF) == Scalars.end() || 1819 !isScalarAfterVectorization(I, VF); 1820 }; 1821 1822 /// Returns a range containing only operands needing to be extracted. 1823 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1824 ElementCount VF) const { 1825 return SmallVector<Value *, 4>(make_filter_range( 1826 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1827 } 1828 1829 /// Determines if we have the infrastructure to vectorize loop \p L and its 1830 /// epilogue, assuming the main loop is vectorized by \p VF. 1831 bool isCandidateForEpilogueVectorization(const Loop &L, 1832 const ElementCount VF) const; 1833 1834 /// Returns true if epilogue vectorization is considered profitable, and 1835 /// false otherwise. 1836 /// \p VF is the vectorization factor chosen for the original loop. 1837 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1838 1839 public: 1840 /// The loop that we evaluate. 1841 Loop *TheLoop; 1842 1843 /// Predicated scalar evolution analysis. 1844 PredicatedScalarEvolution &PSE; 1845 1846 /// Loop Info analysis. 1847 LoopInfo *LI; 1848 1849 /// Vectorization legality. 1850 LoopVectorizationLegality *Legal; 1851 1852 /// Vector target information. 1853 const TargetTransformInfo &TTI; 1854 1855 /// Target Library Info. 1856 const TargetLibraryInfo *TLI; 1857 1858 /// Demanded bits analysis. 1859 DemandedBits *DB; 1860 1861 /// Assumption cache. 1862 AssumptionCache *AC; 1863 1864 /// Interface to emit optimization remarks. 1865 OptimizationRemarkEmitter *ORE; 1866 1867 const Function *TheFunction; 1868 1869 /// Loop Vectorize Hint. 1870 const LoopVectorizeHints *Hints; 1871 1872 /// The interleave access information contains groups of interleaved accesses 1873 /// with the same stride and close to each other. 1874 InterleavedAccessInfo &InterleaveInfo; 1875 1876 /// Values to ignore in the cost model. 1877 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1878 1879 /// Values to ignore in the cost model when VF > 1. 1880 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1881 1882 /// Profitable vector factors. 1883 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1884 }; 1885 } // end namespace llvm 1886 1887 /// Helper struct to manage generating runtime checks for vectorization. 1888 /// 1889 /// The runtime checks are created up-front in temporary blocks to allow better 1890 /// estimating the cost and un-linked from the existing IR. After deciding to 1891 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1892 /// temporary blocks are completely removed. 
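///
/// Rough sketch of the intended flow, based on the members below: Create()
/// builds the check blocks up front so that their cost can be estimated; if
/// the loop is vectorized, emitSCEVChecks() and emitMemRuntimeChecks() link
/// the blocks back into the CFG in front of the vector preheader; otherwise
/// the destructor erases the unused blocks and the expanded instructions.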
1893 class GeneratedRTChecks {
1894 /// Basic block which contains the generated SCEV checks, if any.
1895 BasicBlock *SCEVCheckBlock = nullptr;
1896
1897 /// The value representing the result of the generated SCEV checks. If it is
1898 /// nullptr, either no SCEV checks have been generated or they have been used.
1899 Value *SCEVCheckCond = nullptr;
1900
1901 /// Basic block which contains the generated memory runtime checks, if any.
1902 BasicBlock *MemCheckBlock = nullptr;
1903
1904 /// The value representing the result of the generated memory runtime checks.
1905 /// If it is nullptr, either no memory runtime checks have been generated or
1906 /// they have been used.
1907 Instruction *MemRuntimeCheckCond = nullptr;
1908
1909 DominatorTree *DT;
1910 LoopInfo *LI;
1911
1912 SCEVExpander SCEVExp;
1913 SCEVExpander MemCheckExp;
1914
1915 public:
1916 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1917 const DataLayout &DL)
1918 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1919 MemCheckExp(SE, DL, "scev.check") {}
1920
1921 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1922 /// accurately estimate the cost of the runtime checks. The blocks are
1923 /// un-linked from the IR and are added back during vector code generation. If
1924 /// there is no vector code generation, the check blocks are removed
1925 /// completely.
1926 void Create(Loop *L, const LoopAccessInfo &LAI,
1927 const SCEVUnionPredicate &UnionPred) {
1928
1929 BasicBlock *LoopHeader = L->getHeader();
1930 BasicBlock *Preheader = L->getLoopPreheader();
1931
1932 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1933 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1934 // may be used by SCEVExpander. The blocks will be un-linked from their
1935 // predecessors and removed from LI & DT at the end of the function.
1936 if (!UnionPred.isAlwaysTrue()) {
1937 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1938 nullptr, "vector.scevcheck");
1939
1940 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1941 &UnionPred, SCEVCheckBlock->getTerminator());
1942 }
1943
1944 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1945 if (RtPtrChecking.Need) {
1946 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1947 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1948 "vector.memcheck");
1949
1950 std::tie(std::ignore, MemRuntimeCheckCond) =
1951 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1952 RtPtrChecking.getChecks(), MemCheckExp);
1953 assert(MemRuntimeCheckCond &&
1954 "no RT checks generated although RtPtrChecking "
1955 "claimed checks are required");
1956 }
1957
1958 if (!MemCheckBlock && !SCEVCheckBlock)
1959 return;
1960
1961 // Unhook the temporary blocks with the checks, update various places
1962 // accordingly.
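// Concretely: uses of the check blocks are redirected to the preheader, the
// check blocks' terminators are moved into the preheader (the blocks receive
// unreachable placeholders instead), and the blocks are removed from the
// DominatorTree and LoopInfo until emitSCEVChecks()/emitMemRuntimeChecks()
// re-link them.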
1963 if (SCEVCheckBlock) 1964 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1965 if (MemCheckBlock) 1966 MemCheckBlock->replaceAllUsesWith(Preheader); 1967 1968 if (SCEVCheckBlock) { 1969 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1970 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1971 Preheader->getTerminator()->eraseFromParent(); 1972 } 1973 if (MemCheckBlock) { 1974 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1975 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1976 Preheader->getTerminator()->eraseFromParent(); 1977 } 1978 1979 DT->changeImmediateDominator(LoopHeader, Preheader); 1980 if (MemCheckBlock) { 1981 DT->eraseNode(MemCheckBlock); 1982 LI->removeBlock(MemCheckBlock); 1983 } 1984 if (SCEVCheckBlock) { 1985 DT->eraseNode(SCEVCheckBlock); 1986 LI->removeBlock(SCEVCheckBlock); 1987 } 1988 } 1989 1990 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1991 /// unused. 1992 ~GeneratedRTChecks() { 1993 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1994 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1995 if (!SCEVCheckCond) 1996 SCEVCleaner.markResultUsed(); 1997 1998 if (!MemRuntimeCheckCond) 1999 MemCheckCleaner.markResultUsed(); 2000 2001 if (MemRuntimeCheckCond) { 2002 auto &SE = *MemCheckExp.getSE(); 2003 // Memory runtime check generation creates compares that use expanded 2004 // values. Remove them before running the SCEVExpanderCleaners. 2005 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2006 if (MemCheckExp.isInsertedInstruction(&I)) 2007 continue; 2008 SE.forgetValue(&I); 2009 SE.eraseValueFromMap(&I); 2010 I.eraseFromParent(); 2011 } 2012 } 2013 MemCheckCleaner.cleanup(); 2014 SCEVCleaner.cleanup(); 2015 2016 if (SCEVCheckCond) 2017 SCEVCheckBlock->eraseFromParent(); 2018 if (MemRuntimeCheckCond) 2019 MemCheckBlock->eraseFromParent(); 2020 } 2021 2022 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2023 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2024 /// depending on the generated condition. 2025 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2026 BasicBlock *LoopVectorPreHeader, 2027 BasicBlock *LoopExitBlock) { 2028 if (!SCEVCheckCond) 2029 return nullptr; 2030 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2031 if (C->isZero()) 2032 return nullptr; 2033 2034 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2035 2036 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2037 // Create new preheader for vector loop. 2038 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2039 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2040 2041 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2042 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2043 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2044 SCEVCheckBlock); 2045 2046 DT->addNewBlock(SCEVCheckBlock, Pred); 2047 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2048 2049 ReplaceInstWithInst( 2050 SCEVCheckBlock->getTerminator(), 2051 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2052 // Mark the check as used, to prevent it from being removed during cleanup. 2053 SCEVCheckCond = nullptr; 2054 return SCEVCheckBlock; 2055 } 2056 2057 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2058 /// the branches to branch to the vector preheader or \p Bypass, depending on 2059 /// the generated condition. 
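/// Returns nullptr if no memory runtime checks have been generated, or if
/// they have already been consumed by an earlier call.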
2060 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2061 BasicBlock *LoopVectorPreHeader) { 2062 // Check if we generated code that checks in runtime if arrays overlap. 2063 if (!MemRuntimeCheckCond) 2064 return nullptr; 2065 2066 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2067 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2068 MemCheckBlock); 2069 2070 DT->addNewBlock(MemCheckBlock, Pred); 2071 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2072 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2073 2074 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2075 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2076 2077 ReplaceInstWithInst( 2078 MemCheckBlock->getTerminator(), 2079 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2080 MemCheckBlock->getTerminator()->setDebugLoc( 2081 Pred->getTerminator()->getDebugLoc()); 2082 2083 // Mark the check as used, to prevent it from being removed during cleanup. 2084 MemRuntimeCheckCond = nullptr; 2085 return MemCheckBlock; 2086 } 2087 }; 2088 2089 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2090 // vectorization. The loop needs to be annotated with #pragma omp simd 2091 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2092 // vector length information is not provided, vectorization is not considered 2093 // explicit. Interleave hints are not allowed either. These limitations will be 2094 // relaxed in the future. 2095 // Please, note that we are currently forced to abuse the pragma 'clang 2096 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2097 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2098 // provides *explicit vectorization hints* (LV can bypass legal checks and 2099 // assume that vectorization is legal). However, both hints are implemented 2100 // using the same metadata (llvm.loop.vectorize, processed by 2101 // LoopVectorizeHints). This will be fixed in the future when the native IR 2102 // representation for pragma 'omp simd' is introduced. 2103 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2104 OptimizationRemarkEmitter *ORE) { 2105 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2106 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2107 2108 // Only outer loops with an explicit vectorization hint are supported. 2109 // Unannotated outer loops are ignored. 2110 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2111 return false; 2112 2113 Function *Fn = OuterLp->getHeader()->getParent(); 2114 if (!Hints.allowVectorization(Fn, OuterLp, 2115 true /*VectorizeOnlyWhenForced*/)) { 2116 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2117 return false; 2118 } 2119 2120 if (Hints.getInterleave() > 1) { 2121 // TODO: Interleave support is future work. 2122 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2123 "outer loops.\n"); 2124 Hints.emitRemarkWithHints(); 2125 return false; 2126 } 2127 2128 return true; 2129 } 2130 2131 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2132 OptimizationRemarkEmitter *ORE, 2133 SmallVectorImpl<Loop *> &V) { 2134 // Collect inner loops and outer loops without irreducible control flow. For 2135 // now, only collect outer loops that have explicit vectorization hints. If we 2136 // are stress testing the VPlan H-CFG construction, we collect the outermost 2137 // loop of every loop nest. 
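// Illustrative example of an outer loop that would be collected when the
// VPlan-native path is enabled:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)      // annotated outer loop
//     for (int j = 0; j < M; ++j)
//       A[i][j] = B[i][j] + C[i][j]; // A, B, C are hypothetical arrays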
2138 if (L.isInnermost() || VPlanBuildStressTest || 2139 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2140 LoopBlocksRPO RPOT(&L); 2141 RPOT.perform(LI); 2142 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2143 V.push_back(&L); 2144 // TODO: Collect inner loops inside marked outer loops in case 2145 // vectorization fails for the outer loop. Do not invoke 2146 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2147 // already known to be reducible. We can use an inherited attribute for 2148 // that. 2149 return; 2150 } 2151 } 2152 for (Loop *InnerL : L) 2153 collectSupportedLoops(*InnerL, LI, ORE, V); 2154 } 2155 2156 namespace { 2157 2158 /// The LoopVectorize Pass. 2159 struct LoopVectorize : public FunctionPass { 2160 /// Pass identification, replacement for typeid 2161 static char ID; 2162 2163 LoopVectorizePass Impl; 2164 2165 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2166 bool VectorizeOnlyWhenForced = false) 2167 : FunctionPass(ID), 2168 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2169 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2170 } 2171 2172 bool runOnFunction(Function &F) override { 2173 if (skipFunction(F)) 2174 return false; 2175 2176 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2177 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2178 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2179 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2180 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2181 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2182 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2183 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2184 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2185 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2186 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2187 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2188 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2189 2190 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2191 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2192 2193 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2194 GetLAA, *ORE, PSI).MadeAnyChange; 2195 } 2196 2197 void getAnalysisUsage(AnalysisUsage &AU) const override { 2198 AU.addRequired<AssumptionCacheTracker>(); 2199 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2200 AU.addRequired<DominatorTreeWrapperPass>(); 2201 AU.addRequired<LoopInfoWrapperPass>(); 2202 AU.addRequired<ScalarEvolutionWrapperPass>(); 2203 AU.addRequired<TargetTransformInfoWrapperPass>(); 2204 AU.addRequired<AAResultsWrapperPass>(); 2205 AU.addRequired<LoopAccessLegacyAnalysis>(); 2206 AU.addRequired<DemandedBitsWrapperPass>(); 2207 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2208 AU.addRequired<InjectTLIMappingsLegacy>(); 2209 2210 // We currently do not preserve loopinfo/dominator analyses with outer loop 2211 // vectorization. Until this is addressed, mark these analyses as preserved 2212 // only for non-VPlan-native path. 2213 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2214 if (!EnableVPlanNativePath) { 2215 AU.addPreserved<LoopInfoWrapperPass>(); 2216 AU.addPreserved<DominatorTreeWrapperPass>(); 2217 } 2218 2219 AU.addPreserved<BasicAAWrapperPass>(); 2220 AU.addPreserved<GlobalsAAWrapperPass>(); 2221 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2222 } 2223 }; 2224 2225 } // end anonymous namespace 2226 2227 //===----------------------------------------------------------------------===// 2228 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2229 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2230 //===----------------------------------------------------------------------===// 2231 2232 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2233 // We need to place the broadcast of invariant variables outside the loop, 2234 // but only if it's proven safe to do so. Else, broadcast will be inside 2235 // vector loop body. 2236 Instruction *Instr = dyn_cast<Instruction>(V); 2237 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2238 (!Instr || 2239 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2240 // Place the code for broadcasting invariant variables in the new preheader. 2241 IRBuilder<>::InsertPointGuard Guard(Builder); 2242 if (SafeToHoist) 2243 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2244 2245 // Broadcast the scalar into all locations in the vector. 2246 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2247 2248 return Shuf; 2249 } 2250 2251 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2252 const InductionDescriptor &II, Value *Step, Value *Start, 2253 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2254 VPTransformState &State) { 2255 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2256 "Expected either an induction phi-node or a truncate of it!"); 2257 2258 // Construct the initial value of the vector IV in the vector loop preheader 2259 auto CurrIP = Builder.saveIP(); 2260 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2261 if (isa<TruncInst>(EntryVal)) { 2262 assert(Start->getType()->isIntegerTy() && 2263 "Truncation requires an integer type"); 2264 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2265 Step = Builder.CreateTrunc(Step, TruncType); 2266 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2267 } 2268 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2269 Value *SteppedStart = 2270 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2271 2272 // We create vector phi nodes for both integer and floating-point induction 2273 // variables. Here, we determine the kind of arithmetic we will perform. 2274 Instruction::BinaryOps AddOp; 2275 Instruction::BinaryOps MulOp; 2276 if (Step->getType()->isIntegerTy()) { 2277 AddOp = Instruction::Add; 2278 MulOp = Instruction::Mul; 2279 } else { 2280 AddOp = II.getInductionOpcode(); 2281 MulOp = Instruction::FMul; 2282 } 2283 2284 // Multiply the vectorization factor by the step using integer or 2285 // floating-point arithmetic as appropriate. 2286 Type *StepType = Step->getType(); 2287 if (Step->getType()->isFloatingPointTy()) 2288 StepType = IntegerType::get(StepType->getContext(), 2289 StepType->getScalarSizeInBits()); 2290 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2291 if (Step->getType()->isFloatingPointTy()) 2292 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2293 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2294 2295 // Create a vector splat to use in the induction update. 
2296 // 2297 // FIXME: If the step is non-constant, we create the vector splat with 2298 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2299 // handle a constant vector splat. 2300 Value *SplatVF = isa<Constant>(Mul) 2301 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2302 : Builder.CreateVectorSplat(VF, Mul); 2303 Builder.restoreIP(CurrIP); 2304 2305 // We may need to add the step a number of times, depending on the unroll 2306 // factor. The last of those goes into the PHI. 2307 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2308 &*LoopVectorBody->getFirstInsertionPt()); 2309 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2310 Instruction *LastInduction = VecInd; 2311 for (unsigned Part = 0; Part < UF; ++Part) { 2312 State.set(Def, LastInduction, Part); 2313 2314 if (isa<TruncInst>(EntryVal)) 2315 addMetadata(LastInduction, EntryVal); 2316 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2317 State, Part); 2318 2319 LastInduction = cast<Instruction>( 2320 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2321 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2322 } 2323 2324 // Move the last step to the end of the latch block. This ensures consistent 2325 // placement of all induction updates. 2326 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2327 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2328 auto *ICmp = cast<Instruction>(Br->getCondition()); 2329 LastInduction->moveBefore(ICmp); 2330 LastInduction->setName("vec.ind.next"); 2331 2332 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2333 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2334 } 2335 2336 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2337 return Cost->isScalarAfterVectorization(I, VF) || 2338 Cost->isProfitableToScalarize(I, VF); 2339 } 2340 2341 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2342 if (shouldScalarizeInstruction(IV)) 2343 return true; 2344 auto isScalarInst = [&](User *U) -> bool { 2345 auto *I = cast<Instruction>(U); 2346 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2347 }; 2348 return llvm::any_of(IV->users(), isScalarInst); 2349 } 2350 2351 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2352 const InductionDescriptor &ID, const Instruction *EntryVal, 2353 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2354 unsigned Part, unsigned Lane) { 2355 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2356 "Expected either an induction phi-node or a truncate of it!"); 2357 2358 // This induction variable is not the phi from the original loop but the 2359 // newly-created IV based on the proof that casted Phi is equal to the 2360 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2361 // re-uses the same InductionDescriptor that original IV uses but we don't 2362 // have to do any recording in this case - that is done when original IV is 2363 // processed. 2364 if (isa<TruncInst>(EntryVal)) 2365 return; 2366 2367 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2368 if (Casts.empty()) 2369 return; 2370 // Only the first Cast instruction in the Casts vector is of interest. 2371 // The rest of the Casts (if exist) have no uses outside the 2372 // induction update chain itself. 
2373 if (Lane < UINT_MAX) 2374 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2375 else 2376 State.set(CastDef, VectorLoopVal, Part); 2377 } 2378 2379 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2380 TruncInst *Trunc, VPValue *Def, 2381 VPValue *CastDef, 2382 VPTransformState &State) { 2383 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2384 "Primary induction variable must have an integer type"); 2385 2386 auto II = Legal->getInductionVars().find(IV); 2387 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2388 2389 auto ID = II->second; 2390 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2391 2392 // The value from the original loop to which we are mapping the new induction 2393 // variable. 2394 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2395 2396 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2397 2398 // Generate code for the induction step. Note that induction steps are 2399 // required to be loop-invariant 2400 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2401 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2402 "Induction step should be loop invariant"); 2403 if (PSE.getSE()->isSCEVable(IV->getType())) { 2404 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2405 return Exp.expandCodeFor(Step, Step->getType(), 2406 LoopVectorPreHeader->getTerminator()); 2407 } 2408 return cast<SCEVUnknown>(Step)->getValue(); 2409 }; 2410 2411 // The scalar value to broadcast. This is derived from the canonical 2412 // induction variable. If a truncation type is given, truncate the canonical 2413 // induction variable and step. Otherwise, derive these values from the 2414 // induction descriptor. 2415 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2416 Value *ScalarIV = Induction; 2417 if (IV != OldInduction) { 2418 ScalarIV = IV->getType()->isIntegerTy() 2419 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2420 : Builder.CreateCast(Instruction::SIToFP, Induction, 2421 IV->getType()); 2422 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2423 ScalarIV->setName("offset.idx"); 2424 } 2425 if (Trunc) { 2426 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2427 assert(Step->getType()->isIntegerTy() && 2428 "Truncation requires an integer step"); 2429 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2430 Step = Builder.CreateTrunc(Step, TruncType); 2431 } 2432 return ScalarIV; 2433 }; 2434 2435 // Create the vector values from the scalar IV, in the absence of creating a 2436 // vector IV. 2437 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2438 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2439 for (unsigned Part = 0; Part < UF; ++Part) { 2440 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2441 Value *EntryPart = 2442 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2443 ID.getInductionOpcode()); 2444 State.set(Def, EntryPart, Part); 2445 if (Trunc) 2446 addMetadata(EntryPart, Trunc); 2447 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2448 State, Part); 2449 } 2450 }; 2451 2452 // Fast-math-flags propagate from the original induction instruction. 
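// The guard below temporarily installs the induction binop's fast-math flags
// on the builder, so any FP arithmetic created for the induction (the step
// multiplies and adds) inherits them; the previous flags are restored when
// the guard goes out of scope.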
2453 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2454 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2455 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2456 2457 // Now do the actual transformations, and start with creating the step value. 2458 Value *Step = CreateStepValue(ID.getStep()); 2459 if (VF.isZero() || VF.isScalar()) { 2460 Value *ScalarIV = CreateScalarIV(Step); 2461 CreateSplatIV(ScalarIV, Step); 2462 return; 2463 } 2464 2465 // Determine if we want a scalar version of the induction variable. This is 2466 // true if the induction variable itself is not widened, or if it has at 2467 // least one user in the loop that is not widened. 2468 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2469 if (!NeedsScalarIV) { 2470 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2471 State); 2472 return; 2473 } 2474 2475 // Try to create a new independent vector induction variable. If we can't 2476 // create the phi node, we will splat the scalar induction variable in each 2477 // loop iteration. 2478 if (!shouldScalarizeInstruction(EntryVal)) { 2479 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2480 State); 2481 Value *ScalarIV = CreateScalarIV(Step); 2482 // Create scalar steps that can be used by instructions we will later 2483 // scalarize. Note that the addition of the scalar steps will not increase 2484 // the number of instructions in the loop in the common case prior to 2485 // InstCombine. We will be trading one vector extract for each scalar step. 2486 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2487 return; 2488 } 2489 2490 // All IV users are scalar instructions, so only emit a scalar IV, not a 2491 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2492 // predicate used by the masked loads/stores. 2493 Value *ScalarIV = CreateScalarIV(Step); 2494 if (!Cost->isScalarEpilogueAllowed()) 2495 CreateSplatIV(ScalarIV, Step); 2496 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2497 } 2498 2499 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2500 Instruction::BinaryOps BinOp) { 2501 // Create and check the types. 2502 auto *ValVTy = cast<VectorType>(Val->getType()); 2503 ElementCount VLen = ValVTy->getElementCount(); 2504 2505 Type *STy = Val->getType()->getScalarType(); 2506 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2507 "Induction Step must be an integer or FP"); 2508 assert(Step->getType() == STy && "Step has wrong type"); 2509 2510 SmallVector<Constant *, 8> Indices; 2511 2512 // Create a vector of consecutive numbers from zero to VF. 2513 VectorType *InitVecValVTy = ValVTy; 2514 Type *InitVecValSTy = STy; 2515 if (STy->isFloatingPointTy()) { 2516 InitVecValSTy = 2517 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2518 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2519 } 2520 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2521 2522 // Add on StartIdx 2523 Value *StartIdxSplat = Builder.CreateVectorSplat( 2524 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2525 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2526 2527 if (STy->isIntegerTy()) { 2528 Step = Builder.CreateVectorSplat(VLen, Step); 2529 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2530 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2531 // which can be found from the original scalar operations. 
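// Illustrative result for the integer case with VF = 4 and StartIdx = 0:
//   induction = Val + <0, 1, 2, 3> * Step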
2532 Step = Builder.CreateMul(InitVec, Step); 2533 return Builder.CreateAdd(Val, Step, "induction"); 2534 } 2535 2536 // Floating point induction. 2537 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2538 "Binary Opcode should be specified for FP induction"); 2539 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2540 Step = Builder.CreateVectorSplat(VLen, Step); 2541 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2542 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2543 } 2544 2545 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2546 Instruction *EntryVal, 2547 const InductionDescriptor &ID, 2548 VPValue *Def, VPValue *CastDef, 2549 VPTransformState &State) { 2550 // We shouldn't have to build scalar steps if we aren't vectorizing. 2551 assert(VF.isVector() && "VF should be greater than one"); 2552 // Get the value type and ensure it and the step have the same integer type. 2553 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2554 assert(ScalarIVTy == Step->getType() && 2555 "Val and Step should have the same type"); 2556 2557 // We build scalar steps for both integer and floating-point induction 2558 // variables. Here, we determine the kind of arithmetic we will perform. 2559 Instruction::BinaryOps AddOp; 2560 Instruction::BinaryOps MulOp; 2561 if (ScalarIVTy->isIntegerTy()) { 2562 AddOp = Instruction::Add; 2563 MulOp = Instruction::Mul; 2564 } else { 2565 AddOp = ID.getInductionOpcode(); 2566 MulOp = Instruction::FMul; 2567 } 2568 2569 // Determine the number of scalars we need to generate for each unroll 2570 // iteration. If EntryVal is uniform, we only need to generate the first 2571 // lane. Otherwise, we generate all VF values. 2572 bool IsUniform = 2573 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2574 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2575 // Compute the scalar steps and save the results in State. 2576 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2577 ScalarIVTy->getScalarSizeInBits()); 2578 Type *VecIVTy = nullptr; 2579 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2580 if (!IsUniform && VF.isScalable()) { 2581 VecIVTy = VectorType::get(ScalarIVTy, VF); 2582 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2583 SplatStep = Builder.CreateVectorSplat(VF, Step); 2584 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2585 } 2586 2587 for (unsigned Part = 0; Part < UF; ++Part) { 2588 Value *StartIdx0 = 2589 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2590 2591 if (!IsUniform && VF.isScalable()) { 2592 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2593 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2594 if (ScalarIVTy->isFloatingPointTy()) 2595 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2596 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2597 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2598 State.set(Def, Add, Part); 2599 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2600 Part); 2601 // It's useful to record the lane values too for the known minimum number 2602 // of elements so we do those below. This improves the code quality when 2603 // trying to extract the first element, for example. 
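// Put differently (illustrative), lane L of unroll part P ends up holding
// ScalarIV + (P * VF + L) * Step, where P * VF is the StartIdx0 computed by
// createStepForVF above.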
2604 } 2605 2606 if (ScalarIVTy->isFloatingPointTy()) 2607 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2608 2609 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2610 Value *StartIdx = Builder.CreateBinOp( 2611 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2612 // The step returned by `createStepForVF` is a runtime-evaluated value 2613 // when VF is scalable. Otherwise, it should be folded into a Constant. 2614 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2615 "Expected StartIdx to be folded to a constant when VF is not " 2616 "scalable"); 2617 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2618 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2619 State.set(Def, Add, VPIteration(Part, Lane)); 2620 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2621 Part, Lane); 2622 } 2623 } 2624 } 2625 2626 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2627 const VPIteration &Instance, 2628 VPTransformState &State) { 2629 Value *ScalarInst = State.get(Def, Instance); 2630 Value *VectorValue = State.get(Def, Instance.Part); 2631 VectorValue = Builder.CreateInsertElement( 2632 VectorValue, ScalarInst, 2633 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2634 State.set(Def, VectorValue, Instance.Part); 2635 } 2636 2637 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2638 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2639 return Builder.CreateVectorReverse(Vec, "reverse"); 2640 } 2641 2642 // Return whether we allow using masked interleave-groups (for dealing with 2643 // strided loads/stores that reside in predicated blocks, or for dealing 2644 // with gaps). 2645 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2646 // If an override option has been passed in for interleaved accesses, use it. 2647 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2648 return EnableMaskedInterleavedMemAccesses; 2649 2650 return TTI.enableMaskedInterleavedAccessVectorization(); 2651 } 2652 2653 // Try to vectorize the interleave group that \p Instr belongs to. 2654 // 2655 // E.g. Translate following interleaved load group (factor = 3): 2656 // for (i = 0; i < N; i+=3) { 2657 // R = Pic[i]; // Member of index 0 2658 // G = Pic[i+1]; // Member of index 1 2659 // B = Pic[i+2]; // Member of index 2 2660 // ... // do something to R, G, B 2661 // } 2662 // To: 2663 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2664 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2665 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2666 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2667 // 2668 // Or translate following interleaved store group (factor = 3): 2669 // for (i = 0; i < N; i+=3) { 2670 // ... 
do something to R, G, B 2671 // Pic[i] = R; // Member of index 0 2672 // Pic[i+1] = G; // Member of index 1 2673 // Pic[i+2] = B; // Member of index 2 2674 // } 2675 // To: 2676 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2677 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2678 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2679 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2680 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2681 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2682 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2683 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2684 VPValue *BlockInMask) { 2685 Instruction *Instr = Group->getInsertPos(); 2686 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2687 2688 // Prepare for the vector type of the interleaved load/store. 2689 Type *ScalarTy = getMemInstValueType(Instr); 2690 unsigned InterleaveFactor = Group->getFactor(); 2691 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2692 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2693 2694 // Prepare for the new pointers. 2695 SmallVector<Value *, 2> AddrParts; 2696 unsigned Index = Group->getIndex(Instr); 2697 2698 // TODO: extend the masked interleaved-group support to reversed access. 2699 assert((!BlockInMask || !Group->isReverse()) && 2700 "Reversed masked interleave-group not supported."); 2701 2702 // If the group is reverse, adjust the index to refer to the last vector lane 2703 // instead of the first. We adjust the index from the first vector lane, 2704 // rather than directly getting the pointer for lane VF - 1, because the 2705 // pointer operand of the interleaved access is supposed to be uniform. For 2706 // uniform instructions, we're only required to generate a value for the 2707 // first vector lane in each unroll iteration. 2708 if (Group->isReverse()) 2709 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2710 2711 for (unsigned Part = 0; Part < UF; Part++) { 2712 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2713 setDebugLocFromInst(Builder, AddrPart); 2714 2715 // Notice current instruction could be any index. Need to adjust the address 2716 // to the member of index 0. 2717 // 2718 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2719 // b = A[i]; // Member of index 0 2720 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2721 // 2722 // E.g. A[i+1] = a; // Member of index 1 2723 // A[i] = b; // Member of index 0 2724 // A[i+2] = c; // Member of index 2 (Current instruction) 2725 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2726 2727 bool InBounds = false; 2728 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2729 InBounds = gep->isInBounds(); 2730 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2731 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2732 2733 // Cast to the vector pointer type. 
2734 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2735 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2736 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2737 } 2738 2739 setDebugLocFromInst(Builder, Instr); 2740 Value *PoisonVec = PoisonValue::get(VecTy); 2741 2742 Value *MaskForGaps = nullptr; 2743 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2744 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2745 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2746 } 2747 2748 // Vectorize the interleaved load group. 2749 if (isa<LoadInst>(Instr)) { 2750 // For each unroll part, create a wide load for the group. 2751 SmallVector<Value *, 2> NewLoads; 2752 for (unsigned Part = 0; Part < UF; Part++) { 2753 Instruction *NewLoad; 2754 if (BlockInMask || MaskForGaps) { 2755 assert(useMaskedInterleavedAccesses(*TTI) && 2756 "masked interleaved groups are not allowed."); 2757 Value *GroupMask = MaskForGaps; 2758 if (BlockInMask) { 2759 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2760 Value *ShuffledMask = Builder.CreateShuffleVector( 2761 BlockInMaskPart, 2762 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2763 "interleaved.mask"); 2764 GroupMask = MaskForGaps 2765 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2766 MaskForGaps) 2767 : ShuffledMask; 2768 } 2769 NewLoad = 2770 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2771 GroupMask, PoisonVec, "wide.masked.vec"); 2772 } 2773 else 2774 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2775 Group->getAlign(), "wide.vec"); 2776 Group->addMetadata(NewLoad); 2777 NewLoads.push_back(NewLoad); 2778 } 2779 2780 // For each member in the group, shuffle out the appropriate data from the 2781 // wide loads. 2782 unsigned J = 0; 2783 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2784 Instruction *Member = Group->getMember(I); 2785 2786 // Skip the gaps in the group. 2787 if (!Member) 2788 continue; 2789 2790 auto StrideMask = 2791 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2792 for (unsigned Part = 0; Part < UF; Part++) { 2793 Value *StridedVec = Builder.CreateShuffleVector( 2794 NewLoads[Part], StrideMask, "strided.vec"); 2795 2796 // If this member has different type, cast the result type. 2797 if (Member->getType() != ScalarTy) { 2798 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2799 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2800 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2801 } 2802 2803 if (Group->isReverse()) 2804 StridedVec = reverseVector(StridedVec); 2805 2806 State.set(VPDefs[J], StridedVec, Part); 2807 } 2808 ++J; 2809 } 2810 return; 2811 } 2812 2813 // The sub vector type for current instruction. 2814 auto *SubVT = VectorType::get(ScalarTy, VF); 2815 2816 // Vectorize the interleaved store group. 2817 for (unsigned Part = 0; Part < UF; Part++) { 2818 // Collect the stored vector from each member. 2819 SmallVector<Value *, 4> StoredVecs; 2820 for (unsigned i = 0; i < InterleaveFactor; i++) { 2821 // Interleaved store group doesn't allow a gap, so each index has a member 2822 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2823 2824 Value *StoredVec = State.get(StoredValues[i], Part); 2825 2826 if (Group->isReverse()) 2827 StoredVec = reverseVector(StoredVec); 2828 2829 // If this member has different type, cast it to a unified type. 
2830 2831 if (StoredVec->getType() != SubVT) 2832 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2833 2834 StoredVecs.push_back(StoredVec); 2835 } 2836 2837 // Concatenate all vectors into a wide vector. 2838 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2839 2840 // Interleave the elements in the wide vector. 2841 Value *IVec = Builder.CreateShuffleVector( 2842 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2843 "interleaved.vec"); 2844 2845 Instruction *NewStoreInstr; 2846 if (BlockInMask) { 2847 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2848 Value *ShuffledMask = Builder.CreateShuffleVector( 2849 BlockInMaskPart, 2850 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2851 "interleaved.mask"); 2852 NewStoreInstr = Builder.CreateMaskedStore( 2853 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2854 } 2855 else 2856 NewStoreInstr = 2857 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2858 2859 Group->addMetadata(NewStoreInstr); 2860 } 2861 } 2862 2863 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2864 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2865 VPValue *StoredValue, VPValue *BlockInMask) { 2866 // Attempt to issue a wide load. 2867 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2868 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2869 2870 assert((LI || SI) && "Invalid Load/Store instruction"); 2871 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2872 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2873 2874 LoopVectorizationCostModel::InstWidening Decision = 2875 Cost->getWideningDecision(Instr, VF); 2876 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2877 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2878 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2879 "CM decision is not to widen the memory instruction"); 2880 2881 Type *ScalarDataTy = getMemInstValueType(Instr); 2882 2883 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2884 const Align Alignment = getLoadStoreAlignment(Instr); 2885 2886 // Determine if the pointer operand of the access is either consecutive or 2887 // reverse consecutive. 2888 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2889 bool ConsecutiveStride = 2890 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2891 bool CreateGatherScatter = 2892 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2893 2894 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2895 // gather/scatter. Otherwise Decision should have been to Scalarize. 2896 assert((ConsecutiveStride || CreateGatherScatter) && 2897 "The instruction should be scalarized"); 2898 (void)ConsecutiveStride; 2899 2900 VectorParts BlockInMaskParts(UF); 2901 bool isMaskRequired = BlockInMask; 2902 if (isMaskRequired) 2903 for (unsigned Part = 0; Part < UF; ++Part) 2904 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2905 2906 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2907 // Calculate the pointer for the specific unroll-part. 2908 GetElementPtrInst *PartPtr = nullptr; 2909 2910 bool InBounds = false; 2911 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2912 InBounds = gep->isInBounds(); 2913 if (Reverse) { 2914 // If the address is consecutive but reversed, then the 2915 // wide store needs to start at the last vector element. 
2916 // RunTimeVF = VScale * VF.getKnownMinValue() 2917 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2918 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2919 // NumElt = -Part * RunTimeVF 2920 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2921 // LastLane = 1 - RunTimeVF 2922 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2923 PartPtr = 2924 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2925 PartPtr->setIsInBounds(InBounds); 2926 PartPtr = cast<GetElementPtrInst>( 2927 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2928 PartPtr->setIsInBounds(InBounds); 2929 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2930 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2931 } else { 2932 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2933 PartPtr = cast<GetElementPtrInst>( 2934 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2935 PartPtr->setIsInBounds(InBounds); 2936 } 2937 2938 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2939 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2940 }; 2941 2942 // Handle Stores: 2943 if (SI) { 2944 setDebugLocFromInst(Builder, SI); 2945 2946 for (unsigned Part = 0; Part < UF; ++Part) { 2947 Instruction *NewSI = nullptr; 2948 Value *StoredVal = State.get(StoredValue, Part); 2949 if (CreateGatherScatter) { 2950 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2951 Value *VectorGep = State.get(Addr, Part); 2952 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2953 MaskPart); 2954 } else { 2955 if (Reverse) { 2956 // If we store to reverse consecutive memory locations, then we need 2957 // to reverse the order of elements in the stored value. 2958 StoredVal = reverseVector(StoredVal); 2959 // We don't want to update the value in the map as it might be used in 2960 // another expression. So don't call resetVectorValue(StoredVal). 2961 } 2962 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2963 if (isMaskRequired) 2964 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2965 BlockInMaskParts[Part]); 2966 else 2967 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2968 } 2969 addMetadata(NewSI, SI); 2970 } 2971 return; 2972 } 2973 2974 // Handle loads. 2975 assert(LI && "Must have a load instruction"); 2976 setDebugLocFromInst(Builder, LI); 2977 for (unsigned Part = 0; Part < UF; ++Part) { 2978 Value *NewLI; 2979 if (CreateGatherScatter) { 2980 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2981 Value *VectorGep = State.get(Addr, Part); 2982 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2983 nullptr, "wide.masked.gather"); 2984 addMetadata(NewLI, LI); 2985 } else { 2986 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2987 if (isMaskRequired) 2988 NewLI = Builder.CreateMaskedLoad( 2989 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2990 "wide.masked.load"); 2991 else 2992 NewLI = 2993 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2994 2995 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
2996 addMetadata(NewLI, LI); 2997 if (Reverse) 2998 NewLI = reverseVector(NewLI); 2999 } 3000 3001 State.set(Def, NewLI, Part); 3002 } 3003 } 3004 3005 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3006 VPUser &User, 3007 const VPIteration &Instance, 3008 bool IfPredicateInstr, 3009 VPTransformState &State) { 3010 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3011 3012 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3013 // the first lane and part. 3014 if (isa<NoAliasScopeDeclInst>(Instr)) 3015 if (!Instance.isFirstIteration()) 3016 return; 3017 3018 setDebugLocFromInst(Builder, Instr); 3019 3020 // Does this instruction return a value ? 3021 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3022 3023 Instruction *Cloned = Instr->clone(); 3024 if (!IsVoidRetTy) 3025 Cloned->setName(Instr->getName() + ".cloned"); 3026 3027 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3028 Builder.GetInsertPoint()); 3029 // Replace the operands of the cloned instructions with their scalar 3030 // equivalents in the new loop. 3031 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3032 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3033 auto InputInstance = Instance; 3034 if (!Operand || !OrigLoop->contains(Operand) || 3035 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3036 InputInstance.Lane = VPLane::getFirstLane(); 3037 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3038 Cloned->setOperand(op, NewOp); 3039 } 3040 addNewMetadata(Cloned, Instr); 3041 3042 // Place the cloned scalar in the new loop. 3043 Builder.Insert(Cloned); 3044 3045 State.set(Def, Cloned, Instance); 3046 3047 // If we just cloned a new assumption, add it the assumption cache. 3048 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3049 AC->registerAssumption(II); 3050 3051 // End if-block. 3052 if (IfPredicateInstr) 3053 PredicatedInstructions.push_back(Cloned); 3054 } 3055 3056 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3057 Value *End, Value *Step, 3058 Instruction *DL) { 3059 BasicBlock *Header = L->getHeader(); 3060 BasicBlock *Latch = L->getLoopLatch(); 3061 // As we're just creating this loop, it's possible no latch exists 3062 // yet. If so, use the header as this will be a single block loop. 3063 if (!Latch) 3064 Latch = Header; 3065 3066 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3067 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3068 setDebugLocFromInst(Builder, OldInst); 3069 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3070 3071 Builder.SetInsertPoint(Latch->getTerminator()); 3072 setDebugLocFromInst(Builder, OldInst); 3073 3074 // Create i+1 and fill the PHINode. 3075 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3076 Induction->addIncoming(Start, L->getLoopPreheader()); 3077 Induction->addIncoming(Next, Latch); 3078 // Create the compare. 3079 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3080 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3081 3082 // Now we have two terminators. Remove the old one from the block. 
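  // (Builder.CreateCondBr above was emitted at the insertion point just
  // before the latch's original terminator, so the block briefly holds both
  // branches until the stale one is erased below.)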
3083 Latch->getTerminator()->eraseFromParent(); 3084 3085 return Induction; 3086 } 3087 3088 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3089 if (TripCount) 3090 return TripCount; 3091 3092 assert(L && "Create Trip Count for null loop."); 3093 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3094 // Find the loop boundaries. 3095 ScalarEvolution *SE = PSE.getSE(); 3096 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3097 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3098 "Invalid loop count"); 3099 3100 Type *IdxTy = Legal->getWidestInductionType(); 3101 assert(IdxTy && "No type for induction"); 3102 3103 // The exit count might have the type of i64 while the phi is i32. This can 3104 // happen if we have an induction variable that is sign extended before the 3105 // compare. The only way that we get a backedge taken count is that the 3106 // induction variable was signed and as such will not overflow. In such a case 3107 // truncation is legal. 3108 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3109 IdxTy->getPrimitiveSizeInBits()) 3110 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3111 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3112 3113 // Get the total trip count from the count by adding 1. 3114 const SCEV *ExitCount = SE->getAddExpr( 3115 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3116 3117 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3118 3119 // Expand the trip count and place the new instructions in the preheader. 3120 // Notice that the pre-header does not change, only the loop body. 3121 SCEVExpander Exp(*SE, DL, "induction"); 3122 3123 // Count holds the overall loop count (N). 3124 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3125 L->getLoopPreheader()->getTerminator()); 3126 3127 if (TripCount->getType()->isPointerTy()) 3128 TripCount = 3129 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3130 L->getLoopPreheader()->getTerminator()); 3131 3132 return TripCount; 3133 } 3134 3135 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3136 if (VectorTripCount) 3137 return VectorTripCount; 3138 3139 Value *TC = getOrCreateTripCount(L); 3140 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3141 3142 Type *Ty = TC->getType(); 3143 // This is where we can make the step a runtime constant. 3144 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3145 3146 // If the tail is to be folded by masking, round the number of iterations N 3147 // up to a multiple of Step instead of rounding down. This is done by first 3148 // adding Step-1 and then rounding down. Note that it's ok if this addition 3149 // overflows: the vector induction variable will eventually wrap to zero given 3150 // that it starts at zero and its Step is a power of two; the loop will then 3151 // exit, with the last early-exit vector comparison also producing all-true. 3152 if (Cost->foldTailByMasking()) { 3153 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3154 "VF*UF must be a power of 2 when folding tail by masking"); 3155 assert(!VF.isScalable() && 3156 "Tail folding not yet supported for scalable vectors"); 3157 TC = Builder.CreateAdd( 3158 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3159 } 3160 3161 // Now we need to generate the expression for the part of the loop that the 3162 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3163 // iterations are not required for correctness, or N - Step, otherwise. Step 3164 // is equal to the vectorization factor (number of SIMD elements) times the 3165 // unroll factor (number of SIMD instructions). 3166 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3167 3168 // There are two cases where we need to ensure (at least) the last iteration 3169 // runs in the scalar remainder loop. Thus, if the step evenly divides 3170 // the trip count, we set the remainder to be equal to the step. If the step 3171 // does not evenly divide the trip count, no adjustment is necessary since 3172 // there will already be scalar iterations. Note that the minimum iterations 3173 // check ensures that N >= Step. The cases are: 3174 // 1) If there is a non-reversed interleaved group that may speculatively 3175 // access memory out-of-bounds. 3176 // 2) If any instruction may follow a conditionally taken exit. That is, if 3177 // the loop contains multiple exiting blocks, or a single exiting block 3178 // which is not the latch. 3179 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3180 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3181 R = Builder.CreateSelect(IsZero, Step, R); 3182 } 3183 3184 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3185 3186 return VectorTripCount; 3187 } 3188 3189 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3190 const DataLayout &DL) { 3191 // Verify that V is a vector type with same number of elements as DstVTy. 3192 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3193 unsigned VF = DstFVTy->getNumElements(); 3194 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3195 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3196 Type *SrcElemTy = SrcVecTy->getElementType(); 3197 Type *DstElemTy = DstFVTy->getElementType(); 3198 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3199 "Vector elements must have same size"); 3200 3201 // Do a direct cast if element types are castable. 3202 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3203 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3204 } 3205 // V cannot be directly casted to desired vector type. 3206 // May happen when V is a floating point vector but DstVTy is a vector of 3207 // pointers or vice-versa. Handle this using a two-step bitcast using an 3208 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3209 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3210 "Only one type should be a pointer type"); 3211 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3212 "Only one type should be a floating point type"); 3213 Type *IntTy = 3214 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3215 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3216 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3217 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3218 } 3219 3220 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3221 BasicBlock *Bypass) { 3222 Value *Count = getOrCreateTripCount(L); 3223 // Reuse existing vector loop preheader for TC checks. 3224 // Note that new preheader block is generated for vector loop. 
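  // For a fixed-width VF the guard built below is a single compare against
  // the constant VF * UF, e.g. (illustrative only, VF = 4 and UF = 2):
  //   %min.iters.check = icmp ult i64 %trip.count, 8
  // using ule instead of ult when a scalar epilogue is required; for scalable
  // VF the right-hand side is a runtime multiple of vscale.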
3225 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3226 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3227 3228 // Generate code to check if the loop's trip count is less than VF * UF, or 3229 // equal to it in case a scalar epilogue is required; this implies that the 3230 // vector trip count is zero. This check also covers the case where adding one 3231 // to the backedge-taken count overflowed leading to an incorrect trip count 3232 // of zero. In this case we will also jump to the scalar loop. 3233 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3234 : ICmpInst::ICMP_ULT; 3235 3236 // If tail is to be folded, vector loop takes care of all iterations. 3237 Value *CheckMinIters = Builder.getFalse(); 3238 if (!Cost->foldTailByMasking()) { 3239 Value *Step = 3240 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3241 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3242 } 3243 // Create new preheader for vector loop. 3244 LoopVectorPreHeader = 3245 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3246 "vector.ph"); 3247 3248 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3249 DT->getNode(Bypass)->getIDom()) && 3250 "TC check is expected to dominate Bypass"); 3251 3252 // Update dominator for Bypass & LoopExit. 3253 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3254 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3255 3256 ReplaceInstWithInst( 3257 TCCheckBlock->getTerminator(), 3258 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3259 LoopBypassBlocks.push_back(TCCheckBlock); 3260 } 3261 3262 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3263 3264 BasicBlock *const SCEVCheckBlock = 3265 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3266 if (!SCEVCheckBlock) 3267 return nullptr; 3268 3269 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3270 (OptForSizeBasedOnProfile && 3271 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3272 "Cannot SCEV check stride or overflow when optimizing for size"); 3273 3274 3275 // Update dominator only if this is first RT check. 3276 if (LoopBypassBlocks.empty()) { 3277 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3278 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3279 } 3280 3281 LoopBypassBlocks.push_back(SCEVCheckBlock); 3282 AddedSafetyChecks = true; 3283 return SCEVCheckBlock; 3284 } 3285 3286 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3287 BasicBlock *Bypass) { 3288 // VPlan-native path does not do any analysis for runtime checks currently. 3289 if (EnableVPlanNativePath) 3290 return nullptr; 3291 3292 BasicBlock *const MemCheckBlock = 3293 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3294 3295 // Check if we generated code that checks in runtime if arrays overlap. We put 3296 // the checks into a separate block to make the more common case of few 3297 // elements faster. 
3298 if (!MemCheckBlock) 3299 return nullptr; 3300 3301 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3302 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3303 "Cannot emit memory checks when optimizing for size, unless forced " 3304 "to vectorize."); 3305 ORE->emit([&]() { 3306 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3307 L->getStartLoc(), L->getHeader()) 3308 << "Code-size may be reduced by not forcing " 3309 "vectorization, or by source-code modifications " 3310 "eliminating the need for runtime checks " 3311 "(e.g., adding 'restrict')."; 3312 }); 3313 } 3314 3315 LoopBypassBlocks.push_back(MemCheckBlock); 3316 3317 AddedSafetyChecks = true; 3318 3319 // We currently don't use LoopVersioning for the actual loop cloning but we 3320 // still use it to add the noalias metadata. 3321 LVer = std::make_unique<LoopVersioning>( 3322 *Legal->getLAI(), 3323 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3324 DT, PSE.getSE()); 3325 LVer->prepareNoAliasMetadata(); 3326 return MemCheckBlock; 3327 } 3328 3329 Value *InnerLoopVectorizer::emitTransformedIndex( 3330 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3331 const InductionDescriptor &ID) const { 3332 3333 SCEVExpander Exp(*SE, DL, "induction"); 3334 auto Step = ID.getStep(); 3335 auto StartValue = ID.getStartValue(); 3336 assert(Index->getType()->getScalarType() == Step->getType() && 3337 "Index scalar type does not match StepValue type"); 3338 3339 // Note: the IR at this point is broken. We cannot use SE to create any new 3340 // SCEV and then expand it, hoping that SCEV's simplification will give us 3341 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3342 // lead to various SCEV crashes. So all we can do is to use builder and rely 3343 // on InstCombine for future simplifications. Here we handle some trivial 3344 // cases only. 3345 auto CreateAdd = [&B](Value *X, Value *Y) { 3346 assert(X->getType() == Y->getType() && "Types don't match!"); 3347 if (auto *CX = dyn_cast<ConstantInt>(X)) 3348 if (CX->isZero()) 3349 return Y; 3350 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3351 if (CY->isZero()) 3352 return X; 3353 return B.CreateAdd(X, Y); 3354 }; 3355 3356 // We allow X to be a vector type, in which case Y will potentially be 3357 // splatted into a vector with the same element count. 3358 auto CreateMul = [&B](Value *X, Value *Y) { 3359 assert(X->getType()->getScalarType() == Y->getType() && 3360 "Types don't match!"); 3361 if (auto *CX = dyn_cast<ConstantInt>(X)) 3362 if (CX->isOne()) 3363 return Y; 3364 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3365 if (CY->isOne()) 3366 return X; 3367 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 3368 if (XVTy && !isa<VectorType>(Y->getType())) 3369 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 3370 return B.CreateMul(X, Y); 3371 }; 3372 3373 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3374 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3375 // the DomTree is not kept up-to-date for additional blocks generated in the 3376 // vector loop. By using the header as insertion point, we guarantee that the 3377 // expanded instructions dominate all their uses. 
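  // In terms of the switch below, the transformed index is, conceptually:
  //   integer induction:  StartValue + Index * Step
  //   pointer induction:  gep StartValue, Index * Step
  //   FP induction:       StartValue fadd/fsub (Index * Step)
  // The CreateAdd/CreateMul helpers fold the trivial zero/one cases, and an
  // integer step of minus one becomes a plain subtraction.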
3378 auto GetInsertPoint = [this, &B]() { 3379 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3380 if (InsertBB != LoopVectorBody && 3381 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3382 return LoopVectorBody->getTerminator(); 3383 return &*B.GetInsertPoint(); 3384 }; 3385 3386 switch (ID.getKind()) { 3387 case InductionDescriptor::IK_IntInduction: { 3388 assert(!isa<VectorType>(Index->getType()) && 3389 "Vector indices not supported for integer inductions yet"); 3390 assert(Index->getType() == StartValue->getType() && 3391 "Index type does not match StartValue type"); 3392 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3393 return B.CreateSub(StartValue, Index); 3394 auto *Offset = CreateMul( 3395 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3396 return CreateAdd(StartValue, Offset); 3397 } 3398 case InductionDescriptor::IK_PtrInduction: { 3399 assert(isa<SCEVConstant>(Step) && 3400 "Expected constant step for pointer induction"); 3401 return B.CreateGEP( 3402 StartValue->getType()->getPointerElementType(), StartValue, 3403 CreateMul(Index, 3404 Exp.expandCodeFor(Step, Index->getType()->getScalarType(), 3405 GetInsertPoint()))); 3406 } 3407 case InductionDescriptor::IK_FpInduction: { 3408 assert(!isa<VectorType>(Index->getType()) && 3409 "Vector indices not supported for FP inductions yet"); 3410 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3411 auto InductionBinOp = ID.getInductionBinOp(); 3412 assert(InductionBinOp && 3413 (InductionBinOp->getOpcode() == Instruction::FAdd || 3414 InductionBinOp->getOpcode() == Instruction::FSub) && 3415 "Original bin op should be defined for FP induction"); 3416 3417 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3418 Value *MulExp = B.CreateFMul(StepValue, Index); 3419 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3420 "induction"); 3421 } 3422 case InductionDescriptor::IK_NoInduction: 3423 return nullptr; 3424 } 3425 llvm_unreachable("invalid enum"); 3426 } 3427 3428 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3429 LoopScalarBody = OrigLoop->getHeader(); 3430 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3431 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3432 assert(LoopExitBlock && "Must have an exit block"); 3433 assert(LoopVectorPreHeader && "Invalid loop structure"); 3434 3435 LoopMiddleBlock = 3436 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3437 LI, nullptr, Twine(Prefix) + "middle.block"); 3438 LoopScalarPreHeader = 3439 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3440 nullptr, Twine(Prefix) + "scalar.ph"); 3441 3442 // Set up branch from middle block to the exit and scalar preheader blocks. 3443 // completeLoopSkeleton will update the condition to use an iteration check, 3444 // if required to decide whether to execute the remainder. 3445 BranchInst *BrInst = 3446 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3447 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3448 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3449 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3450 3451 // We intentionally don't let SplitBlock to update LoopInfo since 3452 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3453 // LoopVectorBody is explicitly added to the correct place few lines later. 
3454 LoopVectorBody = 3455 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3456 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3457 3458 // Update dominator for loop exit. 3459 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3460 3461 // Create and register the new vector loop. 3462 Loop *Lp = LI->AllocateLoop(); 3463 Loop *ParentLoop = OrigLoop->getParentLoop(); 3464 3465 // Insert the new loop into the loop nest and register the new basic blocks 3466 // before calling any utilities such as SCEV that require valid LoopInfo. 3467 if (ParentLoop) { 3468 ParentLoop->addChildLoop(Lp); 3469 } else { 3470 LI->addTopLevelLoop(Lp); 3471 } 3472 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3473 return Lp; 3474 } 3475 3476 void InnerLoopVectorizer::createInductionResumeValues( 3477 Loop *L, Value *VectorTripCount, 3478 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3479 assert(VectorTripCount && L && "Expected valid arguments"); 3480 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3481 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3482 "Inconsistent information about additional bypass."); 3483 // We are going to resume the execution of the scalar loop. 3484 // Go over all of the induction variables that we found and fix the 3485 // PHIs that are left in the scalar version of the loop. 3486 // The starting values of PHI nodes depend on the counter of the last 3487 // iteration in the vectorized loop. 3488 // If we come from a bypass edge then we need to start from the original 3489 // start value. 3490 for (auto &InductionEntry : Legal->getInductionVars()) { 3491 PHINode *OrigPhi = InductionEntry.first; 3492 InductionDescriptor II = InductionEntry.second; 3493 3494 // Create phi nodes to merge from the backedge-taken check block. 3495 PHINode *BCResumeVal = 3496 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3497 LoopScalarPreHeader->getTerminator()); 3498 // Copy original phi DL over to the new one. 3499 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3500 Value *&EndValue = IVEndValues[OrigPhi]; 3501 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3502 if (OrigPhi == OldInduction) { 3503 // We know what the end value is. 3504 EndValue = VectorTripCount; 3505 } else { 3506 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3507 3508 // Fast-math-flags propagate from the original induction instruction. 3509 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3510 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3511 3512 Type *StepType = II.getStep()->getType(); 3513 Instruction::CastOps CastOp = 3514 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3515 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3516 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3517 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3518 EndValue->setName("ind.end"); 3519 3520 // Compute the end value for the additional bypass (if applicable). 
3521 if (AdditionalBypass.first) { 3522 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3523 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3524 StepType, true); 3525 CRD = 3526 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3527 EndValueFromAdditionalBypass = 3528 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3529 EndValueFromAdditionalBypass->setName("ind.end"); 3530 } 3531 } 3532 // The new PHI merges the original incoming value, in case of a bypass, 3533 // or the value at the end of the vectorized loop. 3534 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3535 3536 // Fix the scalar body counter (PHI node). 3537 // The old induction's phi node in the scalar body needs the truncated 3538 // value. 3539 for (BasicBlock *BB : LoopBypassBlocks) 3540 BCResumeVal->addIncoming(II.getStartValue(), BB); 3541 3542 if (AdditionalBypass.first) 3543 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3544 EndValueFromAdditionalBypass); 3545 3546 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3547 } 3548 } 3549 3550 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3551 MDNode *OrigLoopID) { 3552 assert(L && "Expected valid loop."); 3553 3554 // The trip counts should be cached by now. 3555 Value *Count = getOrCreateTripCount(L); 3556 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3557 3558 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3559 3560 // Add a check in the middle block to see if we have completed 3561 // all of the iterations in the first vector loop. 3562 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3563 // If tail is to be folded, we know we don't need to run the remainder. 3564 if (!Cost->foldTailByMasking()) { 3565 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3566 Count, VectorTripCount, "cmp.n", 3567 LoopMiddleBlock->getTerminator()); 3568 3569 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3570 // of the corresponding compare because they may have ended up with 3571 // different line numbers and we want to avoid awkward line stepping while 3572 // debugging. Eg. if the compare has got a line number inside the loop. 3573 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3574 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3575 } 3576 3577 // Get ready to start creating new instructions into the vectorized body. 3578 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3579 "Inconsistent vector loop preheader"); 3580 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3581 3582 Optional<MDNode *> VectorizedLoopID = 3583 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3584 LLVMLoopVectorizeFollowupVectorized}); 3585 if (VectorizedLoopID.hasValue()) { 3586 L->setLoopID(VectorizedLoopID.getValue()); 3587 3588 // Do not setAlreadyVectorized if loop attributes have been defined 3589 // explicitly. 3590 return LoopVectorPreHeader; 3591 } 3592 3593 // Keep all loop hints from the original loop on the vector loop (we'll 3594 // replace the vectorizer-specific hints below). 
3595 if (MDNode *LID = OrigLoop->getLoopID()) 3596 L->setLoopID(LID); 3597 3598 LoopVectorizeHints Hints(L, true, *ORE); 3599 Hints.setAlreadyVectorized(); 3600 3601 #ifdef EXPENSIVE_CHECKS 3602 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3603 LI->verify(*DT); 3604 #endif 3605 3606 return LoopVectorPreHeader; 3607 } 3608 3609 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3610 /* 3611 In this function we generate a new loop. The new loop will contain 3612 the vectorized instructions while the old loop will continue to run the 3613 scalar remainder. 3614 3615 [ ] <-- loop iteration number check. 3616 / | 3617 / v 3618 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3619 | / | 3620 | / v 3621 || [ ] <-- vector pre header. 3622 |/ | 3623 | v 3624 | [ ] \ 3625 | [ ]_| <-- vector loop. 3626 | | 3627 | v 3628 | -[ ] <--- middle-block. 3629 | / | 3630 | / v 3631 -|- >[ ] <--- new preheader. 3632 | | 3633 | v 3634 | [ ] \ 3635 | [ ]_| <-- old scalar loop to handle remainder. 3636 \ | 3637 \ v 3638 >[ ] <-- exit block. 3639 ... 3640 */ 3641 3642 // Get the metadata of the original loop before it gets modified. 3643 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3644 3645 // Workaround! Compute the trip count of the original loop and cache it 3646 // before we start modifying the CFG. This code has a systemic problem 3647 // wherein it tries to run analysis over partially constructed IR; this is 3648 // wrong, and not simply for SCEV. The trip count of the original loop 3649 // simply happens to be prone to hitting this in practice. In theory, we 3650 // can hit the same issue for any SCEV, or ValueTracking query done during 3651 // mutation. See PR49900. 3652 getOrCreateTripCount(OrigLoop); 3653 3654 // Create an empty vector loop, and prepare basic blocks for the runtime 3655 // checks. 3656 Loop *Lp = createVectorLoopSkeleton(""); 3657 3658 // Now, compare the new count to zero. If it is zero skip the vector loop and 3659 // jump to the scalar loop. This check also covers the case where the 3660 // backedge-taken count is uint##_max: adding one to it will overflow leading 3661 // to an incorrect trip count of zero. In this (rare) case we will also jump 3662 // to the scalar loop. 3663 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3664 3665 // Generate the code to check any assumptions that we've made for SCEV 3666 // expressions. 3667 emitSCEVChecks(Lp, LoopScalarPreHeader); 3668 3669 // Generate the code that checks in runtime if arrays overlap. We put the 3670 // checks into a separate block to make the more common case of few elements 3671 // faster. 3672 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3673 3674 // Some loops have a single integer induction variable, while other loops 3675 // don't. One example is c++ iterators that often have multiple pointer 3676 // induction variables. In the code below we also support a case where we 3677 // don't have a single induction variable. 3678 // 3679 // We try to obtain an induction variable from the original loop as hard 3680 // as possible. However if we don't find one that: 3681 // - is an integer 3682 // - counts from zero, stepping by one 3683 // - is the size of the widest induction variable type 3684 // then we create a new one. 
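  // The canonical induction variable built below looks roughly like this for
  // a fixed VF of 4 and UF of 2 (value names are illustrative):
  //
  //   vector.body:
  //     %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //     ...
  //     %index.next = add i64 %index, 8
  //     %cmp        = icmp eq i64 %index.next, %n.vec
  //     br i1 %cmp, label %middle.block, label %vector.body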
3685 OldInduction = Legal->getPrimaryInduction(); 3686 Type *IdxTy = Legal->getWidestInductionType(); 3687 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3688 // The loop step is equal to the vectorization factor (num of SIMD elements) 3689 // times the unroll factor (num of SIMD instructions). 3690 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3691 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3692 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3693 Induction = 3694 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3695 getDebugLocFromInstOrOperands(OldInduction)); 3696 3697 // Emit phis for the new starting index of the scalar loop. 3698 createInductionResumeValues(Lp, CountRoundDown); 3699 3700 return completeLoopSkeleton(Lp, OrigLoopID); 3701 } 3702 3703 // Fix up external users of the induction variable. At this point, we are 3704 // in LCSSA form, with all external PHIs that use the IV having one input value, 3705 // coming from the remainder loop. We need those PHIs to also have a correct 3706 // value for the IV when arriving directly from the middle block. 3707 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3708 const InductionDescriptor &II, 3709 Value *CountRoundDown, Value *EndValue, 3710 BasicBlock *MiddleBlock) { 3711 // There are two kinds of external IV usages - those that use the value 3712 // computed in the last iteration (the PHI) and those that use the penultimate 3713 // value (the value that feeds into the phi from the loop latch). 3714 // We allow both, but they, obviously, have different values. 3715 3716 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3717 3718 DenseMap<Value *, Value *> MissingVals; 3719 3720 // An external user of the last iteration's value should see the value that 3721 // the remainder loop uses to initialize its own IV. 3722 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3723 for (User *U : PostInc->users()) { 3724 Instruction *UI = cast<Instruction>(U); 3725 if (!OrigLoop->contains(UI)) { 3726 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3727 MissingVals[UI] = EndValue; 3728 } 3729 } 3730 3731 // An external user of the penultimate value need to see EndValue - Step. 3732 // The simplest way to get this is to recompute it from the constituent SCEVs, 3733 // that is Start + (Step * (CRD - 1)). 3734 for (User *U : OrigPhi->users()) { 3735 auto *UI = cast<Instruction>(U); 3736 if (!OrigLoop->contains(UI)) { 3737 const DataLayout &DL = 3738 OrigLoop->getHeader()->getModule()->getDataLayout(); 3739 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3740 3741 IRBuilder<> B(MiddleBlock->getTerminator()); 3742 3743 // Fast-math-flags propagate from the original induction instruction. 3744 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3745 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3746 3747 Value *CountMinusOne = B.CreateSub( 3748 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3749 Value *CMO = 3750 !II.getStep()->getType()->isIntegerTy() 3751 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3752 II.getStep()->getType()) 3753 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3754 CMO->setName("cast.cmo"); 3755 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3756 Escape->setName("ind.escape"); 3757 MissingVals[UI] = Escape; 3758 } 3759 } 3760 3761 for (auto &I : MissingVals) { 3762 PHINode *PHI = cast<PHINode>(I.first); 3763 // One corner case we have to handle is two IVs "chasing" each-other, 3764 // that is %IV2 = phi [...], [ %IV1, %latch ] 3765 // In this case, if IV1 has an external use, we need to avoid adding both 3766 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3767 // don't already have an incoming value for the middle block. 3768 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3769 PHI->addIncoming(I.second, MiddleBlock); 3770 } 3771 } 3772 3773 namespace { 3774 3775 struct CSEDenseMapInfo { 3776 static bool canHandle(const Instruction *I) { 3777 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3778 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3779 } 3780 3781 static inline Instruction *getEmptyKey() { 3782 return DenseMapInfo<Instruction *>::getEmptyKey(); 3783 } 3784 3785 static inline Instruction *getTombstoneKey() { 3786 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3787 } 3788 3789 static unsigned getHashValue(const Instruction *I) { 3790 assert(canHandle(I) && "Unknown instruction!"); 3791 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3792 I->value_op_end())); 3793 } 3794 3795 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3796 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3797 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3798 return LHS == RHS; 3799 return LHS->isIdenticalTo(RHS); 3800 } 3801 }; 3802 3803 } // end anonymous namespace 3804 3805 ///Perform cse of induction variable instructions. 3806 static void cse(BasicBlock *BB) { 3807 // Perform simple cse. 3808 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3809 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3810 Instruction *In = &*I++; 3811 3812 if (!CSEDenseMapInfo::canHandle(In)) 3813 continue; 3814 3815 // Check if we can replace this instruction with any of the 3816 // visited instructions. 3817 if (Instruction *V = CSEMap.lookup(In)) { 3818 In->replaceAllUsesWith(V); 3819 In->eraseFromParent(); 3820 continue; 3821 } 3822 3823 CSEMap[In] = In; 3824 } 3825 } 3826 3827 InstructionCost 3828 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3829 bool &NeedToScalarize) const { 3830 Function *F = CI->getCalledFunction(); 3831 Type *ScalarRetTy = CI->getType(); 3832 SmallVector<Type *, 4> Tys, ScalarTys; 3833 for (auto &ArgOp : CI->arg_operands()) 3834 ScalarTys.push_back(ArgOp->getType()); 3835 3836 // Estimate cost of scalarized vector call. The source operands are assumed 3837 // to be vectors, so we need to extract individual elements from there, 3838 // execute VF scalar calls, and then gather the result into the vector return 3839 // value. 3840 InstructionCost ScalarCallCost = 3841 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3842 if (VF.isScalar()) 3843 return ScalarCallCost; 3844 3845 // Compute corresponding vector type for return value and arguments. 
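  // The scalarized estimate computed below is, roughly,
  //   ScalarCallCost * VF + scalarization (extract/insert) overhead,
  // and it is only superseded when the VFDatabase knows a vector variant of
  // the callee whose call cost is cheaper.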
3846 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3847 for (Type *ScalarTy : ScalarTys) 3848 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3849 3850 // Compute costs of unpacking argument values for the scalar calls and 3851 // packing the return values to a vector. 3852 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3853 3854 InstructionCost Cost = 3855 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3856 3857 // If we can't emit a vector call for this function, then the currently found 3858 // cost is the cost we need to return. 3859 NeedToScalarize = true; 3860 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3861 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3862 3863 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3864 return Cost; 3865 3866 // If the corresponding vector cost is cheaper, return its cost. 3867 InstructionCost VectorCallCost = 3868 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3869 if (VectorCallCost < Cost) { 3870 NeedToScalarize = false; 3871 Cost = VectorCallCost; 3872 } 3873 return Cost; 3874 } 3875 3876 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3877 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3878 return Elt; 3879 return VectorType::get(Elt, VF); 3880 } 3881 3882 InstructionCost 3883 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3884 ElementCount VF) const { 3885 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3886 assert(ID && "Expected intrinsic call!"); 3887 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3888 FastMathFlags FMF; 3889 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3890 FMF = FPMO->getFastMathFlags(); 3891 3892 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3893 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3894 SmallVector<Type *> ParamTys; 3895 std::transform(FTy->param_begin(), FTy->param_end(), 3896 std::back_inserter(ParamTys), 3897 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3898 3899 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3900 dyn_cast<IntrinsicInst>(CI)); 3901 return TTI.getIntrinsicInstrCost(CostAttrs, 3902 TargetTransformInfo::TCK_RecipThroughput); 3903 } 3904 3905 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3906 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3907 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3908 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3909 } 3910 3911 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3912 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3913 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3914 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3915 } 3916 3917 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3918 // For every instruction `I` in MinBWs, truncate the operands, create a 3919 // truncated version of `I` and reextend its result. InstCombine runs 3920 // later and will remove any ext/trunc pairs. 3921 SmallPtrSet<Value *, 4> Erased; 3922 for (const auto &KV : Cost->getMinimalBitwidths()) { 3923 // If the value wasn't vectorized, we must maintain the original scalar 3924 // type. The absence of the value from State indicates that it 3925 // wasn't vectorized. 
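    // As an example of the rewrite performed below, if MinBWs records that an
    // i32 value only needs 8 bits, a <4 x i32> add is replaced by (sketch):
    //   %a.tr = trunc <4 x i32> %a to <4 x i8>
    //   %b.tr = trunc <4 x i32> %b to <4 x i8>
    //   %r.tr = add <4 x i8> %a.tr, %b.tr
    //   %r    = zext <4 x i8> %r.tr to <4 x i32>
    // leaving InstCombine to clean up any redundant ext/trunc pairs.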
3926 VPValue *Def = State.Plan->getVPValue(KV.first); 3927 if (!State.hasAnyVectorValue(Def)) 3928 continue; 3929 for (unsigned Part = 0; Part < UF; ++Part) { 3930 Value *I = State.get(Def, Part); 3931 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3932 continue; 3933 Type *OriginalTy = I->getType(); 3934 Type *ScalarTruncatedTy = 3935 IntegerType::get(OriginalTy->getContext(), KV.second); 3936 auto *TruncatedTy = FixedVectorType::get( 3937 ScalarTruncatedTy, 3938 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3939 if (TruncatedTy == OriginalTy) 3940 continue; 3941 3942 IRBuilder<> B(cast<Instruction>(I)); 3943 auto ShrinkOperand = [&](Value *V) -> Value * { 3944 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3945 if (ZI->getSrcTy() == TruncatedTy) 3946 return ZI->getOperand(0); 3947 return B.CreateZExtOrTrunc(V, TruncatedTy); 3948 }; 3949 3950 // The actual instruction modification depends on the instruction type, 3951 // unfortunately. 3952 Value *NewI = nullptr; 3953 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3954 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3955 ShrinkOperand(BO->getOperand(1))); 3956 3957 // Any wrapping introduced by shrinking this operation shouldn't be 3958 // considered undefined behavior. So, we can't unconditionally copy 3959 // arithmetic wrapping flags to NewI. 3960 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3961 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3962 NewI = 3963 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3964 ShrinkOperand(CI->getOperand(1))); 3965 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3966 NewI = B.CreateSelect(SI->getCondition(), 3967 ShrinkOperand(SI->getTrueValue()), 3968 ShrinkOperand(SI->getFalseValue())); 3969 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3970 switch (CI->getOpcode()) { 3971 default: 3972 llvm_unreachable("Unhandled cast!"); 3973 case Instruction::Trunc: 3974 NewI = ShrinkOperand(CI->getOperand(0)); 3975 break; 3976 case Instruction::SExt: 3977 NewI = B.CreateSExtOrTrunc( 3978 CI->getOperand(0), 3979 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3980 break; 3981 case Instruction::ZExt: 3982 NewI = B.CreateZExtOrTrunc( 3983 CI->getOperand(0), 3984 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3985 break; 3986 } 3987 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3988 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3989 ->getNumElements(); 3990 auto *O0 = B.CreateZExtOrTrunc( 3991 SI->getOperand(0), 3992 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3993 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3994 ->getNumElements(); 3995 auto *O1 = B.CreateZExtOrTrunc( 3996 SI->getOperand(1), 3997 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3998 3999 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 4000 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 4001 // Don't do anything with the operands, just extend the result. 
4002 continue; 4003 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 4004 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 4005 ->getNumElements(); 4006 auto *O0 = B.CreateZExtOrTrunc( 4007 IE->getOperand(0), 4008 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4009 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4010 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4011 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4012 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 4013 ->getNumElements(); 4014 auto *O0 = B.CreateZExtOrTrunc( 4015 EE->getOperand(0), 4016 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4017 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4018 } else { 4019 // If we don't know what to do, be conservative and don't do anything. 4020 continue; 4021 } 4022 4023 // Lastly, extend the result. 4024 NewI->takeName(cast<Instruction>(I)); 4025 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4026 I->replaceAllUsesWith(Res); 4027 cast<Instruction>(I)->eraseFromParent(); 4028 Erased.insert(I); 4029 State.reset(Def, Res, Part); 4030 } 4031 } 4032 4033 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4034 for (const auto &KV : Cost->getMinimalBitwidths()) { 4035 // If the value wasn't vectorized, we must maintain the original scalar 4036 // type. The absence of the value from State indicates that it 4037 // wasn't vectorized. 4038 VPValue *Def = State.Plan->getVPValue(KV.first); 4039 if (!State.hasAnyVectorValue(Def)) 4040 continue; 4041 for (unsigned Part = 0; Part < UF; ++Part) { 4042 Value *I = State.get(Def, Part); 4043 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4044 if (Inst && Inst->use_empty()) { 4045 Value *NewI = Inst->getOperand(0); 4046 Inst->eraseFromParent(); 4047 State.reset(Def, NewI, Part); 4048 } 4049 } 4050 } 4051 } 4052 4053 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4054 // Insert truncates and extends for any truncated instructions as hints to 4055 // InstCombine. 4056 if (VF.isVector()) 4057 truncateToMinimalBitwidths(State); 4058 4059 // Fix widened non-induction PHIs by setting up the PHI operands. 4060 if (OrigPHIsToFix.size()) { 4061 assert(EnableVPlanNativePath && 4062 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4063 fixNonInductionPHIs(State); 4064 } 4065 4066 // At this point every instruction in the original loop is widened to a 4067 // vector form. Now we need to fix the recurrences in the loop. These PHI 4068 // nodes are currently empty because we did not want to introduce cycles. 4069 // This is the second stage of vectorizing recurrences. 4070 fixCrossIterationPHIs(State); 4071 4072 // Forget the original basic block. 4073 PSE.getSE()->forgetLoop(OrigLoop); 4074 4075 // Fix-up external users of the induction variables. 4076 for (auto &Entry : Legal->getInductionVars()) 4077 fixupIVUsers(Entry.first, Entry.second, 4078 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4079 IVEndValues[Entry.first], LoopMiddleBlock); 4080 4081 fixLCSSAPHIs(State); 4082 for (Instruction *PI : PredicatedInstructions) 4083 sinkScalarOperands(&*PI); 4084 4085 // Remove redundant induction instructions. 4086 cse(LoopVectorBody); 4087 4088 // Set/update profile weights for the vector and remainder loops as original 4089 // loop iterations are now distributed among them. Note that original loop 4090 // represented by LoopScalarBody becomes remainder loop after vectorization. 
4091 //
4092 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4093 // end up getting a slightly roughened result but that should be OK since
4094 // the profile is not inherently precise anyway. Note also that a possible
4095 // bypass of vector code caused by legality checks is ignored, optimistically
4096 // assigning all the weight to the vector loop.
4097 //
4098 // For scalable vectorization we can't know at compile time how many
4099 // iterations of the loop are handled in one vector iteration, so instead
4100 // assume a pessimistic vscale of '1'.
4101 setProfileInfoAfterUnrolling(
4102 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4103 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4104 }
4105
4106 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4107 // In order to support recurrences we need to be able to vectorize Phi nodes.
4108 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4109 // stage #2: We now need to fix the recurrences by adding incoming edges to
4110 // the currently empty PHI nodes. At this point every instruction in the
4111 // original loop is widened to a vector form so we can use them to construct
4112 // the incoming edges.
4113 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4114 for (VPRecipeBase &R : Header->phis()) {
4115 auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
4116 if (!PhiR)
4117 continue;
4118 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4119 if (PhiR->getRecurrenceDescriptor()) {
4120 fixReduction(PhiR, State);
4121 } else if (Legal->isFirstOrderRecurrence(OrigPhi))
4122 fixFirstOrderRecurrence(OrigPhi, State);
4123 }
4124 }
4125
4126 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4127 VPTransformState &State) {
4128 // This is the second phase of vectorizing first-order recurrences. An
4129 // overview of the transformation is described below. Suppose we have the
4130 // following loop.
4131 //
4132 // for (int i = 0; i < n; ++i)
4133 // b[i] = a[i] - a[i - 1];
4134 //
4135 // There is a first-order recurrence on "a". For this loop, the shorthand
4136 // scalar IR looks like:
4137 //
4138 // scalar.ph:
4139 // s_init = a[-1]
4140 // br scalar.body
4141 //
4142 // scalar.body:
4143 // i = phi [0, scalar.ph], [i+1, scalar.body]
4144 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4145 // s2 = a[i]
4146 // b[i] = s2 - s1
4147 // br cond, scalar.body, ...
4148 //
4149 // In this example, s1 is a recurrence because its value depends on the
4150 // previous iteration. In the first phase of vectorization, we created a
4151 // temporary value for s1. We now complete the vectorization and produce the
4152 // shorthand vector IR shown below (for VF = 4, UF = 1).
4153 //
4154 // vector.ph:
4155 // v_init = vector(..., ..., ..., a[-1])
4156 // br vector.body
4157 //
4158 // vector.body:
4159 // i = phi [0, vector.ph], [i+4, vector.body]
4160 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4161 // v2 = a[i, i+1, i+2, i+3];
4162 // v3 = vector(v1(3), v2(0, 1, 2))
4163 // b[i, i+1, i+2, i+3] = v2 - v3
4164 // br cond, vector.body, middle.block
4165 //
4166 // middle.block:
4167 // x = v2(3)
4168 // br scalar.ph
4169 //
4170 // scalar.ph:
4171 // s_init = phi [x, middle.block], [a[-1], otherwise]
4172 // br scalar.body
4173 //
4174 // After the vector loop finishes executing, we extract the next value of
4175 // the recurrence (x) to use as the initial value in the scalar loop.
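//
// With UF = 2 the same splice is applied per unrolled part, chaining the
// "previous" vector of one part into the next (illustrative shorthand,
// VF = 4, with v2.0 and v2.1 denoting the two unrolled loads of a):
//
//   v3.0 = vector(v1(3),   v2.0(0, 1, 2))
//   v3.1 = vector(v2.0(3), v2.1(0, 1, 2))
//
// and the vector phi v1 takes the last part (v2.1) on the backedge.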
4176 4177 // Get the original loop preheader and single loop latch. 4178 auto *Preheader = OrigLoop->getLoopPreheader(); 4179 auto *Latch = OrigLoop->getLoopLatch(); 4180 4181 // Get the initial and previous values of the scalar recurrence. 4182 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4183 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4184 4185 auto *IdxTy = Builder.getInt32Ty(); 4186 auto *One = ConstantInt::get(IdxTy, 1); 4187 4188 // Create a vector from the initial value. 4189 auto *VectorInit = ScalarInit; 4190 if (VF.isVector()) { 4191 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4192 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4193 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4194 VectorInit = Builder.CreateInsertElement( 4195 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), 4196 VectorInit, LastIdx, "vector.recur.init"); 4197 } 4198 4199 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4200 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4201 // We constructed a temporary phi node in the first phase of vectorization. 4202 // This phi node will eventually be deleted. 4203 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4204 4205 // Create a phi node for the new recurrence. The current value will either be 4206 // the initial value inserted into a vector or loop-varying vector value. 4207 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4208 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4209 4210 // Get the vectorized previous value of the last part UF - 1. It appears last 4211 // among all unrolled iterations, due to the order of their construction. 4212 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4213 4214 // Find and set the insertion point after the previous value if it is an 4215 // instruction. 4216 BasicBlock::iterator InsertPt; 4217 // Note that the previous value may have been constant-folded so it is not 4218 // guaranteed to be an instruction in the vector loop. 4219 // FIXME: Loop invariant values do not form recurrences. We should deal with 4220 // them earlier. 4221 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4222 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4223 else { 4224 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4225 if (isa<PHINode>(PreviousLastPart)) 4226 // If the previous value is a phi node, we should insert after all the phi 4227 // nodes in the block containing the PHI to avoid breaking basic block 4228 // verification. Note that the basic block may be different to 4229 // LoopVectorBody, in case we predicate the loop. 4230 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4231 else 4232 InsertPt = ++PreviousInst->getIterator(); 4233 } 4234 Builder.SetInsertPoint(&*InsertPt); 4235 4236 // The vector from which to take the initial value for the current iteration 4237 // (actual or unrolled). Initially, this is the vector phi node. 4238 Value *Incoming = VecPhi; 4239 4240 // Shuffle the current and previous vector and update the vector parts. 4241 for (unsigned Part = 0; Part < UF; ++Part) { 4242 Value *PreviousPart = State.get(PreviousDef, Part); 4243 Value *PhiPart = State.get(PhiDef, Part); 4244 auto *Shuffle = VF.isVector() 4245 ? 
Builder.CreateVectorSplice(Incoming, PreviousPart, -1) 4246 : Incoming; 4247 PhiPart->replaceAllUsesWith(Shuffle); 4248 cast<Instruction>(PhiPart)->eraseFromParent(); 4249 State.reset(PhiDef, Shuffle, Part); 4250 Incoming = PreviousPart; 4251 } 4252 4253 // Fix the latch value of the new recurrence in the vector loop. 4254 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4255 4256 // Extract the last vector element in the middle block. This will be the 4257 // initial value for the recurrence when jumping to the scalar loop. 4258 auto *ExtractForScalar = Incoming; 4259 if (VF.isVector()) { 4260 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4261 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4262 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4263 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4264 "vector.recur.extract"); 4265 } 4266 // Extract the second last element in the middle block if the 4267 // Phi is used outside the loop. We need to extract the phi itself 4268 // and not the last element (the phi update in the current iteration). This 4269 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4270 // when the scalar loop is not run at all. 4271 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4272 if (VF.isVector()) { 4273 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4274 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 4275 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4276 Incoming, Idx, "vector.recur.extract.for.phi"); 4277 } else if (UF > 1) 4278 // When loop is unrolled without vectorizing, initialize 4279 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value 4280 // of `Incoming`. This is analogous to the vectorized case above: extracting 4281 // the second last element when VF > 1. 4282 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 4283 4284 // Fix the initial value of the original recurrence in the scalar loop. 4285 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4286 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4287 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4288 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4289 Start->addIncoming(Incoming, BB); 4290 } 4291 4292 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4293 Phi->setName("scalar.recur"); 4294 4295 // Finally, fix users of the recurrence outside the loop. The users will need 4296 // either the last value of the scalar recurrence or the last value of the 4297 // vector recurrence we extracted in the middle block. Since the loop is in 4298 // LCSSA form, we just need to find all the phi nodes for the original scalar 4299 // recurrence in the exit block, and then add an edge for the middle block. 4300 // Note that LCSSA does not imply single entry when the original scalar loop 4301 // had multiple exiting edges (as we always run the last iteration in the 4302 // scalar epilogue); in that case, the exiting path through middle will be 4303 // dynamically dead and the value picked for the phi doesn't matter. 
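// Illustrative shorthand of the fix-up performed below: an exit-block phi for
// the scalar recurrence
//
//   exit.block:
//     s1.lcssa = phi [ s1, scalar.body ]
//
// gets an extra incoming value from the middle block, e.g.
//
//   exit.block:
//     s1.lcssa = phi [ s1, scalar.body ],
//                    [ vector.recur.extract.for.phi, middle.block ]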
4304 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4305 if (any_of(LCSSAPhi.incoming_values(), 4306 [Phi](Value *V) { return V == Phi; })) 4307 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4308 } 4309 4310 static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) { 4311 return EnableStrictReductions && RdxDesc.isOrdered(); 4312 } 4313 4314 void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR, 4315 VPTransformState &State) { 4316 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4317 // Get it's reduction variable descriptor. 4318 assert(Legal->isReductionVariable(OrigPhi) && 4319 "Unable to find the reduction variable"); 4320 RecurrenceDescriptor RdxDesc = *PhiR->getRecurrenceDescriptor(); 4321 4322 RecurKind RK = RdxDesc.getRecurrenceKind(); 4323 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4324 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4325 setDebugLocFromInst(Builder, ReductionStartValue); 4326 bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi); 4327 4328 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst); 4329 // This is the vector-clone of the value that leaves the loop. 4330 Type *VecTy = State.get(LoopExitInstDef, 0)->getType(); 4331 4332 // Wrap flags are in general invalid after vectorization, clear them. 4333 clearReductionWrapFlags(RdxDesc, State); 4334 4335 // Fix the vector-loop phi. 4336 4337 // Reductions do not have to start at zero. They can start with 4338 // any loop invariant values. 4339 BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4340 4341 bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi && 4342 useOrderedReductions(RdxDesc); 4343 4344 for (unsigned Part = 0; Part < UF; ++Part) { 4345 if (IsOrdered && Part > 0) 4346 break; 4347 Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part); 4348 Value *Val = State.get(PhiR->getBackedgeValue(), Part); 4349 if (IsOrdered) 4350 Val = State.get(PhiR->getBackedgeValue(), UF - 1); 4351 4352 cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch); 4353 } 4354 4355 // Before each round, move the insertion point right between 4356 // the PHIs and the values we are going to write. 4357 // This allows us to write both PHINodes and the extractelement 4358 // instructions. 4359 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4360 4361 setDebugLocFromInst(Builder, LoopExitInst); 4362 4363 Type *PhiTy = OrigPhi->getType(); 4364 // If tail is folded by masking, the vector value to leave the loop should be 4365 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4366 // instead of the former. For an inloop reduction the reduction will already 4367 // be predicated, and does not need to be handled here. 
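// Illustrative shorthand of the tail-folded case handled below: the vectorized
// loop-exit value of the reduction is already fed through a mask-driven
// select, e.g.
//
//   rdx.next = add <4 x i32> vec.phi, val
//   rdx.sel  = select <4 x i1> mask, <4 x i32> rdx.next, <4 x i32> vec.phi
//
// and it is rdx.sel, not rdx.next, that must leave the loop, so the recorded
// loop-exit value for each part is replaced with the select.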
4368 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4369 for (unsigned Part = 0; Part < UF; ++Part) { 4370 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4371 Value *Sel = nullptr; 4372 for (User *U : VecLoopExitInst->users()) { 4373 if (isa<SelectInst>(U)) { 4374 assert(!Sel && "Reduction exit feeding two selects"); 4375 Sel = U; 4376 } else 4377 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4378 } 4379 assert(Sel && "Reduction exit feeds no select"); 4380 State.reset(LoopExitInstDef, Sel, Part); 4381 4382 // If the target can create a predicated operator for the reduction at no 4383 // extra cost in the loop (for example a predicated vadd), it can be 4384 // cheaper for the select to remain in the loop than be sunk out of it, 4385 // and so use the select value for the phi instead of the old 4386 // LoopExitValue. 4387 if (PreferPredicatedReductionSelect || 4388 TTI->preferPredicatedReductionSelect( 4389 RdxDesc.getOpcode(), PhiTy, 4390 TargetTransformInfo::ReductionFlags())) { 4391 auto *VecRdxPhi = 4392 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4393 VecRdxPhi->setIncomingValueForBlock( 4394 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4395 } 4396 } 4397 } 4398 4399 // If the vector reduction can be performed in a smaller type, we truncate 4400 // then extend the loop exit value to enable InstCombine to evaluate the 4401 // entire expression in the smaller type. 4402 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4403 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4404 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4405 Builder.SetInsertPoint( 4406 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4407 VectorParts RdxParts(UF); 4408 for (unsigned Part = 0; Part < UF; ++Part) { 4409 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4410 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4411 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4412 : Builder.CreateZExt(Trunc, VecTy); 4413 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4414 UI != RdxParts[Part]->user_end();) 4415 if (*UI != Trunc) { 4416 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4417 RdxParts[Part] = Extnd; 4418 } else { 4419 ++UI; 4420 } 4421 } 4422 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4423 for (unsigned Part = 0; Part < UF; ++Part) { 4424 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4425 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4426 } 4427 } 4428 4429 // Reduce all of the unrolled parts into a single vector. 4430 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4431 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4432 4433 // The middle block terminator has already been assigned a DebugLoc here (the 4434 // OrigLoop's single latch terminator). We want the whole middle block to 4435 // appear to execute on this line because: (a) it is all compiler generated, 4436 // (b) these instructions are always executed after evaluating the latch 4437 // conditional branch, and (c) other passes may add new predecessors which 4438 // terminate on this line. This is the easiest way to ensure we don't 4439 // accidentally cause an extra step back into the loop while debugging. 
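// Illustrative shorthand of the combining performed below, for an integer add
// reduction with VF = 4 and UF = 2:
//
//   bin.rdx = add <4 x i32> rdx.part1, rdx.part0
//   rdx     = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> bin.rdx)
//
// Min/max recurrences are combined with createMinMaxOp instead of a plain
// binary operator, and ordered reductions simply take the last unrolled part.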
4440 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4441 if (IsOrdered)
4442 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4443 else {
4444 // Floating-point operations should have some FMF to enable the reduction.
4445 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4446 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4447 for (unsigned Part = 1; Part < UF; ++Part) {
4448 Value *RdxPart = State.get(LoopExitInstDef, Part);
4449 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4450 ReducedPartRdx = Builder.CreateBinOp(
4451 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4452 } else {
4453 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4454 }
4455 }
4456 }
4457
4458 // Create the reduction after the loop. Note that inloop reductions create the
4459 // target reduction in the loop using a Reduction recipe.
4460 if (VF.isVector() && !IsInLoopReductionPhi) {
4461 ReducedPartRdx =
4462 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4463 // If the reduction can be performed in a smaller type, we need to extend
4464 // the reduction to the wider type before we branch to the original loop.
4465 if (PhiTy != RdxDesc.getRecurrenceType())
4466 ReducedPartRdx = RdxDesc.isSigned()
4467 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4468 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4469 }
4470
4471 // Create a phi node that merges control-flow from the backedge-taken check
4472 // block and the middle block.
4473 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4474 LoopScalarPreHeader->getTerminator());
4475 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4476 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4477 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4478
4479 // Now, we need to fix the users of the reduction variable
4480 // inside and outside of the scalar remainder loop.
4481
4482 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4483 // in the exit blocks. See the comment on the analogous loop in
4484 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4485 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4486 if (any_of(LCSSAPhi.incoming_values(),
4487 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4488 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4489
4490 // Fix the scalar loop reduction variable with the incoming reduction sum
4491 // from the vector body and from the backedge value.
4492 int IncomingEdgeBlockIdx =
4493 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4494 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4495 // Pick the other block.
4496 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ?
0 : 1); 4497 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4498 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4499 } 4500 4501 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4502 VPTransformState &State) { 4503 RecurKind RK = RdxDesc.getRecurrenceKind(); 4504 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4505 return; 4506 4507 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4508 assert(LoopExitInstr && "null loop exit instruction"); 4509 SmallVector<Instruction *, 8> Worklist; 4510 SmallPtrSet<Instruction *, 8> Visited; 4511 Worklist.push_back(LoopExitInstr); 4512 Visited.insert(LoopExitInstr); 4513 4514 while (!Worklist.empty()) { 4515 Instruction *Cur = Worklist.pop_back_val(); 4516 if (isa<OverflowingBinaryOperator>(Cur)) 4517 for (unsigned Part = 0; Part < UF; ++Part) { 4518 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4519 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4520 } 4521 4522 for (User *U : Cur->users()) { 4523 Instruction *UI = cast<Instruction>(U); 4524 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4525 Visited.insert(UI).second) 4526 Worklist.push_back(UI); 4527 } 4528 } 4529 } 4530 4531 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4532 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4533 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4534 // Some phis were already hand updated by the reduction and recurrence 4535 // code above, leave them alone. 4536 continue; 4537 4538 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4539 // Non-instruction incoming values will have only one value. 4540 4541 VPLane Lane = VPLane::getFirstLane(); 4542 if (isa<Instruction>(IncomingValue) && 4543 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4544 VF)) 4545 Lane = VPLane::getLastLaneForVF(VF); 4546 4547 // Can be a loop invariant incoming value or the last scalar value to be 4548 // extracted from the vectorized loop. 4549 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4550 Value *lastIncomingValue = 4551 OrigLoop->isLoopInvariant(IncomingValue) 4552 ? IncomingValue 4553 : State.get(State.Plan->getVPValue(IncomingValue), 4554 VPIteration(UF - 1, Lane)); 4555 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4556 } 4557 } 4558 4559 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4560 // The basic block and loop containing the predicated instruction. 4561 auto *PredBB = PredInst->getParent(); 4562 auto *VectorLoop = LI->getLoopFor(PredBB); 4563 4564 // Initialize a worklist with the operands of the predicated instruction. 4565 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4566 4567 // Holds instructions that we need to analyze again. An instruction may be 4568 // reanalyzed if we don't yet know if we can sink it or not. 4569 SmallVector<Instruction *, 8> InstsToReanalyze; 4570 4571 // Returns true if a given use occurs in the predicated block. Phi nodes use 4572 // their operands in their corresponding predecessor blocks. 4573 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4574 auto *I = cast<Instruction>(U.getUser()); 4575 BasicBlock *BB = I->getParent(); 4576 if (auto *Phi = dyn_cast<PHINode>(I)) 4577 BB = Phi->getIncomingBlock( 4578 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4579 return BB == PredBB; 4580 }; 4581 4582 // Iteratively sink the scalarized operands of the predicated instruction 4583 // into the block we created for it. 
When an instruction is sunk, it's 4584 // operands are then added to the worklist. The algorithm ends after one pass 4585 // through the worklist doesn't sink a single instruction. 4586 bool Changed; 4587 do { 4588 // Add the instructions that need to be reanalyzed to the worklist, and 4589 // reset the changed indicator. 4590 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4591 InstsToReanalyze.clear(); 4592 Changed = false; 4593 4594 while (!Worklist.empty()) { 4595 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4596 4597 // We can't sink an instruction if it is a phi node, is already in the 4598 // predicated block, is not in the loop, or may have side effects. 4599 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4600 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4601 continue; 4602 4603 // It's legal to sink the instruction if all its uses occur in the 4604 // predicated block. Otherwise, there's nothing to do yet, and we may 4605 // need to reanalyze the instruction. 4606 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4607 InstsToReanalyze.push_back(I); 4608 continue; 4609 } 4610 4611 // Move the instruction to the beginning of the predicated block, and add 4612 // it's operands to the worklist. 4613 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4614 Worklist.insert(I->op_begin(), I->op_end()); 4615 4616 // The sinking may have enabled other instructions to be sunk, so we will 4617 // need to iterate. 4618 Changed = true; 4619 } 4620 } while (Changed); 4621 } 4622 4623 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4624 for (PHINode *OrigPhi : OrigPHIsToFix) { 4625 VPWidenPHIRecipe *VPPhi = 4626 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4627 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4628 // Make sure the builder has a valid insert point. 4629 Builder.SetInsertPoint(NewPhi); 4630 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4631 VPValue *Inc = VPPhi->getIncomingValue(i); 4632 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4633 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4634 } 4635 } 4636 } 4637 4638 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4639 VPUser &Operands, unsigned UF, 4640 ElementCount VF, bool IsPtrLoopInvariant, 4641 SmallBitVector &IsIndexLoopInvariant, 4642 VPTransformState &State) { 4643 // Construct a vector GEP by widening the operands of the scalar GEP as 4644 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4645 // results in a vector of pointers when at least one operand of the GEP 4646 // is vector-typed. Thus, to keep the representation compact, we only use 4647 // vector-typed operands for loop-varying values. 4648 4649 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4650 // If we are vectorizing, but the GEP has only loop-invariant operands, 4651 // the GEP we build (by only using vector-typed operands for 4652 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4653 // produce a vector of pointers, we need to either arbitrarily pick an 4654 // operand to broadcast, or broadcast a clone of the original GEP. 4655 // Here, we broadcast a clone of the original. 4656 // 4657 // TODO: If at some point we decide to scalarize instructions having 4658 // loop-invariant operands, this special case will no longer be 4659 // required. 
We would add the scalarization decision to 4660 // collectLoopScalars() and teach getVectorValue() to broadcast 4661 // the lane-zero scalar value. 4662 auto *Clone = Builder.Insert(GEP->clone()); 4663 for (unsigned Part = 0; Part < UF; ++Part) { 4664 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4665 State.set(VPDef, EntryPart, Part); 4666 addMetadata(EntryPart, GEP); 4667 } 4668 } else { 4669 // If the GEP has at least one loop-varying operand, we are sure to 4670 // produce a vector of pointers. But if we are only unrolling, we want 4671 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4672 // produce with the code below will be scalar (if VF == 1) or vector 4673 // (otherwise). Note that for the unroll-only case, we still maintain 4674 // values in the vector mapping with initVector, as we do for other 4675 // instructions. 4676 for (unsigned Part = 0; Part < UF; ++Part) { 4677 // The pointer operand of the new GEP. If it's loop-invariant, we 4678 // won't broadcast it. 4679 auto *Ptr = IsPtrLoopInvariant 4680 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4681 : State.get(Operands.getOperand(0), Part); 4682 4683 // Collect all the indices for the new GEP. If any index is 4684 // loop-invariant, we won't broadcast it. 4685 SmallVector<Value *, 4> Indices; 4686 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4687 VPValue *Operand = Operands.getOperand(I); 4688 if (IsIndexLoopInvariant[I - 1]) 4689 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4690 else 4691 Indices.push_back(State.get(Operand, Part)); 4692 } 4693 4694 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4695 // but it should be a vector, otherwise. 4696 auto *NewGEP = 4697 GEP->isInBounds() 4698 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4699 Indices) 4700 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4701 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4702 "NewGEP is not a pointer vector"); 4703 State.set(VPDef, NewGEP, Part); 4704 addMetadata(NewGEP, GEP); 4705 } 4706 } 4707 } 4708 4709 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4710 RecurrenceDescriptor *RdxDesc, 4711 VPWidenPHIRecipe *PhiR, 4712 VPTransformState &State) { 4713 PHINode *P = cast<PHINode>(PN); 4714 if (EnableVPlanNativePath) { 4715 // Currently we enter here in the VPlan-native path for non-induction 4716 // PHIs where all control flow is uniform. We simply widen these PHIs. 4717 // Create a vector phi with no operands - the vector phi operands will be 4718 // set at the end of vector code generation. 4719 Type *VecTy = (State.VF.isScalar()) 4720 ? PN->getType() 4721 : VectorType::get(PN->getType(), State.VF); 4722 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4723 State.set(PhiR, VecPhi, 0); 4724 OrigPHIsToFix.push_back(P); 4725 4726 return; 4727 } 4728 4729 assert(PN->getParent() == OrigLoop->getHeader() && 4730 "Non-header phis should have been handled elsewhere"); 4731 4732 VPValue *StartVPV = PhiR->getStartValue(); 4733 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr; 4734 // In order to support recurrences we need to be able to vectorize Phi nodes. 4735 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4736 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4737 // this value when we vectorize all of the instructions that use the PHI. 
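// For illustration (shorthand, add reduction, fixed VF = 4, not in-loop): the
// phi created below starts from the identity vector with the scalar start
// value inserted into lane 0:
//
//   vector.ph:
//     start.vec = <%start, 0, 0, 0>
//   vector.body:
//     vec.phi = phi [ start.vec, vector.ph ], [ ... ]  ; backedge added later
//
// Min/max recurrences instead splat the start value itself, since it already
// acts as their identity.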
4738 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4739 Value *Iden = nullptr;
4740 bool ScalarPHI =
4741 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4742 Type *VecTy =
4743 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4744
4745 if (RdxDesc) {
4746 assert(Legal->isReductionVariable(P) && StartV &&
4747 "RdxDesc should only be set for reduction variables; in that case "
4748 "a StartV is also required");
4749 RecurKind RK = RdxDesc->getRecurrenceKind();
4750 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4751 // MinMax reductions have the start value as their identity.
4752 if (ScalarPHI) {
4753 Iden = StartV;
4754 } else {
4755 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4756 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4757 StartV = Iden =
4758 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4759 }
4760 } else {
4761 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4762 RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
4763 Iden = IdenC;
4764
4765 if (!ScalarPHI) {
4766 Iden = ConstantVector::getSplat(State.VF, IdenC);
4767 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4768 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4769 Constant *Zero = Builder.getInt32(0);
4770 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4771 }
4772 }
4773 }
4774
4775 bool IsOrdered = State.VF.isVector() &&
4776 Cost->isInLoopReduction(cast<PHINode>(PN)) &&
4777 useOrderedReductions(*RdxDesc);
4778
4779 for (unsigned Part = 0; Part < State.UF; ++Part) {
4780 // This is phase one of vectorizing PHIs.
4781 if (Part > 0 && IsOrdered)
4782 return;
4783 Value *EntryPart = PHINode::Create(
4784 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4785 State.set(PhiR, EntryPart, Part);
4786 if (StartV) {
4787 // Make sure to add the reduction start value only to the
4788 // first unroll part.
4789 Value *StartVal = (Part == 0) ? StartV : Iden;
4790 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4791 }
4792 }
4793 return;
4794 }
4795
4796 assert(!Legal->isReductionVariable(P) &&
4797 "reductions should be handled above");
4798
4799 setDebugLocFromInst(Builder, P);
4800
4801 // This PHINode must be an induction variable.
4802 // Make sure that we know about it.
4803 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4804
4805 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4806 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4807
4808 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4809 // which can be found from the original scalar operations.
4810 switch (II.getKind()) {
4811 case InductionDescriptor::IK_NoInduction:
4812 llvm_unreachable("Unknown induction");
4813 case InductionDescriptor::IK_IntInduction:
4814 case InductionDescriptor::IK_FpInduction:
4815 llvm_unreachable("Integer/fp induction is handled elsewhere.");
4816 case InductionDescriptor::IK_PtrInduction: {
4817 // Handle the pointer induction variable case.
4818 assert(P->getType()->isPointerTy() && "Unexpected type.");
4819
4820 if (Cost->isScalarAfterVectorization(P, State.VF)) {
4821 // This is the normalized GEP that starts counting at zero.
4822 Value *PtrInd =
4823 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4824 // Determine the number of scalars we need to generate for each unroll
4825 // iteration.
If the instruction is uniform, we only need to generate the 4826 // first lane. Otherwise, we generate all VF values. 4827 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4828 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); 4829 4830 bool NeedsVectorIndex = !IsUniform && VF.isScalable(); 4831 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr; 4832 if (NeedsVectorIndex) { 4833 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF); 4834 UnitStepVec = Builder.CreateStepVector(VecIVTy); 4835 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd); 4836 } 4837 4838 for (unsigned Part = 0; Part < UF; ++Part) { 4839 Value *PartStart = createStepForVF( 4840 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4841 4842 if (NeedsVectorIndex) { 4843 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart); 4844 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); 4845 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); 4846 Value *SclrGep = 4847 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); 4848 SclrGep->setName("next.gep"); 4849 State.set(PhiR, SclrGep, Part); 4850 // We've cached the whole vector, which means we can support the 4851 // extraction of any lane. 4852 continue; 4853 } 4854 4855 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4856 Value *Idx = Builder.CreateAdd( 4857 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4858 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4859 Value *SclrGep = 4860 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4861 SclrGep->setName("next.gep"); 4862 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4863 } 4864 } 4865 return; 4866 } 4867 assert(isa<SCEVConstant>(II.getStep()) && 4868 "Induction step not a SCEV constant!"); 4869 Type *PhiType = II.getStep()->getType(); 4870 4871 // Build a pointer phi 4872 Value *ScalarStartValue = II.getStartValue(); 4873 Type *ScStValueType = ScalarStartValue->getType(); 4874 PHINode *NewPointerPhi = 4875 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4876 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4877 4878 // A pointer induction, performed by using a gep 4879 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4880 Instruction *InductionLoc = LoopLatch->getTerminator(); 4881 const SCEV *ScalarStep = II.getStep(); 4882 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4883 Value *ScalarStepValue = 4884 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4885 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4886 Value *NumUnrolledElems = 4887 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4888 Value *InductionGEP = GetElementPtrInst::Create( 4889 ScStValueType->getPointerElementType(), NewPointerPhi, 4890 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4891 InductionLoc); 4892 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4893 4894 // Create UF many actual address geps that use the pointer 4895 // phi as base and a vectorized version of the step value 4896 // (<step*0, ..., step*N>) as offset. 4897 for (unsigned Part = 0; Part < State.UF; ++Part) { 4898 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4899 Value *StartOffsetScalar = 4900 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4901 Value *StartOffset = 4902 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4903 // Create a vector of consecutive numbers from zero to VF. 
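// For illustration (assuming a fixed VF of 4 and Part == 1): StartOffset below
// becomes <4, 5, 6, 7>, so the resulting "vector.gep" indexes the pointer phi
// with <4*step, 5*step, 6*step, 7*step>, i.e. one address per lane of this
// unrolled part.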
4904 StartOffset = 4905 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4906 4907 Value *GEP = Builder.CreateGEP( 4908 ScStValueType->getPointerElementType(), NewPointerPhi, 4909 Builder.CreateMul( 4910 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4911 "vector.gep")); 4912 State.set(PhiR, GEP, Part); 4913 } 4914 } 4915 } 4916 } 4917 4918 /// A helper function for checking whether an integer division-related 4919 /// instruction may divide by zero (in which case it must be predicated if 4920 /// executed conditionally in the scalar code). 4921 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4922 /// Non-zero divisors that are non compile-time constants will not be 4923 /// converted into multiplication, so we will still end up scalarizing 4924 /// the division, but can do so w/o predication. 4925 static bool mayDivideByZero(Instruction &I) { 4926 assert((I.getOpcode() == Instruction::UDiv || 4927 I.getOpcode() == Instruction::SDiv || 4928 I.getOpcode() == Instruction::URem || 4929 I.getOpcode() == Instruction::SRem) && 4930 "Unexpected instruction"); 4931 Value *Divisor = I.getOperand(1); 4932 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4933 return !CInt || CInt->isZero(); 4934 } 4935 4936 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4937 VPUser &User, 4938 VPTransformState &State) { 4939 switch (I.getOpcode()) { 4940 case Instruction::Call: 4941 case Instruction::Br: 4942 case Instruction::PHI: 4943 case Instruction::GetElementPtr: 4944 case Instruction::Select: 4945 llvm_unreachable("This instruction is handled by a different recipe."); 4946 case Instruction::UDiv: 4947 case Instruction::SDiv: 4948 case Instruction::SRem: 4949 case Instruction::URem: 4950 case Instruction::Add: 4951 case Instruction::FAdd: 4952 case Instruction::Sub: 4953 case Instruction::FSub: 4954 case Instruction::FNeg: 4955 case Instruction::Mul: 4956 case Instruction::FMul: 4957 case Instruction::FDiv: 4958 case Instruction::FRem: 4959 case Instruction::Shl: 4960 case Instruction::LShr: 4961 case Instruction::AShr: 4962 case Instruction::And: 4963 case Instruction::Or: 4964 case Instruction::Xor: { 4965 // Just widen unops and binops. 4966 setDebugLocFromInst(Builder, &I); 4967 4968 for (unsigned Part = 0; Part < UF; ++Part) { 4969 SmallVector<Value *, 2> Ops; 4970 for (VPValue *VPOp : User.operands()) 4971 Ops.push_back(State.get(VPOp, Part)); 4972 4973 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4974 4975 if (auto *VecOp = dyn_cast<Instruction>(V)) 4976 VecOp->copyIRFlags(&I); 4977 4978 // Use this vector value for all users of the original instruction. 4979 State.set(Def, V, Part); 4980 addMetadata(V, &I); 4981 } 4982 4983 break; 4984 } 4985 case Instruction::ICmp: 4986 case Instruction::FCmp: { 4987 // Widen compares. Generate vector compares. 4988 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4989 auto *Cmp = cast<CmpInst>(&I); 4990 setDebugLocFromInst(Builder, Cmp); 4991 for (unsigned Part = 0; Part < UF; ++Part) { 4992 Value *A = State.get(User.getOperand(0), Part); 4993 Value *B = State.get(User.getOperand(1), Part); 4994 Value *C = nullptr; 4995 if (FCmp) { 4996 // Propagate fast math flags. 
4997 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4998 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4999 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 5000 } else { 5001 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 5002 } 5003 State.set(Def, C, Part); 5004 addMetadata(C, &I); 5005 } 5006 5007 break; 5008 } 5009 5010 case Instruction::ZExt: 5011 case Instruction::SExt: 5012 case Instruction::FPToUI: 5013 case Instruction::FPToSI: 5014 case Instruction::FPExt: 5015 case Instruction::PtrToInt: 5016 case Instruction::IntToPtr: 5017 case Instruction::SIToFP: 5018 case Instruction::UIToFP: 5019 case Instruction::Trunc: 5020 case Instruction::FPTrunc: 5021 case Instruction::BitCast: { 5022 auto *CI = cast<CastInst>(&I); 5023 setDebugLocFromInst(Builder, CI); 5024 5025 /// Vectorize casts. 5026 Type *DestTy = 5027 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 5028 5029 for (unsigned Part = 0; Part < UF; ++Part) { 5030 Value *A = State.get(User.getOperand(0), Part); 5031 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 5032 State.set(Def, Cast, Part); 5033 addMetadata(Cast, &I); 5034 } 5035 break; 5036 } 5037 default: 5038 // This instruction is not vectorized by simple widening. 5039 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 5040 llvm_unreachable("Unhandled instruction!"); 5041 } // end of switch. 5042 } 5043 5044 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 5045 VPUser &ArgOperands, 5046 VPTransformState &State) { 5047 assert(!isa<DbgInfoIntrinsic>(I) && 5048 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 5049 setDebugLocFromInst(Builder, &I); 5050 5051 Module *M = I.getParent()->getParent()->getParent(); 5052 auto *CI = cast<CallInst>(&I); 5053 5054 SmallVector<Type *, 4> Tys; 5055 for (Value *ArgOperand : CI->arg_operands()) 5056 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 5057 5058 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 5059 5060 // The flag shows whether we use Intrinsic or a usual Call for vectorized 5061 // version of the instruction. 5062 // Is it beneficial to perform intrinsic call compared to lib call? 5063 bool NeedToScalarize = false; 5064 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 5065 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 5066 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 5067 assert((UseVectorIntrinsic || !NeedToScalarize) && 5068 "Instruction should be scalarized elsewhere."); 5069 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 5070 "Either the intrinsic cost or vector call cost must be valid"); 5071 5072 for (unsigned Part = 0; Part < UF; ++Part) { 5073 SmallVector<Value *, 4> Args; 5074 for (auto &I : enumerate(ArgOperands.operands())) { 5075 // Some intrinsics have a scalar argument - don't replace it with a 5076 // vector. 5077 Value *Arg; 5078 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5079 Arg = State.get(I.value(), Part); 5080 else 5081 Arg = State.get(I.value(), VPIteration(0, 0)); 5082 Args.push_back(Arg); 5083 } 5084 5085 Function *VectorF; 5086 if (UseVectorIntrinsic) { 5087 // Use vector version of the intrinsic. 
5088 Type *TysForDecl[] = {CI->getType()}; 5089 if (VF.isVector()) 5090 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5091 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5092 assert(VectorF && "Can't retrieve vector intrinsic."); 5093 } else { 5094 // Use vector version of the function call. 5095 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5096 #ifndef NDEBUG 5097 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5098 "Can't create vector function."); 5099 #endif 5100 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5101 } 5102 SmallVector<OperandBundleDef, 1> OpBundles; 5103 CI->getOperandBundlesAsDefs(OpBundles); 5104 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5105 5106 if (isa<FPMathOperator>(V)) 5107 V->copyFastMathFlags(CI); 5108 5109 State.set(Def, V, Part); 5110 addMetadata(V, &I); 5111 } 5112 } 5113 5114 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5115 VPUser &Operands, 5116 bool InvariantCond, 5117 VPTransformState &State) { 5118 setDebugLocFromInst(Builder, &I); 5119 5120 // The condition can be loop invariant but still defined inside the 5121 // loop. This means that we can't just use the original 'cond' value. 5122 // We have to take the 'vectorized' value and pick the first lane. 5123 // Instcombine will make this a no-op. 5124 auto *InvarCond = InvariantCond 5125 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5126 : nullptr; 5127 5128 for (unsigned Part = 0; Part < UF; ++Part) { 5129 Value *Cond = 5130 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5131 Value *Op0 = State.get(Operands.getOperand(1), Part); 5132 Value *Op1 = State.get(Operands.getOperand(2), Part); 5133 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5134 State.set(VPDef, Sel, Part); 5135 addMetadata(Sel, &I); 5136 } 5137 } 5138 5139 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5140 // We should not collect Scalars more than once per VF. Right now, this 5141 // function is called from collectUniformsAndScalars(), which already does 5142 // this check. Collecting Scalars for VF=1 does not make any sense. 5143 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5144 "This function should not be visited twice for the same VF"); 5145 5146 SmallSetVector<Instruction *, 8> Worklist; 5147 5148 // These sets are used to seed the analysis with pointers used by memory 5149 // accesses that will remain scalar. 5150 SmallSetVector<Instruction *, 8> ScalarPtrs; 5151 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5152 auto *Latch = TheLoop->getLoopLatch(); 5153 5154 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5155 // The pointer operands of loads and stores will be scalar as long as the 5156 // memory access is not a gather or scatter operation. The value operand of a 5157 // store will remain scalar if the store is scalarized. 
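// For illustration: in
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[idx[i]];
//
// the store to a[i] is consecutive, so its pointer operand can remain scalar
// (a single pointer per vector access), whereas the load of b[idx[i]] becomes
// a gather and needs a vector of pointers.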
5158 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5159 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5160 assert(WideningDecision != CM_Unknown &&
5161 "Widening decision should be ready at this moment");
5162 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5163 if (Ptr == Store->getValueOperand())
5164 return WideningDecision == CM_Scalarize;
5165 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5166 "Ptr is neither a value nor a pointer operand");
5167 return WideningDecision != CM_GatherScatter;
5168 };
5169
5170 // A helper that returns true if the given value is a bitcast or
5171 // getelementptr instruction contained in the loop.
5172 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5173 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5174 isa<GetElementPtrInst>(V)) &&
5175 !TheLoop->isLoopInvariant(V);
5176 };
5177
5178 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5179 if (!isa<PHINode>(Ptr) ||
5180 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5181 return false;
5182 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5183 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5184 return false;
5185 return isScalarUse(MemAccess, Ptr);
5186 };
5187
5188 // A helper that evaluates a memory access's use of a pointer. If the
5189 // pointer is the pointer induction of the loop, it is inserted into
5190 // Worklist. If the use will be a scalar use, and the pointer is only used
5191 // by memory accesses, we place the pointer in ScalarPtrs. Otherwise, the
5192 // pointer is placed in PossibleNonScalarPtrs.
5193 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5194 if (isScalarPtrInduction(MemAccess, Ptr)) {
5195 Worklist.insert(cast<Instruction>(Ptr));
5196 Instruction *Update = cast<Instruction>(
5197 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5198 Worklist.insert(Update);
5199 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5200 << "\n");
5201 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5202 << "\n");
5203 return;
5204 }
5205 // We only care about bitcast and getelementptr instructions contained in
5206 // the loop.
5207 if (!isLoopVaryingBitCastOrGEP(Ptr))
5208 return;
5209
5210 // If the pointer has already been identified as scalar (e.g., if it was
5211 // also identified as uniform), there's nothing to do.
5212 auto *I = cast<Instruction>(Ptr);
5213 if (Worklist.count(I))
5214 return;
5215
5216 // If the use of the pointer will be a scalar use, and all users of the
5217 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5218 // place the pointer in PossibleNonScalarPtrs.
5219 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5220 return isa<LoadInst>(U) || isa<StoreInst>(U);
5221 }))
5222 ScalarPtrs.insert(I);
5223 else
5224 PossibleNonScalarPtrs.insert(I);
5225 };
5226
5227 // We seed the scalars analysis with two classes of instructions: (1)
5228 // instructions marked uniform-after-vectorization and (2) bitcast,
5229 // getelementptr and (pointer) phi instructions used by memory accesses
5230 // requiring a scalar use.
5231 //
5232 // (1) Add to the worklist all instructions that have been identified as
5233 // uniform-after-vectorization.
5234 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5235 5236 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5237 // memory accesses requiring a scalar use. The pointer operands of loads and 5238 // stores will be scalar as long as the memory accesses is not a gather or 5239 // scatter operation. The value operand of a store will remain scalar if the 5240 // store is scalarized. 5241 for (auto *BB : TheLoop->blocks()) 5242 for (auto &I : *BB) { 5243 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5244 evaluatePtrUse(Load, Load->getPointerOperand()); 5245 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5246 evaluatePtrUse(Store, Store->getPointerOperand()); 5247 evaluatePtrUse(Store, Store->getValueOperand()); 5248 } 5249 } 5250 for (auto *I : ScalarPtrs) 5251 if (!PossibleNonScalarPtrs.count(I)) { 5252 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5253 Worklist.insert(I); 5254 } 5255 5256 // Insert the forced scalars. 5257 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5258 // induction variable when the PHI user is scalarized. 5259 auto ForcedScalar = ForcedScalars.find(VF); 5260 if (ForcedScalar != ForcedScalars.end()) 5261 for (auto *I : ForcedScalar->second) 5262 Worklist.insert(I); 5263 5264 // Expand the worklist by looking through any bitcasts and getelementptr 5265 // instructions we've already identified as scalar. This is similar to the 5266 // expansion step in collectLoopUniforms(); however, here we're only 5267 // expanding to include additional bitcasts and getelementptr instructions. 5268 unsigned Idx = 0; 5269 while (Idx != Worklist.size()) { 5270 Instruction *Dst = Worklist[Idx++]; 5271 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5272 continue; 5273 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5274 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5275 auto *J = cast<Instruction>(U); 5276 return !TheLoop->contains(J) || Worklist.count(J) || 5277 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5278 isScalarUse(J, Src)); 5279 })) { 5280 Worklist.insert(Src); 5281 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5282 } 5283 } 5284 5285 // An induction variable will remain scalar if all users of the induction 5286 // variable and induction variable update remain scalar. 5287 for (auto &Induction : Legal->getInductionVars()) { 5288 auto *Ind = Induction.first; 5289 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5290 5291 // If tail-folding is applied, the primary induction variable will be used 5292 // to feed a vector compare. 5293 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5294 continue; 5295 5296 // Determine if all users of the induction variable are scalar after 5297 // vectorization. 5298 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5299 auto *I = cast<Instruction>(U); 5300 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5301 }); 5302 if (!ScalarInd) 5303 continue; 5304 5305 // Determine if all users of the induction variable update instruction are 5306 // scalar after vectorization. 5307 auto ScalarIndUpdate = 5308 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5309 auto *I = cast<Instruction>(U); 5310 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5311 }); 5312 if (!ScalarIndUpdate) 5313 continue; 5314 5315 // The induction variable and its update instruction will remain scalar. 
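// For illustration: an induction i whose only in-loop users are its own
// update i = i + 1 and addresses that are themselves scalar after
// vectorization (e.g. the pointer of a consecutive access) passes both checks
// above and stays scalar; if any user is widened instead, neither i nor its
// update is added here.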
5316 Worklist.insert(Ind); 5317 Worklist.insert(IndUpdate); 5318 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5319 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5320 << "\n"); 5321 } 5322 5323 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5324 } 5325 5326 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { 5327 if (!blockNeedsPredication(I->getParent())) 5328 return false; 5329 switch(I->getOpcode()) { 5330 default: 5331 break; 5332 case Instruction::Load: 5333 case Instruction::Store: { 5334 if (!Legal->isMaskRequired(I)) 5335 return false; 5336 auto *Ptr = getLoadStorePointerOperand(I); 5337 auto *Ty = getMemInstValueType(I); 5338 const Align Alignment = getLoadStoreAlignment(I); 5339 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5340 isLegalMaskedGather(Ty, Alignment)) 5341 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5342 isLegalMaskedScatter(Ty, Alignment)); 5343 } 5344 case Instruction::UDiv: 5345 case Instruction::SDiv: 5346 case Instruction::SRem: 5347 case Instruction::URem: 5348 return mayDivideByZero(*I); 5349 } 5350 return false; 5351 } 5352 5353 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5354 Instruction *I, ElementCount VF) { 5355 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5356 assert(getWideningDecision(I, VF) == CM_Unknown && 5357 "Decision should not be set yet."); 5358 auto *Group = getInterleavedAccessGroup(I); 5359 assert(Group && "Must have a group."); 5360 5361 // If the instruction's allocated size doesn't equal it's type size, it 5362 // requires padding and will be scalarized. 5363 auto &DL = I->getModule()->getDataLayout(); 5364 auto *ScalarTy = getMemInstValueType(I); 5365 if (hasIrregularType(ScalarTy, DL)) 5366 return false; 5367 5368 // Check if masking is required. 5369 // A Group may need masking for one of two reasons: it resides in a block that 5370 // needs predication, or it was decided to use masking to deal with gaps. 5371 bool PredicatedAccessRequiresMasking = 5372 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5373 bool AccessWithGapsRequiresMasking = 5374 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5375 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5376 return true; 5377 5378 // If masked interleaving is required, we expect that the user/target had 5379 // enabled it, because otherwise it either wouldn't have been created or 5380 // it should have been invalidated by the CostModel. 5381 assert(useMaskedInterleavedAccesses(TTI) && 5382 "Masked interleave-groups for predicated accesses are not enabled."); 5383 5384 auto *Ty = getMemInstValueType(I); 5385 const Align Alignment = getLoadStoreAlignment(I); 5386 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5387 : TTI.isLegalMaskedStore(Ty, Alignment); 5388 } 5389 5390 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5391 Instruction *I, ElementCount VF) { 5392 // Get and ensure we have a valid memory instruction. 5393 LoadInst *LI = dyn_cast<LoadInst>(I); 5394 StoreInst *SI = dyn_cast<StoreInst>(I); 5395 assert((LI || SI) && "Invalid memory instruction"); 5396 5397 auto *Ptr = getLoadStorePointerOperand(I); 5398 5399 // In order to be widened, the pointer should be consecutive, first of all. 
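// For illustration: a load of a[i] (stride one) has a consecutive pointer and
// can be widened into a single wide load, whereas a[2 * i] or a[idx[i]] is
// not consecutive and is handled by scalarization or a gather/scatter
// instead.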
5400 if (!Legal->isConsecutivePtr(Ptr)) 5401 return false; 5402 5403 // If the instruction is a store located in a predicated block, it will be 5404 // scalarized. 5405 if (isScalarWithPredication(I)) 5406 return false; 5407 5408 // If the instruction's allocated size doesn't equal it's type size, it 5409 // requires padding and will be scalarized. 5410 auto &DL = I->getModule()->getDataLayout(); 5411 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5412 if (hasIrregularType(ScalarTy, DL)) 5413 return false; 5414 5415 return true; 5416 } 5417 5418 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5419 // We should not collect Uniforms more than once per VF. Right now, 5420 // this function is called from collectUniformsAndScalars(), which 5421 // already does this check. Collecting Uniforms for VF=1 does not make any 5422 // sense. 5423 5424 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5425 "This function should not be visited twice for the same VF"); 5426 5427 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5428 // not analyze again. Uniforms.count(VF) will return 1. 5429 Uniforms[VF].clear(); 5430 5431 // We now know that the loop is vectorizable! 5432 // Collect instructions inside the loop that will remain uniform after 5433 // vectorization. 5434 5435 // Global values, params and instructions outside of current loop are out of 5436 // scope. 5437 auto isOutOfScope = [&](Value *V) -> bool { 5438 Instruction *I = dyn_cast<Instruction>(V); 5439 return (!I || !TheLoop->contains(I)); 5440 }; 5441 5442 SetVector<Instruction *> Worklist; 5443 BasicBlock *Latch = TheLoop->getLoopLatch(); 5444 5445 // Instructions that are scalar with predication must not be considered 5446 // uniform after vectorization, because that would create an erroneous 5447 // replicating region where only a single instance out of VF should be formed. 5448 // TODO: optimize such seldom cases if found important, see PR40816. 5449 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5450 if (isOutOfScope(I)) { 5451 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5452 << *I << "\n"); 5453 return; 5454 } 5455 if (isScalarWithPredication(I)) { 5456 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5457 << *I << "\n"); 5458 return; 5459 } 5460 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5461 Worklist.insert(I); 5462 }; 5463 5464 // Start with the conditional branch. If the branch condition is an 5465 // instruction contained in the loop that is only used by the branch, it is 5466 // uniform. 5467 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5468 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5469 addToWorklistIfAllowed(Cmp); 5470 5471 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5472 InstWidening WideningDecision = getWideningDecision(I, VF); 5473 assert(WideningDecision != CM_Unknown && 5474 "Widening decision should be ready at this moment"); 5475 5476 // A uniform memory op is itself uniform. We exclude uniform stores 5477 // here as they demand the last lane, not the first one. 
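// For illustration: a load from a loop-invariant address, e.g. x = *p inside
// the loop, only needs lane 0 and is therefore uniform here; a store to an
// invariant address, e.g. *p = y, must produce the value of the last lane,
// which is why uniform stores are excluded.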
5478 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5479 assert(WideningDecision == CM_Scalarize); 5480 return true; 5481 } 5482 5483 return (WideningDecision == CM_Widen || 5484 WideningDecision == CM_Widen_Reverse || 5485 WideningDecision == CM_Interleave); 5486 }; 5487 5488 5489 // Returns true if Ptr is the pointer operand of a memory access instruction 5490 // I, and I is known to not require scalarization. 5491 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5492 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5493 }; 5494 5495 // Holds a list of values which are known to have at least one uniform use. 5496 // Note that there may be other uses which aren't uniform. A "uniform use" 5497 // here is something which only demands lane 0 of the unrolled iterations; 5498 // it does not imply that all lanes produce the same value (e.g. this is not 5499 // the usual meaning of uniform) 5500 SetVector<Value *> HasUniformUse; 5501 5502 // Scan the loop for instructions which are either a) known to have only 5503 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5504 for (auto *BB : TheLoop->blocks()) 5505 for (auto &I : *BB) { 5506 // If there's no pointer operand, there's nothing to do. 5507 auto *Ptr = getLoadStorePointerOperand(&I); 5508 if (!Ptr) 5509 continue; 5510 5511 // A uniform memory op is itself uniform. We exclude uniform stores 5512 // here as they demand the last lane, not the first one. 5513 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5514 addToWorklistIfAllowed(&I); 5515 5516 if (isUniformDecision(&I, VF)) { 5517 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5518 HasUniformUse.insert(Ptr); 5519 } 5520 } 5521 5522 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5523 // demanding) users. Since loops are assumed to be in LCSSA form, this 5524 // disallows uses outside the loop as well. 5525 for (auto *V : HasUniformUse) { 5526 if (isOutOfScope(V)) 5527 continue; 5528 auto *I = cast<Instruction>(V); 5529 auto UsersAreMemAccesses = 5530 llvm::all_of(I->users(), [&](User *U) -> bool { 5531 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5532 }); 5533 if (UsersAreMemAccesses) 5534 addToWorklistIfAllowed(I); 5535 } 5536 5537 // Expand Worklist in topological order: whenever a new instruction 5538 // is added , its users should be already inside Worklist. It ensures 5539 // a uniform instruction will only be used by uniform instructions. 5540 unsigned idx = 0; 5541 while (idx != Worklist.size()) { 5542 Instruction *I = Worklist[idx++]; 5543 5544 for (auto OV : I->operand_values()) { 5545 // isOutOfScope operands cannot be uniform instructions. 5546 if (isOutOfScope(OV)) 5547 continue; 5548 // First order recurrence Phi's should typically be considered 5549 // non-uniform. 5550 auto *OP = dyn_cast<PHINode>(OV); 5551 if (OP && Legal->isFirstOrderRecurrence(OP)) 5552 continue; 5553 // If all the users of the operand are uniform, then add the 5554 // operand into the uniform worklist. 5555 auto *OI = cast<Instruction>(OV); 5556 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5557 auto *J = cast<Instruction>(U); 5558 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5559 })) 5560 addToWorklistIfAllowed(OI); 5561 } 5562 } 5563 5564 // For an instruction to be added into Worklist above, all its users inside 5565 // the loop should also be in Worklist. 
However, this condition cannot be 5566 // true for phi nodes that form a cyclic dependence. We must process phi 5567 // nodes separately. An induction variable will remain uniform if all users 5568 // of the induction variable and induction variable update remain uniform. 5569 // The code below handles both pointer and non-pointer induction variables. 5570 for (auto &Induction : Legal->getInductionVars()) { 5571 auto *Ind = Induction.first; 5572 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5573 5574 // Determine if all users of the induction variable are uniform after 5575 // vectorization. 5576 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5577 auto *I = cast<Instruction>(U); 5578 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5579 isVectorizedMemAccessUse(I, Ind); 5580 }); 5581 if (!UniformInd) 5582 continue; 5583 5584 // Determine if all users of the induction variable update instruction are 5585 // uniform after vectorization. 5586 auto UniformIndUpdate = 5587 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5588 auto *I = cast<Instruction>(U); 5589 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5590 isVectorizedMemAccessUse(I, IndUpdate); 5591 }); 5592 if (!UniformIndUpdate) 5593 continue; 5594 5595 // The induction variable and its update instruction will remain uniform. 5596 addToWorklistIfAllowed(Ind); 5597 addToWorklistIfAllowed(IndUpdate); 5598 } 5599 5600 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5601 } 5602 5603 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5604 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5605 5606 if (Legal->getRuntimePointerChecking()->Need) { 5607 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5608 "runtime pointer checks needed. Enable vectorization of this " 5609 "loop with '#pragma clang loop vectorize(enable)' when " 5610 "compiling with -Os/-Oz", 5611 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5612 return true; 5613 } 5614 5615 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5616 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5617 "runtime SCEV checks needed. Enable vectorization of this " 5618 "loop with '#pragma clang loop vectorize(enable)' when " 5619 "compiling with -Os/-Oz", 5620 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5621 return true; 5622 } 5623 5624 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5625 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5626 reportVectorizationFailure("Runtime stride check for small trip count", 5627 "runtime stride == 1 checks needed. Enable vectorization of " 5628 "this loop without such check by compiling with -Os/-Oz", 5629 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5630 return true; 5631 } 5632 5633 return false; 5634 } 5635 5636 ElementCount 5637 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5638 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5639 reportVectorizationInfo( 5640 "Disabling scalable vectorization, because target does not " 5641 "support scalable vectors.", 5642 "ScalableVectorsUnsupported", ORE, TheLoop); 5643 return ElementCount::getScalable(0); 5644 } 5645 5646 auto MaxScalableVF = ElementCount::getScalable( 5647 std::numeric_limits<ElementCount::ScalarTy>::max()); 5648 5649 // Disable scalable vectorization if the loop contains unsupported reductions. 
5650 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5651 // FIXME: While for scalable vectors this is currently sufficient, this should 5652 // be replaced by a more detailed mechanism that filters out specific VFs, 5653 // instead of invalidating vectorization for a whole set of VFs based on the 5654 // MaxVF. 5655 if (!canVectorizeReductions(MaxScalableVF)) { 5656 reportVectorizationInfo( 5657 "Scalable vectorization not supported for the reduction " 5658 "operations found in this loop.", 5659 "ScalableVFUnfeasible", ORE, TheLoop); 5660 return ElementCount::getScalable(0); 5661 } 5662 5663 if (Legal->isSafeForAnyVectorWidth()) 5664 return MaxScalableVF; 5665 5666 // Limit MaxScalableVF by the maximum safe dependence distance. 5667 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5668 MaxScalableVF = ElementCount::getScalable( 5669 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5670 if (!MaxScalableVF) 5671 reportVectorizationInfo( 5672 "Max legal vector width too small, scalable vectorization " 5673 "unfeasible.", 5674 "ScalableVFUnfeasible", ORE, TheLoop); 5675 5676 return MaxScalableVF; 5677 } 5678 5679 ElementCount 5680 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5681 ElementCount UserVF) { 5682 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5683 unsigned SmallestType, WidestType; 5684 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5685 5686 // Get the maximum safe dependence distance in bits computed by LAA. 5687 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5688 // the memory accesses that is most restrictive (involved in the smallest 5689 // dependence distance). 5690 unsigned MaxSafeElements = 5691 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5692 5693 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5694 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5695 5696 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5697 << ".\n"); 5698 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5699 << ".\n"); 5700 5701 // First analyze the UserVF, fall back if the UserVF should be ignored. 5702 if (UserVF) { 5703 auto MaxSafeUserVF = 5704 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5705 5706 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) 5707 return UserVF; 5708 5709 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5710 5711 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5712 // is better to ignore the hint and let the compiler choose a suitable VF. 5713 if (!UserVF.isScalable()) { 5714 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5715 << " is unsafe, clamping to max safe VF=" 5716 << MaxSafeFixedVF << ".\n"); 5717 ORE->emit([&]() { 5718 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5719 TheLoop->getStartLoc(), 5720 TheLoop->getHeader()) 5721 << "User-specified vectorization factor " 5722 << ore::NV("UserVectorizationFactor", UserVF) 5723 << " is unsafe, clamping to maximum safe vectorization factor " 5724 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5725 }); 5726 return MaxSafeFixedVF; 5727 } 5728 5729 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5730 << " is unsafe. 
Ignoring scalable UserVF.\n"); 5731 ORE->emit([&]() { 5732 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5733 TheLoop->getStartLoc(), 5734 TheLoop->getHeader()) 5735 << "User-specified vectorization factor " 5736 << ore::NV("UserVectorizationFactor", UserVF) 5737 << " is unsafe. Ignoring the hint to let the compiler pick a " 5738 "suitable VF."; 5739 }); 5740 } 5741 5742 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5743 << " / " << WidestType << " bits.\n"); 5744 5745 ElementCount MaxFixedVF = ElementCount::getFixed(1); 5746 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5747 WidestType, MaxSafeFixedVF)) 5748 MaxFixedVF = MaxVF; 5749 5750 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, 5751 WidestType, MaxSafeScalableVF)) 5752 // FIXME: Return scalable VF as well (to be added in future patch). 5753 if (MaxVF.isScalable()) 5754 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF 5755 << "\n"); 5756 5757 return MaxFixedVF; 5758 } 5759 5760 Optional<ElementCount> 5761 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5762 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5763 // TODO: It may by useful to do since it's still likely to be dynamically 5764 // uniform if the target can skip. 5765 reportVectorizationFailure( 5766 "Not inserting runtime ptr check for divergent target", 5767 "runtime pointer checks needed. Not enabled for divergent target", 5768 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 5769 return None; 5770 } 5771 5772 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5773 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 5774 if (TC == 1) { 5775 reportVectorizationFailure("Single iteration (non) loop", 5776 "loop trip count is one, irrelevant for vectorization", 5777 "SingleIterationLoop", ORE, TheLoop); 5778 return None; 5779 } 5780 5781 switch (ScalarEpilogueStatus) { 5782 case CM_ScalarEpilogueAllowed: 5783 return computeFeasibleMaxVF(TC, UserVF); 5784 case CM_ScalarEpilogueNotAllowedUsePredicate: 5785 LLVM_FALLTHROUGH; 5786 case CM_ScalarEpilogueNotNeededUsePredicate: 5787 LLVM_DEBUG( 5788 dbgs() << "LV: vector predicate hint/switch found.\n" 5789 << "LV: Not allowing scalar epilogue, creating predicated " 5790 << "vector loop.\n"); 5791 break; 5792 case CM_ScalarEpilogueNotAllowedLowTripLoop: 5793 // fallthrough as a special case of OptForSize 5794 case CM_ScalarEpilogueNotAllowedOptSize: 5795 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 5796 LLVM_DEBUG( 5797 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 5798 else 5799 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 5800 << "count.\n"); 5801 5802 // Bail if runtime checks are required, which are not good when optimising 5803 // for size. 5804 if (runtimeChecksRequired()) 5805 return None; 5806 5807 break; 5808 } 5809 5810 // The only loops we can vectorize without a scalar epilogue, are loops with 5811 // a bottom-test and a single exiting block. We'd have to handle the fact 5812 // that not every instruction executes on the last iteration. This will 5813 // require a lane mask which varies through the vector loop body. (TODO) 5814 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 5815 // If there was a tail-folding hint/switch, but we can't fold the tail by 5816 // masking, fallback to a vectorization with a scalar epilogue. 
5817 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5818 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5819 "scalar epilogue instead.\n");
5820 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5821 return computeFeasibleMaxVF(TC, UserVF);
5822 }
5823 return None;
5824 }
5825
5826 // Now try tail folding.
5827
5828 // Invalidate interleave groups that require an epilogue if we can't mask
5829 // the interleave-group.
5830 if (!useMaskedInterleavedAccesses(TTI)) {
5831 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5832 "No decisions should have been taken at this point");
5833 // Note: There is no need to invalidate any cost modeling decisions here, as
5834 // none were taken so far.
5835 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5836 }
5837
5838 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5839 assert(!MaxVF.isScalable() &&
5840 "Scalable vectors do not yet support tail folding");
5841 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5842 "MaxVF must be a power of 2");
5843 unsigned MaxVFtimesIC =
5844 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5845 // Avoid tail folding if the trip count is known to be a multiple of any VF we
5846 // chose.
5847 ScalarEvolution *SE = PSE.getSE();
5848 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5849 const SCEV *ExitCount = SE->getAddExpr(
5850 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5851 const SCEV *Rem = SE->getURemExpr(
5852 SE->applyLoopGuards(ExitCount, TheLoop),
5853 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5854 if (Rem->isZero()) {
5855 // Accept MaxVF if we do not have a tail.
5856 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5857 return MaxVF;
5858 }
5859
5860 // If we don't know the precise trip count, or if the trip count that we
5861 // found modulo the vectorization factor is not zero, try to fold the tail
5862 // by masking.
5863 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5864 if (Legal->prepareToFoldTailByMasking()) {
5865 FoldTailByMasking = true;
5866 return MaxVF;
5867 }
5868
5869 // If there was a tail-folding hint/switch, but we can't fold the tail by
5870 // masking, fall back to a vectorization with a scalar epilogue.
5871 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5872 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5873 "scalar epilogue instead.\n");
5874 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5875 return MaxVF;
5876 }
5877
5878 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5879 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5880 return None;
5881 }
5882
5883 if (TC == 0) {
5884 reportVectorizationFailure(
5885 "Unable to calculate the loop count due to complex control flow",
5886 "unable to calculate the loop count due to complex control flow",
5887 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5888 return None;
5889 }
5890
5891 reportVectorizationFailure(
5892 "Cannot optimize for size and vectorize at the same time.",
5893 "cannot optimize for size and vectorize at the same time. 
" 5894 "Enable vectorization of this loop with '#pragma clang loop " 5895 "vectorize(enable)' when compiling with -Os/-Oz", 5896 "NoTailLoopWithOptForSize", ORE, TheLoop); 5897 return None; 5898 } 5899 5900 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5901 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5902 const ElementCount &MaxSafeVF) { 5903 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5904 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5905 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5906 : TargetTransformInfo::RGK_FixedWidthVector); 5907 5908 // Convenience function to return the minimum of two ElementCounts. 5909 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5910 assert((LHS.isScalable() == RHS.isScalable()) && 5911 "Scalable flags must match"); 5912 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5913 }; 5914 5915 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5916 // Note that both WidestRegister and WidestType may not be a powers of 2. 5917 auto MaxVectorElementCount = ElementCount::get( 5918 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5919 ComputeScalableMaxVF); 5920 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5921 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5922 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5923 5924 if (!MaxVectorElementCount) { 5925 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5926 return ElementCount::getFixed(1); 5927 } 5928 5929 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5930 if (ConstTripCount && 5931 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5932 isPowerOf2_32(ConstTripCount)) { 5933 // We need to clamp the VF to be the ConstTripCount. There is no point in 5934 // choosing a higher viable VF as done in the loop below. If 5935 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5936 // the TC is less than or equal to the known number of lanes. 5937 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5938 << ConstTripCount << "\n"); 5939 return TripCountEC; 5940 } 5941 5942 ElementCount MaxVF = MaxVectorElementCount; 5943 if (TTI.shouldMaximizeVectorBandwidth() || 5944 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5945 auto MaxVectorElementCountMaxBW = ElementCount::get( 5946 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5947 ComputeScalableMaxVF); 5948 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5949 5950 // Collect all viable vectorization factors larger than the default MaxVF 5951 // (i.e. MaxVectorElementCount). 5952 SmallVector<ElementCount, 8> VFs; 5953 for (ElementCount VS = MaxVectorElementCount * 2; 5954 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5955 VFs.push_back(VS); 5956 5957 // For each VF calculate its register usage. 5958 auto RUs = calculateRegisterUsage(VFs); 5959 5960 // Select the largest VF which doesn't require more registers than existing 5961 // ones. 
5962 for (int i = RUs.size() - 1; i >= 0; --i) { 5963 bool Selected = true; 5964 for (auto &pair : RUs[i].MaxLocalUsers) { 5965 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5966 if (pair.second > TargetNumRegisters) 5967 Selected = false; 5968 } 5969 if (Selected) { 5970 MaxVF = VFs[i]; 5971 break; 5972 } 5973 } 5974 if (ElementCount MinVF = 5975 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 5976 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 5977 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5978 << ") with target's minimum: " << MinVF << '\n'); 5979 MaxVF = MinVF; 5980 } 5981 } 5982 } 5983 return MaxVF; 5984 } 5985 5986 bool LoopVectorizationCostModel::isMoreProfitable( 5987 const VectorizationFactor &A, const VectorizationFactor &B) const { 5988 InstructionCost::CostType CostA = *A.Cost.getValue(); 5989 InstructionCost::CostType CostB = *B.Cost.getValue(); 5990 5991 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); 5992 5993 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && 5994 MaxTripCount) { 5995 // If we are folding the tail and the trip count is a known (possibly small) 5996 // constant, the trip count will be rounded up to an integer number of 5997 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), 5998 // which we compare directly. When not folding the tail, the total cost will 5999 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is 6000 // approximated with the per-lane cost below instead of using the tripcount 6001 // as here. 6002 int64_t RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); 6003 int64_t RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); 6004 return RTCostA < RTCostB; 6005 } 6006 6007 // To avoid the need for FP division: 6008 // (CostA / A.Width) < (CostB / B.Width) 6009 // <=> (CostA * B.Width) < (CostB * A.Width) 6010 return (CostA * B.Width.getKnownMinValue()) < 6011 (CostB * A.Width.getKnownMinValue()); 6012 } 6013 6014 VectorizationFactor 6015 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 6016 // FIXME: This can be fixed for scalable vectors later, because at this stage 6017 // the LoopVectorizer will only consider vectorizing a loop with scalable 6018 // vectors when the loop has a hint to enable vectorization for a given VF. 6019 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 6020 6021 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 6022 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 6023 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 6024 6025 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); 6026 VectorizationFactor ChosenFactor = ScalarCost; 6027 6028 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 6029 if (ForceVectorization && MaxVF.isVector()) { 6030 // Ignore scalar width, because the user explicitly wants vectorization. 6031 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 6032 // evaluation. 6033 ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max(); 6034 } 6035 6036 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); 6037 i *= 2) { 6038 // Notice that the vector loop needs to be executed less times, so 6039 // we need to divide the cost of the vector loops by the width of 6040 // the vector elements. 
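// (Illustrative: a vector body costing 8 at VF=4 corresponds to a per-lane cost
// of 2; isMoreProfitable() compares these per-lane costs when the tail is not
// folded.)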
6041 VectorizationCostTy C = expectedCost(i); 6042 6043 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 6044 VectorizationFactor Candidate(i, C.first); 6045 LLVM_DEBUG( 6046 dbgs() << "LV: Vector loop of width " << i << " costs: " 6047 << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue()) 6048 << ".\n"); 6049 6050 if (!C.second && !ForceVectorization) { 6051 LLVM_DEBUG( 6052 dbgs() << "LV: Not considering vector loop of width " << i 6053 << " because it will not generate any vector instructions.\n"); 6054 continue; 6055 } 6056 6057 // If profitable add it to ProfitableVF list. 6058 if (isMoreProfitable(Candidate, ScalarCost)) 6059 ProfitableVFs.push_back(Candidate); 6060 6061 if (isMoreProfitable(Candidate, ChosenFactor)) 6062 ChosenFactor = Candidate; 6063 } 6064 6065 if (!EnableCondStoresVectorization && NumPredStores) { 6066 reportVectorizationFailure("There are conditional stores.", 6067 "store that is conditionally executed prevents vectorization", 6068 "ConditionalStore", ORE, TheLoop); 6069 ChosenFactor = ScalarCost; 6070 } 6071 6072 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6073 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue()) 6074 dbgs() 6075 << "LV: Vectorization seems to be not beneficial, " 6076 << "but was forced by a user.\n"); 6077 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6078 return ChosenFactor; 6079 } 6080 6081 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6082 const Loop &L, ElementCount VF) const { 6083 // Cross iteration phis such as reductions need special handling and are 6084 // currently unsupported. 6085 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6086 return Legal->isFirstOrderRecurrence(&Phi) || 6087 Legal->isReductionVariable(&Phi); 6088 })) 6089 return false; 6090 6091 // Phis with uses outside of the loop require special handling and are 6092 // currently unsupported. 6093 for (auto &Entry : Legal->getInductionVars()) { 6094 // Look for uses of the value of the induction at the last iteration. 6095 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6096 for (User *U : PostInc->users()) 6097 if (!L.contains(cast<Instruction>(U))) 6098 return false; 6099 // Look for uses of penultimate value of the induction. 6100 for (User *U : Entry.first->users()) 6101 if (!L.contains(cast<Instruction>(U))) 6102 return false; 6103 } 6104 6105 // Induction variables that are widened require special handling that is 6106 // currently not supported. 6107 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6108 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6109 this->isProfitableToScalarize(Entry.first, VF)); 6110 })) 6111 return false; 6112 6113 return true; 6114 } 6115 6116 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6117 const ElementCount VF) const { 6118 // FIXME: We need a much better cost-model to take different parameters such 6119 // as register pressure, code size increase and cost of extra branches into 6120 // account. For now we apply a very crude heuristic and only consider loops 6121 // with vectorization factors larger than a certain value. 6122 // We also consider epilogue vectorization unprofitable for targets that don't 6123 // consider interleaving beneficial (eg. MVE). 
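// (Illustrative, hypothetical threshold: if EpilogueVectorizationMinVF were 16,
// a main-loop VF of 8 would be rejected below even on a target that benefits
// from interleaving.)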
6124 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6125 return false; 6126 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6127 return true; 6128 return false; 6129 } 6130 6131 VectorizationFactor 6132 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6133 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6134 VectorizationFactor Result = VectorizationFactor::Disabled(); 6135 if (!EnableEpilogueVectorization) { 6136 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6137 return Result; 6138 } 6139 6140 if (!isScalarEpilogueAllowed()) { 6141 LLVM_DEBUG( 6142 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6143 "allowed.\n";); 6144 return Result; 6145 } 6146 6147 // FIXME: This can be fixed for scalable vectors later, because at this stage 6148 // the LoopVectorizer will only consider vectorizing a loop with scalable 6149 // vectors when the loop has a hint to enable vectorization for a given VF. 6150 if (MainLoopVF.isScalable()) { 6151 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6152 "yet supported.\n"); 6153 return Result; 6154 } 6155 6156 // Not really a cost consideration, but check for unsupported cases here to 6157 // simplify the logic. 6158 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6159 LLVM_DEBUG( 6160 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6161 "not a supported candidate.\n";); 6162 return Result; 6163 } 6164 6165 if (EpilogueVectorizationForceVF > 1) { 6166 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6167 if (LVP.hasPlanWithVFs( 6168 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6169 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6170 else { 6171 LLVM_DEBUG( 6172 dbgs() 6173 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6174 return Result; 6175 } 6176 } 6177 6178 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6179 TheLoop->getHeader()->getParent()->hasMinSize()) { 6180 LLVM_DEBUG( 6181 dbgs() 6182 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6183 return Result; 6184 } 6185 6186 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6187 return Result; 6188 6189 for (auto &NextVF : ProfitableVFs) 6190 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6191 (Result.Width.getFixedValue() == 1 || 6192 isMoreProfitable(NextVF, Result)) && 6193 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6194 Result = NextVF; 6195 6196 if (Result != VectorizationFactor::Disabled()) 6197 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6198 << Result.Width.getFixedValue() << "\n";); 6199 return Result; 6200 } 6201 6202 std::pair<unsigned, unsigned> 6203 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6204 unsigned MinWidth = -1U; 6205 unsigned MaxWidth = 8; 6206 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6207 6208 // For each block. 6209 for (BasicBlock *BB : TheLoop->blocks()) { 6210 // For each instruction in the loop. 6211 for (Instruction &I : BB->instructionsWithoutDebug()) { 6212 Type *T = I.getType(); 6213 6214 // Skip ignored values. 6215 if (ValuesToIgnore.count(&I)) 6216 continue; 6217 6218 // Only examine Loads, Stores and PHINodes. 6219 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6220 continue; 6221 6222 // Examine PHI nodes that are reduction variables. Update the type to 6223 // account for the recurrence type. 
6224 if (auto *PN = dyn_cast<PHINode>(&I)) { 6225 if (!Legal->isReductionVariable(PN)) 6226 continue; 6227 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6228 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6229 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6230 RdxDesc.getRecurrenceType(), 6231 TargetTransformInfo::ReductionFlags())) 6232 continue; 6233 T = RdxDesc.getRecurrenceType(); 6234 } 6235 6236 // Examine the stored values. 6237 if (auto *ST = dyn_cast<StoreInst>(&I)) 6238 T = ST->getValueOperand()->getType(); 6239 6240 // Ignore loaded pointer types and stored pointer types that are not 6241 // vectorizable. 6242 // 6243 // FIXME: The check here attempts to predict whether a load or store will 6244 // be vectorized. We only know this for certain after a VF has 6245 // been selected. Here, we assume that if an access can be 6246 // vectorized, it will be. We should also look at extending this 6247 // optimization to non-pointer types. 6248 // 6249 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6250 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6251 continue; 6252 6253 MinWidth = std::min(MinWidth, 6254 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6255 MaxWidth = std::max(MaxWidth, 6256 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6257 } 6258 } 6259 6260 return {MinWidth, MaxWidth}; 6261 } 6262 6263 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6264 unsigned LoopCost) { 6265 // -- The interleave heuristics -- 6266 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6267 // There are many micro-architectural considerations that we can't predict 6268 // at this level. For example, frontend pressure (on decode or fetch) due to 6269 // code size, or the number and capabilities of the execution ports. 6270 // 6271 // We use the following heuristics to select the interleave count: 6272 // 1. If the code has reductions, then we interleave to break the cross 6273 // iteration dependency. 6274 // 2. If the loop is really small, then we interleave to reduce the loop 6275 // overhead. 6276 // 3. We don't interleave if we think that we will spill registers to memory 6277 // due to the increased register pressure. 6278 6279 if (!isScalarEpilogueAllowed()) 6280 return 1; 6281 6282 // We used the distance for the interleave count. 6283 if (Legal->getMaxSafeDepDistBytes() != -1U) 6284 return 1; 6285 6286 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6287 const bool HasReductions = !Legal->getReductionVars().empty(); 6288 // Do not interleave loops with a relatively small known or estimated trip 6289 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6290 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6291 // because with the above conditions interleaving can expose ILP and break 6292 // cross iteration dependences for reductions. 6293 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6294 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6295 return 1; 6296 6297 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6298 // We divide by these constants so assume that we have at least one 6299 // instruction that uses at least one register. 6300 for (auto& pair : R.MaxLocalUsers) { 6301 pair.second = std::max(pair.second, 1U); 6302 } 6303 6304 // We calculate the interleave count using the following formula. 
6305 // Subtract the number of loop invariants from the number of available 6306 // registers. These registers are used by all of the interleaved instances. 6307 // Next, divide the remaining registers by the number of registers that is 6308 // required by the loop, in order to estimate how many parallel instances 6309 // fit without causing spills. All of this is rounded down if necessary to be 6310 // a power of two. We want power of two interleave count to simplify any 6311 // addressing operations or alignment considerations. 6312 // We also want power of two interleave counts to ensure that the induction 6313 // variable of the vector loop wraps to zero, when tail is folded by masking; 6314 // this currently happens when OptForSize, in which case IC is set to 1 above. 6315 unsigned IC = UINT_MAX; 6316 6317 for (auto& pair : R.MaxLocalUsers) { 6318 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6319 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6320 << " registers of " 6321 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6322 if (VF.isScalar()) { 6323 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6324 TargetNumRegisters = ForceTargetNumScalarRegs; 6325 } else { 6326 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6327 TargetNumRegisters = ForceTargetNumVectorRegs; 6328 } 6329 unsigned MaxLocalUsers = pair.second; 6330 unsigned LoopInvariantRegs = 0; 6331 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6332 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6333 6334 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6335 // Don't count the induction variable as interleaved. 6336 if (EnableIndVarRegisterHeur) { 6337 TmpIC = 6338 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6339 std::max(1U, (MaxLocalUsers - 1))); 6340 } 6341 6342 IC = std::min(IC, TmpIC); 6343 } 6344 6345 // Clamp the interleave ranges to reasonable counts. 6346 unsigned MaxInterleaveCount = 6347 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6348 6349 // Check if the user has overridden the max. 6350 if (VF.isScalar()) { 6351 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6352 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6353 } else { 6354 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6355 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6356 } 6357 6358 // If trip count is known or estimated compile time constant, limit the 6359 // interleave count to be less than the trip count divided by VF, provided it 6360 // is at least 1. 6361 // 6362 // For scalable vectors we can't know if interleaving is beneficial. It may 6363 // not be beneficial for small loops if none of the lanes in the second vector 6364 // iterations is enabled. However, for larger loops, there is likely to be a 6365 // similar benefit as for fixed-width vectors. For now, we choose to leave 6366 // the InterleaveCount as if vscale is '1', although if some information about 6367 // the vector is known (e.g. min vector size), we can make a better decision. 6368 if (BestKnownTC) { 6369 MaxInterleaveCount = 6370 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6371 // Make sure MaxInterleaveCount is greater than 0. 
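// (Illustrative: an estimated trip count of 20 with VF=8 caps the interleave
// count at 20 / 8 = 2 here.)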
6372 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6373 } 6374 6375 assert(MaxInterleaveCount > 0 && 6376 "Maximum interleave count must be greater than 0"); 6377 6378 // Clamp the calculated IC to be between the 1 and the max interleave count 6379 // that the target and trip count allows. 6380 if (IC > MaxInterleaveCount) 6381 IC = MaxInterleaveCount; 6382 else 6383 // Make sure IC is greater than 0. 6384 IC = std::max(1u, IC); 6385 6386 assert(IC > 0 && "Interleave count must be greater than 0."); 6387 6388 // If we did not calculate the cost for VF (because the user selected the VF) 6389 // then we calculate the cost of VF here. 6390 if (LoopCost == 0) { 6391 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6392 LoopCost = *expectedCost(VF).first.getValue(); 6393 } 6394 6395 assert(LoopCost && "Non-zero loop cost expected"); 6396 6397 // Interleave if we vectorized this loop and there is a reduction that could 6398 // benefit from interleaving. 6399 if (VF.isVector() && HasReductions) { 6400 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6401 return IC; 6402 } 6403 6404 // Note that if we've already vectorized the loop we will have done the 6405 // runtime check and so interleaving won't require further checks. 6406 bool InterleavingRequiresRuntimePointerCheck = 6407 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6408 6409 // We want to interleave small loops in order to reduce the loop overhead and 6410 // potentially expose ILP opportunities. 6411 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6412 << "LV: IC is " << IC << '\n' 6413 << "LV: VF is " << VF << '\n'); 6414 const bool AggressivelyInterleaveReductions = 6415 TTI.enableAggressiveInterleaving(HasReductions); 6416 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6417 // We assume that the cost overhead is 1 and we use the cost model 6418 // to estimate the cost of the loop and interleave until the cost of the 6419 // loop overhead is about 5% of the cost of the loop. 6420 unsigned SmallIC = 6421 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6422 6423 // Interleave until store/load ports (estimated by max interleave count) are 6424 // saturated. 6425 unsigned NumStores = Legal->getNumStores(); 6426 unsigned NumLoads = Legal->getNumLoads(); 6427 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6428 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6429 6430 // If we have a scalar reduction (vector reductions are already dealt with 6431 // by this point), we can increase the critical path length if the loop 6432 // we're interleaving is inside another loop. Limit, by default to 2, so the 6433 // critical path only gets increased by one reduction operation. 6434 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6435 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6436 SmallIC = std::min(SmallIC, F); 6437 StoresIC = std::min(StoresIC, F); 6438 LoadsIC = std::min(LoadsIC, F); 6439 } 6440 6441 if (EnableLoadStoreRuntimeInterleave && 6442 std::max(StoresIC, LoadsIC) > SmallIC) { 6443 LLVM_DEBUG( 6444 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6445 return std::max(StoresIC, LoadsIC); 6446 } 6447 6448 // If there are scalar reductions and TTI has enabled aggressive 6449 // interleaving for reductions, we will interleave to expose ILP. 
6450 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6451 AggressivelyInterleaveReductions) {
6452 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6453 // Interleave no less than SmallIC but not as aggressively as the normal IC
6454 // to satisfy the rare situation when resources are too limited.
6455 return std::max(IC / 2, SmallIC);
6456 } else {
6457 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6458 return SmallIC;
6459 }
6460 }
6461
6462 // Interleave if this is a large loop (small loops are already dealt with by
6463 // this point) that could benefit from interleaving.
6464 if (AggressivelyInterleaveReductions) {
6465 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6466 return IC;
6467 }
6468
6469 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6470 return 1;
6471 }
6472
6473 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6474 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6475 // This function calculates the register usage by measuring the highest number
6476 // of values that are alive at a single location. Obviously, this is a very
6477 // rough estimation. We scan the loop in a topological order and
6478 // assign a number to each instruction. We use RPO to ensure that defs are
6479 // met before their users. We assume that each instruction that has in-loop
6480 // users starts an interval. We record every time that an in-loop value is
6481 // used, so we have a list of the first and last occurrences of each
6482 // instruction. Next, we transpose this data structure into a multi map that
6483 // holds the list of intervals that *end* at a specific location. This multi
6484 // map allows us to perform a linear search. We scan the instructions linearly
6485 // and record each time that a new interval starts, by placing it in a set.
6486 // If we find this value in the multi-map then we remove it from the set.
6487 // The max register usage is the maximum size of the set.
6488 // We also search for instructions that are defined outside the loop, but are
6489 // used inside the loop. We need this number separately from the max-interval
6490 // usage number because when we unroll, loop-invariant values do not take
6491 // more registers.
6492 LoopBlocksDFS DFS(TheLoop);
6493 DFS.perform(LI);
6494
6495 RegisterUsage RU;
6496
6497 // Each 'key' in the map opens a new interval. The values
6498 // of the map are the index of the 'last seen' usage of the
6499 // instruction that is the key.
6500 using IntervalMap = DenseMap<Instruction *, unsigned>;
6501
6502 // Maps instruction to its index.
6503 SmallVector<Instruction *, 64> IdxToInstr;
6504 // Marks the end of each interval.
6505 IntervalMap EndPoint;
6506 // Saves the list of instruction indices that are used in the loop.
6507 SmallPtrSet<Instruction *, 8> Ends;
6508 // Saves the list of values that are used in the loop but are
6509 // defined outside the loop, such as arguments and constants.
6510 SmallPtrSet<Value *, 8> LoopInvariants;
6511
6512 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6513 for (Instruction &I : BB->instructionsWithoutDebug()) {
6514 IdxToInstr.push_back(&I);
6515
6516 // Save the end location of each USE.
6517 for (Value *U : I.operands()) {
6518 auto *Instr = dyn_cast<Instruction>(U);
6519
6520 // Ignore non-instruction values such as arguments, constants, etc.
6521 if (!Instr)
6522 continue;
6523
6524 // If this instruction is outside the loop then record it and continue.
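// (E.g., illustrative: an address or bound computed in the loop preheader is
// collected into LoopInvariants here.)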
6525 if (!TheLoop->contains(Instr)) { 6526 LoopInvariants.insert(Instr); 6527 continue; 6528 } 6529 6530 // Overwrite previous end points. 6531 EndPoint[Instr] = IdxToInstr.size(); 6532 Ends.insert(Instr); 6533 } 6534 } 6535 } 6536 6537 // Saves the list of intervals that end with the index in 'key'. 6538 using InstrList = SmallVector<Instruction *, 2>; 6539 DenseMap<unsigned, InstrList> TransposeEnds; 6540 6541 // Transpose the EndPoints to a list of values that end at each index. 6542 for (auto &Interval : EndPoint) 6543 TransposeEnds[Interval.second].push_back(Interval.first); 6544 6545 SmallPtrSet<Instruction *, 8> OpenIntervals; 6546 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6547 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6548 6549 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6550 6551 // A lambda that gets the register usage for the given type and VF. 6552 const auto &TTICapture = TTI; 6553 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6554 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6555 return 0U; 6556 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6557 }; 6558 6559 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6560 Instruction *I = IdxToInstr[i]; 6561 6562 // Remove all of the instructions that end at this location. 6563 InstrList &List = TransposeEnds[i]; 6564 for (Instruction *ToRemove : List) 6565 OpenIntervals.erase(ToRemove); 6566 6567 // Ignore instructions that are never used within the loop. 6568 if (!Ends.count(I)) 6569 continue; 6570 6571 // Skip ignored values. 6572 if (ValuesToIgnore.count(I)) 6573 continue; 6574 6575 // For each VF find the maximum usage of registers. 6576 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6577 // Count the number of live intervals. 6578 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6579 6580 if (VFs[j].isScalar()) { 6581 for (auto Inst : OpenIntervals) { 6582 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6583 if (RegUsage.find(ClassID) == RegUsage.end()) 6584 RegUsage[ClassID] = 1; 6585 else 6586 RegUsage[ClassID] += 1; 6587 } 6588 } else { 6589 collectUniformsAndScalars(VFs[j]); 6590 for (auto Inst : OpenIntervals) { 6591 // Skip ignored values for VF > 1. 6592 if (VecValuesToIgnore.count(Inst)) 6593 continue; 6594 if (isScalarAfterVectorization(Inst, VFs[j])) { 6595 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6596 if (RegUsage.find(ClassID) == RegUsage.end()) 6597 RegUsage[ClassID] = 1; 6598 else 6599 RegUsage[ClassID] += 1; 6600 } else { 6601 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6602 if (RegUsage.find(ClassID) == RegUsage.end()) 6603 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6604 else 6605 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6606 } 6607 } 6608 } 6609 6610 for (auto& pair : RegUsage) { 6611 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6612 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6613 else 6614 MaxUsages[j][pair.first] = pair.second; 6615 } 6616 } 6617 6618 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6619 << OpenIntervals.size() << '\n'); 6620 6621 // Add the current instruction to the list of open intervals. 
6622 OpenIntervals.insert(I); 6623 } 6624 6625 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6626 SmallMapVector<unsigned, unsigned, 4> Invariant; 6627 6628 for (auto Inst : LoopInvariants) { 6629 unsigned Usage = 6630 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6631 unsigned ClassID = 6632 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6633 if (Invariant.find(ClassID) == Invariant.end()) 6634 Invariant[ClassID] = Usage; 6635 else 6636 Invariant[ClassID] += Usage; 6637 } 6638 6639 LLVM_DEBUG({ 6640 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6641 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6642 << " item\n"; 6643 for (const auto &pair : MaxUsages[i]) { 6644 dbgs() << "LV(REG): RegisterClass: " 6645 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6646 << " registers\n"; 6647 } 6648 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6649 << " item\n"; 6650 for (const auto &pair : Invariant) { 6651 dbgs() << "LV(REG): RegisterClass: " 6652 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6653 << " registers\n"; 6654 } 6655 }); 6656 6657 RU.LoopInvariantRegs = Invariant; 6658 RU.MaxLocalUsers = MaxUsages[i]; 6659 RUs[i] = RU; 6660 } 6661 6662 return RUs; 6663 } 6664 6665 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6666 // TODO: Cost model for emulated masked load/store is completely 6667 // broken. This hack guides the cost model to use an artificially 6668 // high enough value to practically disable vectorization with such 6669 // operations, except where previously deployed legality hack allowed 6670 // using very low cost values. This is to avoid regressions coming simply 6671 // from moving "masked load/store" check from legality to cost model. 6672 // Masked Load/Gather emulation was previously never allowed. 6673 // Limited number of Masked Store/Scatter emulation was allowed. 6674 assert(isPredicatedInst(I) && 6675 "Expecting a scalar emulated instruction"); 6676 return isa<LoadInst>(I) || 6677 (isa<StoreInst>(I) && 6678 NumPredStores > NumberOfStoresToPredicate); 6679 } 6680 6681 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6682 // If we aren't vectorizing the loop, or if we've already collected the 6683 // instructions to scalarize, there's nothing to do. Collection may already 6684 // have occurred if we have a user-selected VF and are now computing the 6685 // expected cost for interleaving. 6686 if (VF.isScalar() || VF.isZero() || 6687 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6688 return; 6689 6690 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6691 // not profitable to scalarize any instructions, the presence of VF in the 6692 // map will indicate that we've analyzed it already. 6693 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6694 6695 // Find all the instructions that are scalar with predication in the loop and 6696 // determine if it would be better to not if-convert the blocks they are in. 6697 // If so, we also record the instructions to scalarize. 6698 for (BasicBlock *BB : TheLoop->blocks()) { 6699 if (!blockNeedsPredication(BB)) 6700 continue; 6701 for (Instruction &I : *BB) 6702 if (isScalarWithPredication(&I)) { 6703 ScalarCostsTy ScalarCosts; 6704 // Do not apply discount logic if hacked cost is needed 6705 // for emulated masked memrefs. 
6706 if (!useEmulatedMaskMemRefHack(&I) && 6707 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6708 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6709 // Remember that BB will remain after vectorization. 6710 PredicatedBBsAfterVectorization.insert(BB); 6711 } 6712 } 6713 } 6714 6715 int LoopVectorizationCostModel::computePredInstDiscount( 6716 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6717 assert(!isUniformAfterVectorization(PredInst, VF) && 6718 "Instruction marked uniform-after-vectorization will be predicated"); 6719 6720 // Initialize the discount to zero, meaning that the scalar version and the 6721 // vector version cost the same. 6722 InstructionCost Discount = 0; 6723 6724 // Holds instructions to analyze. The instructions we visit are mapped in 6725 // ScalarCosts. Those instructions are the ones that would be scalarized if 6726 // we find that the scalar version costs less. 6727 SmallVector<Instruction *, 8> Worklist; 6728 6729 // Returns true if the given instruction can be scalarized. 6730 auto canBeScalarized = [&](Instruction *I) -> bool { 6731 // We only attempt to scalarize instructions forming a single-use chain 6732 // from the original predicated block that would otherwise be vectorized. 6733 // Although not strictly necessary, we give up on instructions we know will 6734 // already be scalar to avoid traversing chains that are unlikely to be 6735 // beneficial. 6736 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6737 isScalarAfterVectorization(I, VF)) 6738 return false; 6739 6740 // If the instruction is scalar with predication, it will be analyzed 6741 // separately. We ignore it within the context of PredInst. 6742 if (isScalarWithPredication(I)) 6743 return false; 6744 6745 // If any of the instruction's operands are uniform after vectorization, 6746 // the instruction cannot be scalarized. This prevents, for example, a 6747 // masked load from being scalarized. 6748 // 6749 // We assume we will only emit a value for lane zero of an instruction 6750 // marked uniform after vectorization, rather than VF identical values. 6751 // Thus, if we scalarize an instruction that uses a uniform, we would 6752 // create uses of values corresponding to the lanes we aren't emitting code 6753 // for. This behavior can be changed by allowing getScalarValue to clone 6754 // the lane zero values for uniforms rather than asserting. 6755 for (Use &U : I->operands()) 6756 if (auto *J = dyn_cast<Instruction>(U.get())) 6757 if (isUniformAfterVectorization(J, VF)) 6758 return false; 6759 6760 // Otherwise, we can scalarize the instruction. 6761 return true; 6762 }; 6763 6764 // Compute the expected cost discount from scalarizing the entire expression 6765 // feeding the predicated instruction. We currently only consider expressions 6766 // that are single-use instruction chains. 6767 Worklist.push_back(PredInst); 6768 while (!Worklist.empty()) { 6769 Instruction *I = Worklist.pop_back_val(); 6770 6771 // If we've already analyzed the instruction, there's nothing to do. 6772 if (ScalarCosts.find(I) != ScalarCosts.end()) 6773 continue; 6774 6775 // Compute the cost of the vector instruction. Note that this cost already 6776 // includes the scalarization overhead of the predicated instruction. 6777 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6778 6779 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6780 // the instruction as if it wasn't if-converted and instead remained in the 6781 // predicated block. We will scale this cost by block probability after 6782 // computing the scalarization overhead. 6783 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6784 InstructionCost ScalarCost = 6785 VF.getKnownMinValue() * 6786 getInstructionCost(I, ElementCount::getFixed(1)).first; 6787 6788 // Compute the scalarization overhead of needed insertelement instructions 6789 // and phi nodes. 6790 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6791 ScalarCost += TTI.getScalarizationOverhead( 6792 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6793 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6794 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6795 ScalarCost += 6796 VF.getKnownMinValue() * 6797 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6798 } 6799 6800 // Compute the scalarization overhead of needed extractelement 6801 // instructions. For each of the instruction's operands, if the operand can 6802 // be scalarized, add it to the worklist; otherwise, account for the 6803 // overhead. 6804 for (Use &U : I->operands()) 6805 if (auto *J = dyn_cast<Instruction>(U.get())) { 6806 assert(VectorType::isValidElementType(J->getType()) && 6807 "Instruction has non-scalar type"); 6808 if (canBeScalarized(J)) 6809 Worklist.push_back(J); 6810 else if (needsExtract(J, VF)) { 6811 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6812 ScalarCost += TTI.getScalarizationOverhead( 6813 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6814 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6815 } 6816 } 6817 6818 // Scale the total scalar cost by block probability. 6819 ScalarCost /= getReciprocalPredBlockProb(); 6820 6821 // Compute the discount. A non-negative discount means the vector version 6822 // of the instruction costs more, and scalarizing would be beneficial. 6823 Discount += VectorCost - ScalarCost; 6824 ScalarCosts[I] = ScalarCost; 6825 } 6826 6827 return *Discount.getValue(); 6828 } 6829 6830 LoopVectorizationCostModel::VectorizationCostTy 6831 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6832 VectorizationCostTy Cost; 6833 6834 // For each block. 6835 for (BasicBlock *BB : TheLoop->blocks()) { 6836 VectorizationCostTy BlockCost; 6837 6838 // For each instruction in the old loop. 6839 for (Instruction &I : BB->instructionsWithoutDebug()) { 6840 // Skip ignored values. 6841 if (ValuesToIgnore.count(&I) || 6842 (VF.isVector() && VecValuesToIgnore.count(&I))) 6843 continue; 6844 6845 VectorizationCostTy C = getInstructionCost(&I, VF); 6846 6847 // Check if we should override the cost. 6848 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6849 C.first = InstructionCost(ForceTargetInstructionCost); 6850 6851 BlockCost.first += C.first; 6852 BlockCost.second |= C.second; 6853 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6854 << " for VF " << VF << " For instruction: " << I 6855 << '\n'); 6856 } 6857 6858 // If we are vectorizing a predicated block, it will have been 6859 // if-converted. This means that the block's instructions (aside from 6860 // stores and instructions that may divide by zero) will now be 6861 // unconditionally executed. For the scalar case, we may not always execute 6862 // the predicated block, if it is an if-else block. Thus, scale the block's 6863 // cost by the probability of executing it. 
blockNeedsPredication from 6864 // Legal is used so as to not include all blocks in tail folded loops. 6865 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6866 BlockCost.first /= getReciprocalPredBlockProb(); 6867 6868 Cost.first += BlockCost.first; 6869 Cost.second |= BlockCost.second; 6870 } 6871 6872 return Cost; 6873 } 6874 6875 /// Gets Address Access SCEV after verifying that the access pattern 6876 /// is loop invariant except the induction variable dependence. 6877 /// 6878 /// This SCEV can be sent to the Target in order to estimate the address 6879 /// calculation cost. 6880 static const SCEV *getAddressAccessSCEV( 6881 Value *Ptr, 6882 LoopVectorizationLegality *Legal, 6883 PredicatedScalarEvolution &PSE, 6884 const Loop *TheLoop) { 6885 6886 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6887 if (!Gep) 6888 return nullptr; 6889 6890 // We are looking for a gep with all loop invariant indices except for one 6891 // which should be an induction variable. 6892 auto SE = PSE.getSE(); 6893 unsigned NumOperands = Gep->getNumOperands(); 6894 for (unsigned i = 1; i < NumOperands; ++i) { 6895 Value *Opd = Gep->getOperand(i); 6896 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6897 !Legal->isInductionVariable(Opd)) 6898 return nullptr; 6899 } 6900 6901 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6902 return PSE.getSCEV(Ptr); 6903 } 6904 6905 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6906 return Legal->hasStride(I->getOperand(0)) || 6907 Legal->hasStride(I->getOperand(1)); 6908 } 6909 6910 InstructionCost 6911 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6912 ElementCount VF) { 6913 assert(VF.isVector() && 6914 "Scalarization cost of instruction implies vectorization."); 6915 if (VF.isScalable()) 6916 return InstructionCost::getInvalid(); 6917 6918 Type *ValTy = getMemInstValueType(I); 6919 auto SE = PSE.getSE(); 6920 6921 unsigned AS = getLoadStoreAddressSpace(I); 6922 Value *Ptr = getLoadStorePointerOperand(I); 6923 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6924 6925 // Figure out whether the access is strided and get the stride value 6926 // if it's known in compile time 6927 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6928 6929 // Get the cost of the scalar memory instruction and address computation. 6930 InstructionCost Cost = 6931 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6932 6933 // Don't pass *I here, since it is scalar but will actually be part of a 6934 // vectorized loop where the user of it is a vectorized instruction. 6935 const Align Alignment = getLoadStoreAlignment(I); 6936 Cost += VF.getKnownMinValue() * 6937 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6938 AS, TTI::TCK_RecipThroughput); 6939 6940 // Get the overhead of the extractelement and insertelement instructions 6941 // we might create due to scalarization. 6942 Cost += getScalarizationOverhead(I, VF); 6943 6944 // If we have a predicated load/store, it will need extra i1 extracts and 6945 // conditional branches, but may not be executed for each vector lane. Scale 6946 // the cost by the probability of executing the predicated block. 
6947 if (isPredicatedInst(I)) { 6948 Cost /= getReciprocalPredBlockProb(); 6949 6950 // Add the cost of an i1 extract and a branch 6951 auto *Vec_i1Ty = 6952 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6953 Cost += TTI.getScalarizationOverhead( 6954 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6955 /*Insert=*/false, /*Extract=*/true); 6956 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6957 6958 if (useEmulatedMaskMemRefHack(I)) 6959 // Artificially setting to a high enough value to practically disable 6960 // vectorization with such operations. 6961 Cost = 3000000; 6962 } 6963 6964 return Cost; 6965 } 6966 6967 InstructionCost 6968 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6969 ElementCount VF) { 6970 Type *ValTy = getMemInstValueType(I); 6971 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6972 Value *Ptr = getLoadStorePointerOperand(I); 6973 unsigned AS = getLoadStoreAddressSpace(I); 6974 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6975 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6976 6977 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6978 "Stride should be 1 or -1 for consecutive memory access"); 6979 const Align Alignment = getLoadStoreAlignment(I); 6980 InstructionCost Cost = 0; 6981 if (Legal->isMaskRequired(I)) 6982 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6983 CostKind); 6984 else 6985 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6986 CostKind, I); 6987 6988 bool Reverse = ConsecutiveStride < 0; 6989 if (Reverse) 6990 Cost += 6991 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6992 return Cost; 6993 } 6994 6995 InstructionCost 6996 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6997 ElementCount VF) { 6998 assert(Legal->isUniformMemOp(*I)); 6999 7000 Type *ValTy = getMemInstValueType(I); 7001 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7002 const Align Alignment = getLoadStoreAlignment(I); 7003 unsigned AS = getLoadStoreAddressSpace(I); 7004 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7005 if (isa<LoadInst>(I)) { 7006 return TTI.getAddressComputationCost(ValTy) + 7007 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 7008 CostKind) + 7009 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 7010 } 7011 StoreInst *SI = cast<StoreInst>(I); 7012 7013 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 7014 return TTI.getAddressComputationCost(ValTy) + 7015 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 7016 CostKind) + 7017 (isLoopInvariantStoreValue 7018 ? 
0 7019 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7020 VF.getKnownMinValue() - 1)); 7021 } 7022 7023 InstructionCost 7024 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7025 ElementCount VF) { 7026 Type *ValTy = getMemInstValueType(I); 7027 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7028 const Align Alignment = getLoadStoreAlignment(I); 7029 const Value *Ptr = getLoadStorePointerOperand(I); 7030 7031 return TTI.getAddressComputationCost(VectorTy) + 7032 TTI.getGatherScatterOpCost( 7033 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7034 TargetTransformInfo::TCK_RecipThroughput, I); 7035 } 7036 7037 InstructionCost 7038 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7039 ElementCount VF) { 7040 // TODO: Once we have support for interleaving with scalable vectors 7041 // we can calculate the cost properly here. 7042 if (VF.isScalable()) 7043 return InstructionCost::getInvalid(); 7044 7045 Type *ValTy = getMemInstValueType(I); 7046 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7047 unsigned AS = getLoadStoreAddressSpace(I); 7048 7049 auto Group = getInterleavedAccessGroup(I); 7050 assert(Group && "Fail to get an interleaved access group."); 7051 7052 unsigned InterleaveFactor = Group->getFactor(); 7053 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7054 7055 // Holds the indices of existing members in an interleaved load group. 7056 // An interleaved store group doesn't need this as it doesn't allow gaps. 7057 SmallVector<unsigned, 4> Indices; 7058 if (isa<LoadInst>(I)) { 7059 for (unsigned i = 0; i < InterleaveFactor; i++) 7060 if (Group->getMember(i)) 7061 Indices.push_back(i); 7062 } 7063 7064 // Calculate the cost of the whole interleaved group. 7065 bool UseMaskForGaps = 7066 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 7067 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7068 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7069 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7070 7071 if (Group->isReverse()) { 7072 // TODO: Add support for reversed masked interleaved access. 7073 assert(!Legal->isMaskRequired(I) && 7074 "Reverse masked interleaved access not supported."); 7075 Cost += 7076 Group->getNumMembers() * 7077 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7078 } 7079 return Cost; 7080 } 7081 7082 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 7083 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7084 // Early exit for no inloop reductions 7085 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7086 return InstructionCost::getInvalid(); 7087 auto *VectorTy = cast<VectorType>(Ty); 7088 7089 // We are looking for a pattern of, and finding the minimal acceptable cost: 7090 // reduce(mul(ext(A), ext(B))) or 7091 // reduce(mul(A, B)) or 7092 // reduce(ext(A)) or 7093 // reduce(A). 7094 // The basic idea is that we walk down the tree to do that, finding the root 7095 // reduction instruction in InLoopReductionImmediateChains. From there we find 7096 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7097 // of the components. If the reduction cost is lower, then we return it for the 7098 // reduction instruction and 0 for the other instructions in the pattern. If 7099 // it is not, we return an invalid cost specifying the original cost method 7100 // should be used.
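//
// For illustration only (the numbers below are made up and not taken from any
// particular target), consider a dot-product style loop whose scalar source is
// roughly:
//
//   int32_t Sum = 0;
//   for (int i = 0; i < n; ++i)
//     Sum += (int32_t)A[i] * (int32_t)B[i];   // A, B are int8_t arrays
//
// After vectorization the in-loop chain looks like
//   reduce.add(mul(sext(<VF x i8> %a to <VF x i32>),
//                  sext(<VF x i8> %b to <VF x i32>)))
// and the comparison made below is, in effect,
//   RedCost(extended MLA reduction)  vs  2 * ExtCost + MulCost + BaseCost.
// If a target reported, say, RedCost = 2 while 2 * ExtCost + MulCost +
// BaseCost = 1 + 1 + 1 + 2 = 5, the whole pattern would be costed as the
// single reduction and the mul/ext members would each be costed as 0.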
7101 Instruction *RetI = I; 7102 if ((RetI->getOpcode() == Instruction::SExt || 7103 RetI->getOpcode() == Instruction::ZExt)) { 7104 if (!RetI->hasOneUser()) 7105 return InstructionCost::getInvalid(); 7106 RetI = RetI->user_back(); 7107 } 7108 if (RetI->getOpcode() == Instruction::Mul && 7109 RetI->user_back()->getOpcode() == Instruction::Add) { 7110 if (!RetI->hasOneUser()) 7111 return InstructionCost::getInvalid(); 7112 RetI = RetI->user_back(); 7113 } 7114 7115 // Test if the found instruction is a reduction, and if not return an invalid 7116 // cost specifying the parent to use the original cost modelling. 7117 if (!InLoopReductionImmediateChains.count(RetI)) 7118 return InstructionCost::getInvalid(); 7119 7120 // Find the reduction this chain is a part of and calculate the basic cost of 7121 // the reduction on its own. 7122 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7123 Instruction *ReductionPhi = LastChain; 7124 while (!isa<PHINode>(ReductionPhi)) 7125 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7126 7127 RecurrenceDescriptor RdxDesc = 7128 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7129 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7130 RdxDesc.getOpcode(), VectorTy, false, CostKind); 7131 7132 // Get the operand that was not the reduction chain and match it to one of the 7133 // patterns, returning the better cost if it is found. 7134 Instruction *RedOp = RetI->getOperand(1) == LastChain 7135 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7136 : dyn_cast<Instruction>(RetI->getOperand(1)); 7137 7138 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7139 7140 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 7141 !TheLoop->isLoopInvariant(RedOp)) { 7142 bool IsUnsigned = isa<ZExtInst>(RedOp); 7143 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7144 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7145 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7146 CostKind); 7147 7148 InstructionCost ExtCost = 7149 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7150 TTI::CastContextHint::None, CostKind, RedOp); 7151 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7152 return I == RetI ? *RedCost.getValue() : 0; 7153 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7154 Instruction *Mul = RedOp; 7155 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7156 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7157 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7158 Op0->getOpcode() == Op1->getOpcode() && 7159 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7160 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7161 bool IsUnsigned = isa<ZExtInst>(Op0); 7162 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7163 // reduce(mul(ext, ext)) 7164 InstructionCost ExtCost = 7165 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7166 TTI::CastContextHint::None, CostKind, Op0); 7167 InstructionCost MulCost = 7168 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7169 7170 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7171 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7172 CostKind); 7173 7174 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7175 return I == RetI ? 
*RedCost.getValue() : 0; 7176 } else { 7177 InstructionCost MulCost = 7178 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7179 7180 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7181 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7182 CostKind); 7183 7184 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7185 return I == RetI ? *RedCost.getValue() : 0; 7186 } 7187 } 7188 7189 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7190 } 7191 7192 InstructionCost 7193 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7194 ElementCount VF) { 7195 // Calculate scalar cost only. Vectorization cost should be ready at this 7196 // moment. 7197 if (VF.isScalar()) { 7198 Type *ValTy = getMemInstValueType(I); 7199 const Align Alignment = getLoadStoreAlignment(I); 7200 unsigned AS = getLoadStoreAddressSpace(I); 7201 7202 return TTI.getAddressComputationCost(ValTy) + 7203 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7204 TTI::TCK_RecipThroughput, I); 7205 } 7206 return getWideningCost(I, VF); 7207 } 7208 7209 LoopVectorizationCostModel::VectorizationCostTy 7210 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7211 ElementCount VF) { 7212 // If we know that this instruction will remain uniform, check the cost of 7213 // the scalar version. 7214 if (isUniformAfterVectorization(I, VF)) 7215 VF = ElementCount::getFixed(1); 7216 7217 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7218 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7219 7220 // Forced scalars do not have any scalarization overhead. 7221 auto ForcedScalar = ForcedScalars.find(VF); 7222 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7223 auto InstSet = ForcedScalar->second; 7224 if (InstSet.count(I)) 7225 return VectorizationCostTy( 7226 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7227 VF.getKnownMinValue()), 7228 false); 7229 } 7230 7231 Type *VectorTy; 7232 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7233 7234 bool TypeNotScalarized = 7235 VF.isVector() && VectorTy->isVectorTy() && 7236 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7237 return VectorizationCostTy(C, TypeNotScalarized); 7238 } 7239 7240 InstructionCost 7241 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7242 ElementCount VF) const { 7243 7244 if (VF.isScalable()) 7245 return InstructionCost::getInvalid(); 7246 7247 if (VF.isScalar()) 7248 return 0; 7249 7250 InstructionCost Cost = 0; 7251 Type *RetTy = ToVectorTy(I->getType(), VF); 7252 if (!RetTy->isVoidTy() && 7253 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7254 Cost += TTI.getScalarizationOverhead( 7255 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7256 true, false); 7257 7258 // Some targets keep addresses scalar. 7259 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7260 return Cost; 7261 7262 // Some targets support efficient element stores. 7263 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7264 return Cost; 7265 7266 // Collect operands to consider. 7267 CallInst *CI = dyn_cast<CallInst>(I); 7268 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7269 7270 // Skip operands that do not require extraction/scalarization and do not incur 7271 // any overhead. 
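//
// A simplified illustration (figures are not from a real target): scalarizing
//   %add = add <4 x i32> %x, %y        ; VF = 4
// produces 4 scalar adds plus the glue accounted for here:
//   - up to 4 insertelements to rebuild the <4 x i32> result, and
//   - up to 4 extractelements per vector operand feeding the scalar adds,
// except for operands skipped by filterExtractingOperands below. With a unit
// cost per insert/extract this would add 4 + 2 * 4 = 12 on top of the four
// scalar adds.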
7272 SmallVector<Type *> Tys; 7273 for (auto *V : filterExtractingOperands(Ops, VF)) 7274 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7275 return Cost + TTI.getOperandsScalarizationOverhead( 7276 filterExtractingOperands(Ops, VF), Tys); 7277 } 7278 7279 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7280 if (VF.isScalar()) 7281 return; 7282 NumPredStores = 0; 7283 for (BasicBlock *BB : TheLoop->blocks()) { 7284 // For each instruction in the old loop. 7285 for (Instruction &I : *BB) { 7286 Value *Ptr = getLoadStorePointerOperand(&I); 7287 if (!Ptr) 7288 continue; 7289 7290 // TODO: We should generate better code and update the cost model for 7291 // predicated uniform stores. Today they are treated as any other 7292 // predicated store (see added test cases in 7293 // invariant-store-vectorization.ll). 7294 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7295 NumPredStores++; 7296 7297 if (Legal->isUniformMemOp(I)) { 7298 // TODO: Avoid replicating loads and stores instead of 7299 // relying on instcombine to remove them. 7300 // Load: Scalar load + broadcast 7301 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7302 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7303 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7304 continue; 7305 } 7306 7307 // We assume that widening is the best solution when possible. 7308 if (memoryInstructionCanBeWidened(&I, VF)) { 7309 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7310 int ConsecutiveStride = 7311 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7312 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7313 "Expected consecutive stride."); 7314 InstWidening Decision = 7315 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7316 setWideningDecision(&I, VF, Decision, Cost); 7317 continue; 7318 } 7319 7320 // Choose between Interleaving, Gather/Scatter or Scalarization. 7321 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7322 unsigned NumAccesses = 1; 7323 if (isAccessInterleaved(&I)) { 7324 auto Group = getInterleavedAccessGroup(&I); 7325 assert(Group && "Fail to get an interleaved access group."); 7326 7327 // Make one decision for the whole group. 7328 if (getWideningDecision(&I, VF) != CM_Unknown) 7329 continue; 7330 7331 NumAccesses = Group->getNumMembers(); 7332 if (interleavedAccessCanBeWidened(&I, VF)) 7333 InterleaveCost = getInterleaveGroupCost(&I, VF); 7334 } 7335 7336 InstructionCost GatherScatterCost = 7337 isLegalGatherOrScatter(&I) 7338 ? getGatherScatterCost(&I, VF) * NumAccesses 7339 : InstructionCost::getInvalid(); 7340 7341 InstructionCost ScalarizationCost = 7342 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7343 7344 // Choose better solution for the current VF, 7345 // write down this decision and use it during vectorization. 7346 InstructionCost Cost; 7347 InstWidening Decision; 7348 if (InterleaveCost <= GatherScatterCost && 7349 InterleaveCost < ScalarizationCost) { 7350 Decision = CM_Interleave; 7351 Cost = InterleaveCost; 7352 } else if (GatherScatterCost < ScalarizationCost) { 7353 Decision = CM_GatherScatter; 7354 Cost = GatherScatterCost; 7355 } else { 7356 assert(!VF.isScalable() && 7357 "We cannot yet scalarise for scalable vectors"); 7358 Decision = CM_Scalarize; 7359 Cost = ScalarizationCost; 7360 } 7361 // If the instructions belongs to an interleave group, the whole group 7362 // receives the same decision. 
The whole group receives the cost, but 7363 // the cost will actually be assigned to one instruction. 7364 if (auto Group = getInterleavedAccessGroup(&I)) 7365 setWideningDecision(Group, VF, Decision, Cost); 7366 else 7367 setWideningDecision(&I, VF, Decision, Cost); 7368 } 7369 } 7370 7371 // Make sure that any load of address and any other address computation 7372 // remains scalar unless there is gather/scatter support. This avoids 7373 // inevitable extracts into address registers, and also has the benefit of 7374 // activating LSR more, since that pass can't optimize vectorized 7375 // addresses. 7376 if (TTI.prefersVectorizedAddressing()) 7377 return; 7378 7379 // Start with all scalar pointer uses. 7380 SmallPtrSet<Instruction *, 8> AddrDefs; 7381 for (BasicBlock *BB : TheLoop->blocks()) 7382 for (Instruction &I : *BB) { 7383 Instruction *PtrDef = 7384 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7385 if (PtrDef && TheLoop->contains(PtrDef) && 7386 getWideningDecision(&I, VF) != CM_GatherScatter) 7387 AddrDefs.insert(PtrDef); 7388 } 7389 7390 // Add all instructions used to generate the addresses. 7391 SmallVector<Instruction *, 4> Worklist; 7392 append_range(Worklist, AddrDefs); 7393 while (!Worklist.empty()) { 7394 Instruction *I = Worklist.pop_back_val(); 7395 for (auto &Op : I->operands()) 7396 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7397 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7398 AddrDefs.insert(InstOp).second) 7399 Worklist.push_back(InstOp); 7400 } 7401 7402 for (auto *I : AddrDefs) { 7403 if (isa<LoadInst>(I)) { 7404 // Setting the desired widening decision should ideally be handled in 7405 // by cost functions, but since this involves the task of finding out 7406 // if the loaded register is involved in an address computation, it is 7407 // instead changed here when we know this is the case. 7408 InstWidening Decision = getWideningDecision(I, VF); 7409 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7410 // Scalarize a widened load of address. 7411 setWideningDecision( 7412 I, VF, CM_Scalarize, 7413 (VF.getKnownMinValue() * 7414 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7415 else if (auto Group = getInterleavedAccessGroup(I)) { 7416 // Scalarize an interleave group of address loads. 7417 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7418 if (Instruction *Member = Group->getMember(I)) 7419 setWideningDecision( 7420 Member, VF, CM_Scalarize, 7421 (VF.getKnownMinValue() * 7422 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7423 } 7424 } 7425 } else 7426 // Make sure I gets scalarized and a cost estimate without 7427 // scalarization overhead. 
7428 ForcedScalars[VF].insert(I); 7429 } 7430 } 7431 7432 InstructionCost 7433 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7434 Type *&VectorTy) { 7435 Type *RetTy = I->getType(); 7436 if (canTruncateToMinimalBitwidth(I, VF)) 7437 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7438 auto SE = PSE.getSE(); 7439 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7440 7441 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 7442 ElementCount VF) -> bool { 7443 if (VF.isScalar()) 7444 return true; 7445 7446 auto Scalarized = InstsToScalarize.find(VF); 7447 assert(Scalarized != InstsToScalarize.end() && 7448 "VF not yet analyzed for scalarization profitability"); 7449 return !Scalarized->second.count(I) && 7450 llvm::all_of(I->users(), [&](User *U) { 7451 auto *UI = cast<Instruction>(U); 7452 return !Scalarized->second.count(UI); 7453 }); 7454 }; 7455 (void) hasSingleCopyAfterVectorization; 7456 7457 if (isScalarAfterVectorization(I, VF)) { 7458 // With the exception of GEPs and PHIs, after scalarization there should 7459 // only be one copy of the instruction generated in the loop. This is 7460 // because the VF is either 1, or any instructions that need scalarizing 7461 // have already been dealt with by the time we get here. As a result, 7462 // we don't have to multiply the instruction cost by VF. 7463 assert(I->getOpcode() == Instruction::GetElementPtr || 7464 I->getOpcode() == Instruction::PHI || 7465 (I->getOpcode() == Instruction::BitCast && 7466 I->getType()->isPointerTy()) || 7467 hasSingleCopyAfterVectorization(I, VF)); 7468 VectorTy = RetTy; 7469 } else 7470 VectorTy = ToVectorTy(RetTy, VF); 7471 7472 // TODO: We need to estimate the cost of intrinsic calls. 7473 switch (I->getOpcode()) { 7474 case Instruction::GetElementPtr: 7475 // We mark this instruction as zero-cost because the cost of GEPs in 7476 // vectorized code depends on whether the corresponding memory instruction 7477 // is scalarized or not. Therefore, we handle GEPs with the memory 7478 // instruction cost. 7479 return 0; 7480 case Instruction::Br: { 7481 // In cases of scalarized and predicated instructions, there will be VF 7482 // predicated blocks in the vectorized loop. Each branch around these 7483 // blocks also requires an extract of its vector compare i1 element. 7484 bool ScalarPredicatedBB = false; 7485 BranchInst *BI = cast<BranchInst>(I); 7486 if (VF.isVector() && BI->isConditional() && 7487 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7488 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7489 ScalarPredicatedBB = true; 7490 7491 if (ScalarPredicatedBB) { 7492 // Return cost for branches around scalarized and predicated blocks. 7493 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7494 auto *Vec_i1Ty = 7495 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7496 return (TTI.getScalarizationOverhead( 7497 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7498 false, true) + 7499 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7500 VF.getKnownMinValue())); 7501 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7502 // The back-edge branch will remain, as will all scalar branches. 7503 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7504 else 7505 // This branch will be eliminated by if-conversion.
7506 return 0; 7507 // Note: We currently assume zero cost for an unconditional branch inside 7508 // a predicated block since it will become a fall-through, although we 7509 // may decide in the future to call TTI for all branches. 7510 } 7511 case Instruction::PHI: { 7512 auto *Phi = cast<PHINode>(I); 7513 7514 // First-order recurrences are replaced by vector shuffles inside the loop. 7515 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7516 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7517 return TTI.getShuffleCost( 7518 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7519 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7520 7521 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7522 // converted into select instructions. We require N - 1 selects per phi 7523 // node, where N is the number of incoming values. 7524 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7525 return (Phi->getNumIncomingValues() - 1) * 7526 TTI.getCmpSelInstrCost( 7527 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7528 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7529 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7530 7531 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7532 } 7533 case Instruction::UDiv: 7534 case Instruction::SDiv: 7535 case Instruction::URem: 7536 case Instruction::SRem: 7537 // If we have a predicated instruction, it may not be executed for each 7538 // vector lane. Get the scalarization cost and scale this amount by the 7539 // probability of executing the predicated block. If the instruction is not 7540 // predicated, we fall through to the next case. 7541 if (VF.isVector() && isScalarWithPredication(I)) { 7542 InstructionCost Cost = 0; 7543 7544 // These instructions have a non-void type, so account for the phi nodes 7545 // that we will create. This cost is likely to be zero. The phi node 7546 // cost, if any, should be scaled by the block probability because it 7547 // models a copy at the end of each predicated block. 7548 Cost += VF.getKnownMinValue() * 7549 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7550 7551 // The cost of the non-predicated instruction. 7552 Cost += VF.getKnownMinValue() * 7553 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7554 7555 // The cost of insertelement and extractelement instructions needed for 7556 // scalarization. 7557 Cost += getScalarizationOverhead(I, VF); 7558 7559 // Scale the cost by the probability of executing the predicated blocks. 7560 // This assumes the predicated block for each vector lane is equally 7561 // likely. 7562 return Cost / getReciprocalPredBlockProb(); 7563 } 7564 LLVM_FALLTHROUGH; 7565 case Instruction::Add: 7566 case Instruction::FAdd: 7567 case Instruction::Sub: 7568 case Instruction::FSub: 7569 case Instruction::Mul: 7570 case Instruction::FMul: 7571 case Instruction::FDiv: 7572 case Instruction::FRem: 7573 case Instruction::Shl: 7574 case Instruction::LShr: 7575 case Instruction::AShr: 7576 case Instruction::And: 7577 case Instruction::Or: 7578 case Instruction::Xor: { 7579 // Since we will replace the stride by 1 the multiplication should go away. 
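//
// For example, if the loop was versioned on a symbolic stride:
//
//   for (int i = 0; i < n; ++i)
//     Sum += A[i * Stride];
//
// and runtime checks guarantee Stride == 1 in the vector loop, the
// "i * Stride" multiply folds away, so charging it a non-zero cost here would
// only penalise the vectorized loop for an instruction that is not expected
// to survive.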
7580 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7581 return 0; 7582 7583 // Detect reduction patterns 7584 InstructionCost RedCost; 7585 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7586 .isValid()) 7587 return RedCost; 7588 7589 // Certain instructions can be cheaper to vectorize if they have a constant 7590 // second vector operand. One example of this are shifts on x86. 7591 Value *Op2 = I->getOperand(1); 7592 TargetTransformInfo::OperandValueProperties Op2VP; 7593 TargetTransformInfo::OperandValueKind Op2VK = 7594 TTI.getOperandInfo(Op2, Op2VP); 7595 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7596 Op2VK = TargetTransformInfo::OK_UniformValue; 7597 7598 SmallVector<const Value *, 4> Operands(I->operand_values()); 7599 return TTI.getArithmeticInstrCost( 7600 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7601 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7602 } 7603 case Instruction::FNeg: { 7604 return TTI.getArithmeticInstrCost( 7605 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7606 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7607 TargetTransformInfo::OP_None, I->getOperand(0), I); 7608 } 7609 case Instruction::Select: { 7610 SelectInst *SI = cast<SelectInst>(I); 7611 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7612 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7613 7614 const Value *Op0, *Op1; 7615 using namespace llvm::PatternMatch; 7616 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7617 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7618 // select x, y, false --> x & y 7619 // select x, true, y --> x | y 7620 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7621 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7622 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7623 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7624 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7625 Op1->getType()->getScalarSizeInBits() == 1); 7626 7627 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7628 return TTI.getArithmeticInstrCost( 7629 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7630 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7631 } 7632 7633 Type *CondTy = SI->getCondition()->getType(); 7634 if (!ScalarCond) 7635 CondTy = VectorType::get(CondTy, VF); 7636 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7637 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7638 } 7639 case Instruction::ICmp: 7640 case Instruction::FCmp: { 7641 Type *ValTy = I->getOperand(0)->getType(); 7642 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7643 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7644 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7645 VectorTy = ToVectorTy(ValTy, VF); 7646 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7647 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7648 } 7649 case Instruction::Store: 7650 case Instruction::Load: { 7651 ElementCount Width = VF; 7652 if (Width.isVector()) { 7653 InstWidening Decision = getWideningDecision(I, Width); 7654 assert(Decision != CM_Unknown && 7655 "CM decision should be taken at this point"); 7656 if (Decision == CM_Scalarize) 7657 Width = ElementCount::getFixed(1); 7658 } 7659 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7660 return getMemoryInstructionCost(I, VF); 7661 } 7662 case Instruction::BitCast: 7663 if (I->getType()->isPointerTy()) 7664 return 0; 7665 LLVM_FALLTHROUGH; 7666 case Instruction::ZExt: 7667 case Instruction::SExt: 7668 case Instruction::FPToUI: 7669 case Instruction::FPToSI: 7670 case Instruction::FPExt: 7671 case Instruction::PtrToInt: 7672 case Instruction::IntToPtr: 7673 case Instruction::SIToFP: 7674 case Instruction::UIToFP: 7675 case Instruction::Trunc: 7676 case Instruction::FPTrunc: { 7677 // Computes the CastContextHint from a Load/Store instruction. 7678 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7679 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7680 "Expected a load or a store!"); 7681 7682 if (VF.isScalar() || !TheLoop->contains(I)) 7683 return TTI::CastContextHint::Normal; 7684 7685 switch (getWideningDecision(I, VF)) { 7686 case LoopVectorizationCostModel::CM_GatherScatter: 7687 return TTI::CastContextHint::GatherScatter; 7688 case LoopVectorizationCostModel::CM_Interleave: 7689 return TTI::CastContextHint::Interleave; 7690 case LoopVectorizationCostModel::CM_Scalarize: 7691 case LoopVectorizationCostModel::CM_Widen: 7692 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7693 : TTI::CastContextHint::Normal; 7694 case LoopVectorizationCostModel::CM_Widen_Reverse: 7695 return TTI::CastContextHint::Reversed; 7696 case LoopVectorizationCostModel::CM_Unknown: 7697 llvm_unreachable("Instr did not go through cost modelling?"); 7698 } 7699 7700 llvm_unreachable("Unhandled case!"); 7701 }; 7702 7703 unsigned Opcode = I->getOpcode(); 7704 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7705 // For Trunc, the context is the only user, which must be a StoreInst. 7706 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7707 if (I->hasOneUse()) 7708 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7709 CCH = ComputeCCH(Store); 7710 } 7711 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7712 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7713 Opcode == Instruction::FPExt) { 7714 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7715 CCH = ComputeCCH(Load); 7716 } 7717 7718 // We optimize the truncation of induction variables having constant 7719 // integer steps. The cost of these truncations is the same as the scalar 7720 // operation. 7721 if (isOptimizableIVTruncate(I, VF)) { 7722 auto *Trunc = cast<TruncInst>(I); 7723 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7724 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7725 } 7726 7727 // Detect reduction patterns 7728 InstructionCost RedCost; 7729 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7730 .isValid()) 7731 return RedCost; 7732 7733 Type *SrcScalarTy = I->getOperand(0)->getType(); 7734 Type *SrcVecTy = 7735 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7736 if (canTruncateToMinimalBitwidth(I, VF)) { 7737 // This cast is going to be shrunk. This may remove the cast or it might 7738 // turn it into slightly different cast. For example, if MinBW == 16, 7739 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7740 // 7741 // Calculate the modified src and dest types. 7742 Type *MinVecTy = VectorTy; 7743 if (Opcode == Instruction::Trunc) { 7744 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7745 VectorTy = 7746 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7747 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7748 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7749 VectorTy = 7750 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7751 } 7752 } 7753 7754 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7755 } 7756 case Instruction::Call: { 7757 bool NeedToScalarize; 7758 CallInst *CI = cast<CallInst>(I); 7759 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7760 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7761 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7762 return std::min(CallCost, IntrinsicCost); 7763 } 7764 return CallCost; 7765 } 7766 case Instruction::ExtractValue: 7767 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7768 default: 7769 // This opcode is unknown. Assume that it is the same as 'mul'. 7770 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7771 } // end of switch. 
7772 } 7773 7774 char LoopVectorize::ID = 0; 7775 7776 static const char lv_name[] = "Loop Vectorization"; 7777 7778 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7779 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7780 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7781 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7782 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7783 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7784 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7785 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7786 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7787 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7788 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7789 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7790 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7791 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7792 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7793 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7794 7795 namespace llvm { 7796 7797 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7798 7799 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7800 bool VectorizeOnlyWhenForced) { 7801 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7802 } 7803 7804 } // end namespace llvm 7805 7806 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7807 // Check if the pointer operand of a load or store instruction is 7808 // consecutive. 7809 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7810 return Legal->isConsecutivePtr(Ptr); 7811 return false; 7812 } 7813 7814 void LoopVectorizationCostModel::collectValuesToIgnore() { 7815 // Ignore ephemeral values. 7816 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7817 7818 // Ignore type-promoting instructions we identified during reduction 7819 // detection. 7820 for (auto &Reduction : Legal->getReductionVars()) { 7821 RecurrenceDescriptor &RedDes = Reduction.second; 7822 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7823 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7824 } 7825 // Ignore type-casting instructions we identified during induction 7826 // detection. 7827 for (auto &Induction : Legal->getInductionVars()) { 7828 InductionDescriptor &IndDes = Induction.second; 7829 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7830 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7831 } 7832 } 7833 7834 void LoopVectorizationCostModel::collectInLoopReductions() { 7835 for (auto &Reduction : Legal->getReductionVars()) { 7836 PHINode *Phi = Reduction.first; 7837 RecurrenceDescriptor &RdxDesc = Reduction.second; 7838 7839 // We don't collect reductions that are type promoted (yet). 7840 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7841 continue; 7842 7843 // If the target would prefer this reduction to happen "in-loop", then we 7844 // want to record it as such. 7845 unsigned Opcode = RdxDesc.getOpcode(); 7846 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7847 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7848 TargetTransformInfo::ReductionFlags())) 7849 continue; 7850 7851 // Check that we can correctly put the reductions into the loop, by 7852 // finding the chain of operations that leads from the phi to the loop 7853 // exit value. 
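//
// As a sketch of what getReductionOpChain is expected to return below, for a
// simple integer add reduction
//
//   %sum.phi  = phi i32 [ 0, %ph ], [ %sum.next, %loop ]
//   ...
//   %sum.next = add i32 %sum.phi, %val
//
// the chain is { %sum.next }, and InLoopReductionImmediateChains records
// %sum.next -> %sum.phi, which is what getReductionPatternCost above walks
// back through to find the reduction phi.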
7854 SmallVector<Instruction *, 4> ReductionOperations = 7855 RdxDesc.getReductionOpChain(Phi, TheLoop); 7856 bool InLoop = !ReductionOperations.empty(); 7857 if (InLoop) { 7858 InLoopReductionChains[Phi] = ReductionOperations; 7859 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7860 Instruction *LastChain = Phi; 7861 for (auto *I : ReductionOperations) { 7862 InLoopReductionImmediateChains[I] = LastChain; 7863 LastChain = I; 7864 } 7865 } 7866 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7867 << " reduction for phi: " << *Phi << "\n"); 7868 } 7869 } 7870 7871 // TODO: we could return a pair of values that specify the max VF and 7872 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7873 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment 7874 // doesn't have a cost model that can choose which plan to execute if 7875 // more than one is generated. 7876 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7877 LoopVectorizationCostModel &CM) { 7878 unsigned WidestType; 7879 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7880 return WidestVectorRegBits / WidestType; 7881 } 7882 7883 VectorizationFactor 7884 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7885 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7886 ElementCount VF = UserVF; 7887 // Outer loop handling: They may require CFG and instruction level 7888 // transformations before even evaluating whether vectorization is profitable. 7889 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7890 // the vectorization pipeline. 7891 if (!OrigLoop->isInnermost()) { 7892 // If the user doesn't provide a vectorization factor, determine a 7893 // reasonable one. 7894 if (UserVF.isZero()) { 7895 VF = ElementCount::getFixed(determineVPlanVF( 7896 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) 7897 .getFixedSize(), 7898 CM)); 7899 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7900 7901 // Make sure we have a VF > 1 for stress testing. 7902 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7903 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7904 << "overriding computed VF.\n"); 7905 VF = ElementCount::getFixed(4); 7906 } 7907 } 7908 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7909 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7910 "VF needs to be a power of two"); 7911 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7912 << "VF " << VF << " to build VPlans.\n"); 7913 buildVPlans(VF, VF); 7914 7915 // For VPlan build stress testing, we bail out after VPlan construction. 7916 if (VPlanBuildStressTest) 7917 return VectorizationFactor::Disabled(); 7918 7919 return {VF, 0 /*Cost*/}; 7920 } 7921 7922 LLVM_DEBUG( 7923 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7924 "VPlan-native path.\n"); 7925 return VectorizationFactor::Disabled(); 7926 } 7927 7928 Optional<VectorizationFactor> 7929 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7930 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7931 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7932 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved. 7933 return None; 7934 7935 // Invalidate interleave groups if all blocks of the loop will be predicated.
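//
// Rough intuition for the check that follows: when the tail is folded by
// masking, every access in the loop becomes a masked access. An interleave
// group such as the factor-2 load group in
//
//   for (int i = 0; i < n; ++i) { A += B[2 * i]; C += B[2 * i + 1]; }
//
// would then have to be lowered as a *masked* wide load plus shuffles, which
// is only worthwhile if the target reports masked-interleaved support; if it
// does not, the groups (and every decision derived from them) are dropped.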
7936 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7937 !useMaskedInterleavedAccesses(*TTI)) { 7938 LLVM_DEBUG( 7939 dbgs() 7940 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7941 "which requires masked-interleaved support.\n"); 7942 if (CM.InterleaveInfo.invalidateGroups()) 7943 // Invalidating interleave groups also requires invalidating all decisions 7944 // based on them, which includes widening decisions and uniform and scalar 7945 // values. 7946 CM.invalidateCostModelingDecisions(); 7947 } 7948 7949 ElementCount MaxVF = MaybeMaxVF.getValue(); 7950 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7951 7952 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7953 if (!UserVF.isZero() && 7954 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7955 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7956 // VFs here, this should be reverted to only use legal UserVFs once the 7957 // loop below supports scalable VFs. 7958 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7959 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7960 << " VF " << VF << ".\n"); 7961 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7962 "VF needs to be a power of two"); 7963 // Collect the instructions (and their associated costs) that will be more 7964 // profitable to scalarize. 7965 CM.selectUserVectorizationFactor(VF); 7966 CM.collectInLoopReductions(); 7967 buildVPlansWithVPRecipes(VF, VF); 7968 LLVM_DEBUG(printPlans(dbgs())); 7969 return {{VF, 0}}; 7970 } 7971 7972 assert(!MaxVF.isScalable() && 7973 "Scalable vectors not yet supported beyond this point"); 7974 7975 for (ElementCount VF = ElementCount::getFixed(1); 7976 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7977 // Collect Uniform and Scalar instructions after vectorization with VF. 7978 CM.collectUniformsAndScalars(VF); 7979 7980 // Collect the instructions (and their associated costs) that will be more 7981 // profitable to scalarize. 7982 if (VF.isVector()) 7983 CM.collectInstsToScalarize(VF); 7984 } 7985 7986 CM.collectInLoopReductions(); 7987 7988 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7989 LLVM_DEBUG(printPlans(dbgs())); 7990 if (MaxVF.isScalar()) 7991 return VectorizationFactor::Disabled(); 7992 7993 // Select the optimal vectorization factor. 7994 auto SelectedVF = CM.selectVectorizationFactor(MaxVF); 7995 7996 // Check if it is profitable to vectorize with runtime checks. 
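//
// As a hypothetical example of the threshold logic below: vectorizing
//
//   for (int i = 0; i < n; ++i) A[i] = B[i] + C[i];
//
// without aliasing information needs runtime overlap checks for the {A,B} and
// {A,C} pointer pairs. If the number of such checks exceeds the pragma
// threshold, or exceeds RuntimeMemoryCheckThreshold when the hints do not
// allow reordering, vectorization is abandoned and a remark is emitted
// instead.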
7997 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7998 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7999 bool PragmaThresholdReached = 8000 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 8001 bool ThresholdReached = 8002 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 8003 if ((ThresholdReached && !Hints.allowReordering()) || 8004 PragmaThresholdReached) { 8005 ORE->emit([&]() { 8006 return OptimizationRemarkAnalysisAliasing( 8007 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 8008 OrigLoop->getHeader()) 8009 << "loop not vectorized: cannot prove it is safe to reorder " 8010 "memory operations"; 8011 }); 8012 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 8013 Hints.emitRemarkWithHints(); 8014 return VectorizationFactor::Disabled(); 8015 } 8016 } 8017 return SelectedVF; 8018 } 8019 8020 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 8021 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 8022 << '\n'); 8023 BestVF = VF; 8024 BestUF = UF; 8025 8026 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 8027 return !Plan->hasVF(VF); 8028 }); 8029 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 8030 } 8031 8032 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 8033 DominatorTree *DT) { 8034 // Perform the actual loop transformation. 8035 8036 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8037 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 8038 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 8039 8040 VPTransformState State{ 8041 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 8042 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8043 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8044 State.CanonicalIV = ILV.Induction; 8045 8046 ILV.printDebugTracesAtStart(); 8047 8048 //===------------------------------------------------===// 8049 // 8050 // Notice: any optimization or new instruction that go 8051 // into the code below should also be implemented in 8052 // the cost-model. 8053 // 8054 //===------------------------------------------------===// 8055 8056 // 2. Copy and widen instructions from the old loop into the new loop. 8057 VPlans.front()->execute(&State); 8058 8059 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8060 // predication, updating analyses. 
8061 ILV.fixVectorizedLoop(State); 8062 8063 ILV.printDebugTracesAtEnd(); 8064 } 8065 8066 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8067 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8068 for (const auto &Plan : VPlans) 8069 if (PrintVPlansInDotFormat) 8070 Plan->printDOT(O); 8071 else 8072 Plan->print(O); 8073 } 8074 #endif 8075 8076 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8077 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8078 8079 // We create new control-flow for the vectorized loop, so the original exit 8080 // conditions will be dead after vectorization if it's only used by the 8081 // terminator 8082 SmallVector<BasicBlock*> ExitingBlocks; 8083 OrigLoop->getExitingBlocks(ExitingBlocks); 8084 for (auto *BB : ExitingBlocks) { 8085 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8086 if (!Cmp || !Cmp->hasOneUse()) 8087 continue; 8088 8089 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8090 if (!DeadInstructions.insert(Cmp).second) 8091 continue; 8092 8093 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8094 // TODO: can recurse through operands in general 8095 for (Value *Op : Cmp->operands()) { 8096 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8097 DeadInstructions.insert(cast<Instruction>(Op)); 8098 } 8099 } 8100 8101 // We create new "steps" for induction variable updates to which the original 8102 // induction variables map. An original update instruction will be dead if 8103 // all its users except the induction variable are dead. 8104 auto *Latch = OrigLoop->getLoopLatch(); 8105 for (auto &Induction : Legal->getInductionVars()) { 8106 PHINode *Ind = Induction.first; 8107 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8108 8109 // If the tail is to be folded by masking, the primary induction variable, 8110 // if exists, isn't dead: it will be used for masking. Don't kill it. 8111 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8112 continue; 8113 8114 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8115 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8116 })) 8117 DeadInstructions.insert(IndUpdate); 8118 8119 // We record as "Dead" also the type-casting instructions we had identified 8120 // during induction analysis. We don't need any handling for them in the 8121 // vectorized loop because we have proven that, under a proper runtime 8122 // test guarding the vectorized loop, the value of the phi, and the casted 8123 // value of the phi, are the same. The last instruction in this casting chain 8124 // will get its scalar/vector/widened def from the scalar/vector/widened def 8125 // of the respective phi node. Any other casts in the induction def-use chain 8126 // have no other uses outside the phi update chain, and will be ignored. 8127 InductionDescriptor &IndDes = Induction.second; 8128 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8129 DeadInstructions.insert(Casts.begin(), Casts.end()); 8130 } 8131 } 8132 8133 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8134 8135 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8136 8137 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 8138 Instruction::BinaryOps BinOp) { 8139 // When unrolling and the VF is 1, we only need to add a simple scalar. 
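//
// For instance, when interleaving a scalar loop by UF = 4 with step 1, the
// unrolled copies of an induction starting at %iv are roughly fed
//   %iv + 0, %iv + 1, %iv + 2, %iv + 3
// i.e. Val + StartIdx * Step for StartIdx = 0..3, which is exactly the scalar
// add/fadd built below; no splat or step vector is needed at VF = 1.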
8140 Type *Ty = Val->getType(); 8141 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8142 8143 if (Ty->isFloatingPointTy()) { 8144 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8145 8146 // Floating-point operations inherit FMF via the builder's flags. 8147 Value *MulOp = Builder.CreateFMul(C, Step); 8148 return Builder.CreateBinOp(BinOp, Val, MulOp); 8149 } 8150 Constant *C = ConstantInt::get(Ty, StartIdx); 8151 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8152 } 8153 8154 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8155 SmallVector<Metadata *, 4> MDs; 8156 // Reserve first location for self reference to the LoopID metadata node. 8157 MDs.push_back(nullptr); 8158 bool IsUnrollMetadata = false; 8159 MDNode *LoopID = L->getLoopID(); 8160 if (LoopID) { 8161 // First find existing loop unrolling disable metadata. 8162 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8163 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8164 if (MD) { 8165 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8166 IsUnrollMetadata = 8167 S && S->getString().startswith("llvm.loop.unroll.disable"); 8168 } 8169 MDs.push_back(LoopID->getOperand(i)); 8170 } 8171 } 8172 8173 if (!IsUnrollMetadata) { 8174 // Add runtime unroll disable metadata. 8175 LLVMContext &Context = L->getHeader()->getContext(); 8176 SmallVector<Metadata *, 1> DisableOperands; 8177 DisableOperands.push_back( 8178 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8179 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8180 MDs.push_back(DisableNode); 8181 MDNode *NewLoopID = MDNode::get(Context, MDs); 8182 // Set operand 0 to refer to the loop id itself. 8183 NewLoopID->replaceOperandWith(0, NewLoopID); 8184 L->setLoopID(NewLoopID); 8185 } 8186 } 8187 8188 //===--------------------------------------------------------------------===// 8189 // EpilogueVectorizerMainLoop 8190 //===--------------------------------------------------------------------===// 8191 8192 /// This function is partially responsible for generating the control flow 8193 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8194 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8195 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8196 Loop *Lp = createVectorLoopSkeleton(""); 8197 8198 // Generate the code to check the minimum iteration count of the vector 8199 // epilogue (see below). 8200 EPI.EpilogueIterationCountCheck = 8201 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8202 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8203 8204 // Generate the code to check any assumptions that we've made for SCEV 8205 // expressions. 8206 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8207 8208 // Generate the code that checks at runtime if arrays overlap. We put the 8209 // checks into a separate block to make the more common case of few elements 8210 // faster. 8211 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8212 8213 // Generate the iteration count check for the main loop, *after* the check 8214 // for the epilogue loop, so that the path-length is shorter for the case 8215 // that goes directly through the vector epilogue. The longer-path length for 8216 // the main loop is compensated for, by the gain from vectorizing the larger 8217 // trip count. Note: the branch will get updated later on when we vectorize 8218 // the epilogue. 
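//
// A rough sketch of the resulting control flow (see also the epilogue
// vectorization diagram in the Vectorizers documentation linked above):
// "iter.check" first bypasses straight to the scalar loop when the trip count
// is too small even for the epilogue VF; otherwise
// "vector.main.loop.iter.check" decides between the main vector loop and,
// once the second pass has run, the vector epilogue; after the main vector
// loop, "vec.epilog.iter.check" decides whether enough iterations remain for
// the epilogue or whether to finish in the scalar loop. Emitting the main
// check after the epilogue check keeps the path short for the small
// trip-count case, as described above.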
8219 EPI.MainLoopIterationCountCheck = 8220 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8221 8222 // Generate the induction variable. 8223 OldInduction = Legal->getPrimaryInduction(); 8224 Type *IdxTy = Legal->getWidestInductionType(); 8225 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8226 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8227 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8228 EPI.VectorTripCount = CountRoundDown; 8229 Induction = 8230 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8231 getDebugLocFromInstOrOperands(OldInduction)); 8232 8233 // Skip induction resume value creation here because they will be created in 8234 // the second pass. If we created them here, they wouldn't be used anyway, 8235 // because the vplan in the second pass still contains the inductions from the 8236 // original loop. 8237 8238 return completeLoopSkeleton(Lp, OrigLoopID); 8239 } 8240 8241 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8242 LLVM_DEBUG({ 8243 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8244 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8245 << ", Main Loop UF:" << EPI.MainLoopUF 8246 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8247 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8248 }); 8249 } 8250 8251 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8252 DEBUG_WITH_TYPE(VerboseDebug, { 8253 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8254 }); 8255 } 8256 8257 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8258 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8259 assert(L && "Expected valid Loop."); 8260 assert(Bypass && "Expected valid bypass basic block."); 8261 unsigned VFactor = 8262 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8263 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8264 Value *Count = getOrCreateTripCount(L); 8265 // Reuse existing vector loop preheader for TC checks. 8266 // Note that new preheader block is generated for vector loop. 8267 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8268 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8269 8270 // Generate code to check if the loop's trip count is less than VF * UF of the 8271 // main vector loop. 8272 auto P = 8273 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8274 8275 Value *CheckMinIters = Builder.CreateICmp( 8276 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8277 "min.iters.check"); 8278 8279 if (!ForEpilogue) 8280 TCCheckBlock->setName("vector.main.loop.iter.check"); 8281 8282 // Create new preheader for vector loop. 8283 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8284 DT, LI, nullptr, "vector.ph"); 8285 8286 if (ForEpilogue) { 8287 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8288 DT->getNode(Bypass)->getIDom()) && 8289 "TC check is expected to dominate Bypass"); 8290 8291 // Update dominator for Bypass & LoopExit. 8292 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8293 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8294 8295 LoopBypassBlocks.push_back(TCCheckBlock); 8296 8297 // Save the trip count so we don't have to regenerate it in the 8298 // vec.epilog.iter.check. This is safe to do because the trip count 8299 // generated here dominates the vector epilog iter check. 
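//
// As a concrete (hypothetical) example of the comparison built above: with a
// main loop of VF = 8, UF = 2 and an epilogue of VF = 4, UF = 1, the first
// pass emits "TC < 4" in iter.check (ULE rather than ULT when a scalar
// epilogue is required) and "TC < 16" in vector.main.loop.iter.check, so a
// trip count of, say, 10 skips the main vector loop but can still be handled
// by the vector epilogue.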
8300 EPI.TripCount = Count; 8301 } 8302 8303 ReplaceInstWithInst( 8304 TCCheckBlock->getTerminator(), 8305 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8306 8307 return TCCheckBlock; 8308 } 8309 8310 //===--------------------------------------------------------------------===// 8311 // EpilogueVectorizerEpilogueLoop 8312 //===--------------------------------------------------------------------===// 8313 8314 /// This function is partially responsible for generating the control flow 8315 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8316 BasicBlock * 8317 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8318 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8319 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8320 8321 // Now, compare the remaining count and if there aren't enough iterations to 8322 // execute the vectorized epilogue skip to the scalar part. 8323 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8324 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8325 LoopVectorPreHeader = 8326 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8327 LI, nullptr, "vec.epilog.ph"); 8328 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8329 VecEpilogueIterationCountCheck); 8330 8331 // Adjust the control flow taking the state info from the main loop 8332 // vectorization into account. 8333 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8334 "expected this to be saved from the previous pass."); 8335 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8336 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8337 8338 DT->changeImmediateDominator(LoopVectorPreHeader, 8339 EPI.MainLoopIterationCountCheck); 8340 8341 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8342 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8343 8344 if (EPI.SCEVSafetyCheck) 8345 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8346 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8347 if (EPI.MemSafetyCheck) 8348 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8349 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8350 8351 DT->changeImmediateDominator( 8352 VecEpilogueIterationCountCheck, 8353 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8354 8355 DT->changeImmediateDominator(LoopScalarPreHeader, 8356 EPI.EpilogueIterationCountCheck); 8357 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8358 8359 // Keep track of bypass blocks, as they feed start values to the induction 8360 // phis in the scalar loop preheader. 8361 if (EPI.SCEVSafetyCheck) 8362 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8363 if (EPI.MemSafetyCheck) 8364 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8365 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8366 8367 // Generate a resume induction for the vector epilogue and put it in the 8368 // vector epilogue preheader 8369 Type *IdxTy = Legal->getWidestInductionType(); 8370 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8371 LoopVectorPreHeader->getFirstNonPHI()); 8372 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8373 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8374 EPI.MainLoopIterationCountCheck); 8375 8376 // Generate the induction variable. 
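  // The epilogue induction starts at EPResumeVal: zero when the main vector
  // loop was bypassed by its iteration count check, and the main loop's
  // vector trip count otherwise, so the epilogue resumes exactly where the
  // main vector loop stopped.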
8377 OldInduction = Legal->getPrimaryInduction(); 8378 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8379 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8380 Value *StartIdx = EPResumeVal; 8381 Induction = 8382 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8383 getDebugLocFromInstOrOperands(OldInduction)); 8384 8385 // Generate induction resume values. These variables save the new starting 8386 // indexes for the scalar loop. They are used to test if there are any tail 8387 // iterations left once the vector loop has completed. 8388 // Note that when the vectorized epilogue is skipped due to iteration count 8389 // check, then the resume value for the induction variable comes from 8390 // the trip count of the main vector loop, hence passing the AdditionalBypass 8391 // argument. 8392 createInductionResumeValues(Lp, CountRoundDown, 8393 {VecEpilogueIterationCountCheck, 8394 EPI.VectorTripCount} /* AdditionalBypass */); 8395 8396 AddRuntimeUnrollDisableMetaData(Lp); 8397 return completeLoopSkeleton(Lp, OrigLoopID); 8398 } 8399 8400 BasicBlock * 8401 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( 8402 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) { 8403 8404 assert(EPI.TripCount && 8405 "Expected trip count to have been saved in the first pass."); 8406 assert( 8407 (!isa<Instruction>(EPI.TripCount) || 8408 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && 8409 "saved trip count does not dominate insertion point."); 8410 Value *TC = EPI.TripCount; 8411 IRBuilder<> Builder(Insert->getTerminator()); 8412 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); 8413 8414 // Generate code to check if the loop's trip count is less than VF * UF of the 8415 // vector epilogue loop. 8416 auto P = 8417 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8418 8419 Value *CheckMinIters = Builder.CreateICmp( 8420 P, Count, 8421 ConstantInt::get(Count->getType(), 8422 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8423 "min.epilog.iters.check"); 8424 8425 ReplaceInstWithInst( 8426 Insert->getTerminator(), 8427 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8428 8429 LoopBypassBlocks.push_back(Insert); 8430 return Insert; 8431 } 8432 8433 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8434 LLVM_DEBUG({ 8435 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8436 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8437 << ", Main Loop UF:" << EPI.MainLoopUF 8438 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8439 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8440 }); 8441 } 8442 8443 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8444 DEBUG_WITH_TYPE(VerboseDebug, { 8445 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8446 }); 8447 } 8448 8449 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8450 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8451 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8452 bool PredicateAtRangeStart = Predicate(Range.Start); 8453 8454 for (ElementCount TmpVF = Range.Start * 2; 8455 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8456 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8457 Range.End = TmpVF; 8458 break; 8459 } 8460 8461 return PredicateAtRangeStart; 8462 } 8463 8464 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8465 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8466 /// of VF's starting at a given VF and extending it as much as possible. Each 8467 /// vectorization decision can potentially shorten this sub-range during 8468 /// buildVPlan(). 8469 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8470 ElementCount MaxVF) { 8471 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8472 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8473 VFRange SubRange = {VF, MaxVFPlusOne}; 8474 VPlans.push_back(buildVPlan(SubRange)); 8475 VF = SubRange.End; 8476 } 8477 } 8478 8479 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8480 VPlanPtr &Plan) { 8481 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8482 8483 // Look for cached value. 8484 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8485 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8486 if (ECEntryIt != EdgeMaskCache.end()) 8487 return ECEntryIt->second; 8488 8489 VPValue *SrcMask = createBlockInMask(Src, Plan); 8490 8491 // The terminator has to be a branch inst! 8492 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8493 assert(BI && "Unexpected terminator found"); 8494 8495 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8496 return EdgeMaskCache[Edge] = SrcMask; 8497 8498 // If source is an exiting block, we know the exit edge is dynamically dead 8499 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8500 // adding uses of an otherwise potentially dead instruction. 
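  // For illustration (names hypothetical): if Src is guarded by %src.mask and
  // terminates in "br i1 %c, label %Dst, label %Other", the mask of the edge
  // towards %Dst is conceptually %src.mask & %c (and %src.mask & !%c towards
  // %Other). Below, that conjunction is modelled as
  // "select i1 %src.mask, i1 %edge.cond, i1 false" rather than an 'and', so a
  // poison edge condition on lanes where %src.mask is false does not leak
  // into the mask.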
8501 if (OrigLoop->isLoopExiting(Src)) 8502 return EdgeMaskCache[Edge] = SrcMask; 8503 8504 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8505 assert(EdgeMask && "No Edge Mask found for condition"); 8506 8507 if (BI->getSuccessor(0) != Dst) 8508 EdgeMask = Builder.createNot(EdgeMask); 8509 8510 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8511 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8512 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8513 // The select version does not introduce new UB if SrcMask is false and 8514 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8515 VPValue *False = Plan->getOrAddVPValue( 8516 ConstantInt::getFalse(BI->getCondition()->getType())); 8517 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8518 } 8519 8520 return EdgeMaskCache[Edge] = EdgeMask; 8521 } 8522 8523 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8524 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8525 8526 // Look for cached value. 8527 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8528 if (BCEntryIt != BlockMaskCache.end()) 8529 return BCEntryIt->second; 8530 8531 // All-one mask is modelled as no-mask following the convention for masked 8532 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8533 VPValue *BlockMask = nullptr; 8534 8535 if (OrigLoop->getHeader() == BB) { 8536 if (!CM.blockNeedsPredication(BB)) 8537 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8538 8539 // Create the block in mask as the first non-phi instruction in the block. 8540 VPBuilder::InsertPointGuard Guard(Builder); 8541 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8542 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8543 8544 // Introduce the early-exit compare IV <= BTC to form header block mask. 8545 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8546 // Start by constructing the desired canonical IV. 8547 VPValue *IV = nullptr; 8548 if (Legal->getPrimaryInduction()) 8549 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8550 else { 8551 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8552 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8553 IV = IVRecipe->getVPSingleValue(); 8554 } 8555 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8556 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8557 8558 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8559 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8560 // as a second argument, we only pass the IV here and extract the 8561 // tripcount from the transform state where codegen of the VP instructions 8562 // happen. 8563 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8564 } else { 8565 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8566 } 8567 return BlockMaskCache[BB] = BlockMask; 8568 } 8569 8570 // This is the block mask. We OR all incoming edges. 8571 for (auto *Predecessor : predecessors(BB)) { 8572 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8573 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8574 return BlockMaskCache[BB] = EdgeMask; 8575 8576 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8577 BlockMask = EdgeMask; 8578 continue; 8579 } 8580 8581 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8582 } 8583 8584 return BlockMaskCache[BB] = BlockMask; 8585 } 8586 8587 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8588 ArrayRef<VPValue *> Operands, 8589 VFRange &Range, 8590 VPlanPtr &Plan) { 8591 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8592 "Must be called with either a load or store"); 8593 8594 auto willWiden = [&](ElementCount VF) -> bool { 8595 if (VF.isScalar()) 8596 return false; 8597 LoopVectorizationCostModel::InstWidening Decision = 8598 CM.getWideningDecision(I, VF); 8599 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8600 "CM decision should be taken at this point."); 8601 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8602 return true; 8603 if (CM.isScalarAfterVectorization(I, VF) || 8604 CM.isProfitableToScalarize(I, VF)) 8605 return false; 8606 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8607 }; 8608 8609 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8610 return nullptr; 8611 8612 VPValue *Mask = nullptr; 8613 if (Legal->isMaskRequired(I)) 8614 Mask = createBlockInMask(I->getParent(), Plan); 8615 8616 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8617 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8618 8619 StoreInst *Store = cast<StoreInst>(I); 8620 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8621 Mask); 8622 } 8623 8624 VPWidenIntOrFpInductionRecipe * 8625 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8626 ArrayRef<VPValue *> Operands) const { 8627 // Check if this is an integer or fp induction. If so, build the recipe that 8628 // produces its scalar and vector values. 8629 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8630 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8631 II.getKind() == InductionDescriptor::IK_FpInduction) { 8632 assert(II.getStartValue() == 8633 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8634 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8635 return new VPWidenIntOrFpInductionRecipe( 8636 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8637 } 8638 8639 return nullptr; 8640 } 8641 8642 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8643 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8644 VPlan &Plan) const { 8645 // Optimize the special case where the source is a constant integer 8646 // induction variable. Notice that we can only optimize the 'trunc' case 8647 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8648 // (c) other casts depend on pointer size. 8649 8650 // Determine whether \p K is a truncation based on an induction variable that 8651 // can be optimized. 
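  // For example (illustrative): given "%iv = phi i64 ..." and
  // "%t = trunc i64 %iv to i32" feeding the loop body, it can be cheaper to
  // widen %t directly as an i32 induction (<i32 0, 1, 2, 3> plus a splat of
  // the step) than to widen the i64 induction and truncate every element;
  // CM.isOptimizableIVTruncate decides whether that holds for a given VF.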
8652 auto isOptimizableIVTruncate = 8653 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8654 return [=](ElementCount VF) -> bool { 8655 return CM.isOptimizableIVTruncate(K, VF); 8656 }; 8657 }; 8658 8659 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8660 isOptimizableIVTruncate(I), Range)) { 8661 8662 InductionDescriptor II = 8663 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8664 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8665 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8666 Start, nullptr, I); 8667 } 8668 return nullptr; 8669 } 8670 8671 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8672 ArrayRef<VPValue *> Operands, 8673 VPlanPtr &Plan) { 8674 // If all incoming values are equal, the incoming VPValue can be used directly 8675 // instead of creating a new VPBlendRecipe. 8676 VPValue *FirstIncoming = Operands[0]; 8677 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8678 return FirstIncoming == Inc; 8679 })) { 8680 return Operands[0]; 8681 } 8682 8683 // We know that all PHIs in non-header blocks are converted into selects, so 8684 // we don't have to worry about the insertion order and we can just use the 8685 // builder. At this point we generate the predication tree. There may be 8686 // duplications since this is a simple recursive scan, but future 8687 // optimizations will clean it up. 8688 SmallVector<VPValue *, 2> OperandsWithMask; 8689 unsigned NumIncoming = Phi->getNumIncomingValues(); 8690 8691 for (unsigned In = 0; In < NumIncoming; In++) { 8692 VPValue *EdgeMask = 8693 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8694 assert((EdgeMask || NumIncoming == 1) && 8695 "Multiple predecessors with one having a full mask"); 8696 OperandsWithMask.push_back(Operands[In]); 8697 if (EdgeMask) 8698 OperandsWithMask.push_back(EdgeMask); 8699 } 8700 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8701 } 8702 8703 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8704 ArrayRef<VPValue *> Operands, 8705 VFRange &Range) const { 8706 8707 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8708 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, 8709 Range); 8710 8711 if (IsPredicated) 8712 return nullptr; 8713 8714 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8715 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8716 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8717 ID == Intrinsic::pseudoprobe || 8718 ID == Intrinsic::experimental_noalias_scope_decl)) 8719 return nullptr; 8720 8721 auto willWiden = [&](ElementCount VF) -> bool { 8722 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8723 // The following case may be scalarized depending on the VF. 8724 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8725 // version of the instruction. 8726 // Is it beneficial to perform intrinsic call compared to lib call? 8727 bool NeedToScalarize = false; 8728 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8729 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8730 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8731 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 8732 "Either the intrinsic cost or vector call cost must be valid"); 8733 return UseVectorIntrinsic || !NeedToScalarize; 8734 }; 8735 8736 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8737 return nullptr; 8738 8739 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8740 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8741 } 8742 8743 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8744 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8745 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8746 // Instruction should be widened, unless it is scalar after vectorization, 8747 // scalarization is profitable or it is predicated. 8748 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8749 return CM.isScalarAfterVectorization(I, VF) || 8750 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); 8751 }; 8752 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8753 Range); 8754 } 8755 8756 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8757 ArrayRef<VPValue *> Operands) const { 8758 auto IsVectorizableOpcode = [](unsigned Opcode) { 8759 switch (Opcode) { 8760 case Instruction::Add: 8761 case Instruction::And: 8762 case Instruction::AShr: 8763 case Instruction::BitCast: 8764 case Instruction::FAdd: 8765 case Instruction::FCmp: 8766 case Instruction::FDiv: 8767 case Instruction::FMul: 8768 case Instruction::FNeg: 8769 case Instruction::FPExt: 8770 case Instruction::FPToSI: 8771 case Instruction::FPToUI: 8772 case Instruction::FPTrunc: 8773 case Instruction::FRem: 8774 case Instruction::FSub: 8775 case Instruction::ICmp: 8776 case Instruction::IntToPtr: 8777 case Instruction::LShr: 8778 case Instruction::Mul: 8779 case Instruction::Or: 8780 case Instruction::PtrToInt: 8781 case Instruction::SDiv: 8782 case Instruction::Select: 8783 case Instruction::SExt: 8784 case Instruction::Shl: 8785 case Instruction::SIToFP: 8786 case Instruction::SRem: 8787 case Instruction::Sub: 8788 case Instruction::Trunc: 8789 case Instruction::UDiv: 8790 case Instruction::UIToFP: 8791 case Instruction::URem: 8792 case Instruction::Xor: 8793 case Instruction::ZExt: 8794 return true; 8795 } 8796 return false; 8797 }; 8798 8799 if (!IsVectorizableOpcode(I->getOpcode())) 8800 return nullptr; 8801 8802 // Success: widen this instruction. 
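  // E.g. (sketch): for "%a = add i32 %x, %y" the recipe's code generation
  // emits one wide "add <VF x i32>" per unrolled part, with operands taken
  // from the widened values of %x and %y.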
8803 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8804 } 8805 8806 void VPRecipeBuilder::fixHeaderPhis() { 8807 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8808 for (VPWidenPHIRecipe *R : PhisToFix) { 8809 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8810 VPRecipeBase *IncR = 8811 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8812 R->addOperand(IncR->getVPSingleValue()); 8813 } 8814 } 8815 8816 VPBasicBlock *VPRecipeBuilder::handleReplication( 8817 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8818 VPlanPtr &Plan) { 8819 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8820 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8821 Range); 8822 8823 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8824 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); 8825 8826 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8827 IsUniform, IsPredicated); 8828 setRecipe(I, Recipe); 8829 Plan->addVPValue(I, Recipe); 8830 8831 // Find if I uses a predicated instruction. If so, it will use its scalar 8832 // value. Avoid hoisting the insert-element which packs the scalar value into 8833 // a vector value, as that happens iff all users use the vector value. 8834 for (VPValue *Op : Recipe->operands()) { 8835 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8836 if (!PredR) 8837 continue; 8838 auto *RepR = 8839 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8840 assert(RepR->isPredicated() && 8841 "expected Replicate recipe to be predicated"); 8842 RepR->setAlsoPack(false); 8843 } 8844 8845 // Finalize the recipe for Instr, first if it is not predicated. 8846 if (!IsPredicated) { 8847 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8848 VPBB->appendRecipe(Recipe); 8849 return VPBB; 8850 } 8851 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8852 assert(VPBB->getSuccessors().empty() && 8853 "VPBB has successors when handling predicated replication."); 8854 // Record predicated instructions for above packing optimizations. 8855 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8856 VPBlockUtils::insertBlockAfter(Region, VPBB); 8857 auto *RegSucc = new VPBasicBlock(); 8858 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8859 return RegSucc; 8860 } 8861 8862 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8863 VPRecipeBase *PredRecipe, 8864 VPlanPtr &Plan) { 8865 // Instructions marked for predication are replicated and placed under an 8866 // if-then construct to prevent side-effects. 8867 8868 // Generate recipes to compute the block mask for this region. 8869 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8870 8871 // Build the triangular if-then region. 8872 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8873 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8874 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8875 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8876 auto *PHIRecipe = Instr->getType()->isVoidTy() 8877 ? 
nullptr 8878 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8879 if (PHIRecipe) { 8880 Plan->removeVPValueFor(Instr); 8881 Plan->addVPValue(Instr, PHIRecipe); 8882 } 8883 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8884 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8885 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8886 8887 // Note: first set Entry as region entry and then connect successors starting 8888 // from it in order, to propagate the "parent" of each VPBasicBlock. 8889 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8890 VPBlockUtils::connectBlocks(Pred, Exit); 8891 8892 return Region; 8893 } 8894 8895 VPRecipeOrVPValueTy 8896 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8897 ArrayRef<VPValue *> Operands, 8898 VFRange &Range, VPlanPtr &Plan) { 8899 // First, check for specific widening recipes that deal with calls, memory 8900 // operations, inductions and Phi nodes. 8901 if (auto *CI = dyn_cast<CallInst>(Instr)) 8902 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8903 8904 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8905 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8906 8907 VPRecipeBase *Recipe; 8908 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8909 if (Phi->getParent() != OrigLoop->getHeader()) 8910 return tryToBlend(Phi, Operands, Plan); 8911 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8912 return toVPRecipeResult(Recipe); 8913 8914 if (Legal->isReductionVariable(Phi)) { 8915 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8916 assert(RdxDesc.getRecurrenceStartValue() == 8917 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8918 VPValue *StartV = Operands[0]; 8919 8920 auto *PhiRecipe = new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); 8921 PhisToFix.push_back(PhiRecipe); 8922 // Record the incoming value from the backedge, so we can add the incoming 8923 // value from the backedge after all recipes have been created. 8924 recordRecipeOf(cast<Instruction>( 8925 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8926 return toVPRecipeResult(PhiRecipe); 8927 } 8928 8929 return toVPRecipeResult(new VPWidenPHIRecipe(Phi)); 8930 } 8931 8932 if (isa<TruncInst>(Instr) && 8933 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8934 Range, *Plan))) 8935 return toVPRecipeResult(Recipe); 8936 8937 if (!shouldWiden(Instr, Range)) 8938 return nullptr; 8939 8940 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8941 return toVPRecipeResult(new VPWidenGEPRecipe( 8942 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8943 8944 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8945 bool InvariantCond = 8946 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8947 return toVPRecipeResult(new VPWidenSelectRecipe( 8948 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8949 } 8950 8951 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8952 } 8953 8954 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8955 ElementCount MaxVF) { 8956 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8957 8958 // Collect instructions from the original loop that will become trivially dead 8959 // in the vectorized loop. We don't need to vectorize these instructions. 
For 8960 // example, original induction update instructions can become dead because we 8961 separately emit induction "steps" when generating code for the new loop. 8962 // Similarly, we create a new latch condition when setting up the structure 8963 // of the new loop, so the old one can become dead. 8964 SmallPtrSet<Instruction *, 4> DeadInstructions; 8965 collectTriviallyDeadInstructions(DeadInstructions); 8966 8967 // Add assume instructions we need to drop to DeadInstructions, to prevent 8968 // them from being added to the VPlan. 8969 // TODO: We only need to drop assumes in blocks that get flattened. If the 8970 // control flow is preserved, we should keep them. 8971 auto &ConditionalAssumes = Legal->getConditionalAssumes(); 8972 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); 8973 8974 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 8975 // Dead instructions do not need sinking. Remove them from SinkAfter. 8976 for (Instruction *I : DeadInstructions) 8977 SinkAfter.erase(I); 8978 8979 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8980 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8981 VFRange SubRange = {VF, MaxVFPlusOne}; 8982 VPlans.push_back( 8983 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); 8984 VF = SubRange.End; 8985 } 8986 } 8987 8988 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 8989 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions, 8990 const DenseMap<Instruction *, Instruction *> &SinkAfter) { 8991 8992 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 8993 8994 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder); 8995 8996 // --------------------------------------------------------------------------- 8997 // Pre-construction: record ingredients whose recipes we'll need to further 8998 // process after constructing the initial VPlan. 8999 // --------------------------------------------------------------------------- 9000 9001 // Mark instructions we'll need to sink later and their targets as 9002 // ingredients whose recipe we'll need to record. 9003 for (auto &Entry : SinkAfter) { 9004 RecipeBuilder.recordRecipeOf(Entry.first); 9005 RecipeBuilder.recordRecipeOf(Entry.second); 9006 } 9007 for (auto &Reduction : CM.getInLoopReductionChains()) { 9008 PHINode *Phi = Reduction.first; 9009 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); 9010 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9011 9012 RecipeBuilder.recordRecipeOf(Phi); 9013 for (auto &R : ReductionOperations) { 9014 RecipeBuilder.recordRecipeOf(R); 9015 // For min/max reductions, where we have a pair of icmp/select, we also 9016 // need to record the ICmp recipe, so it can be removed later. 9017 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) 9018 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0))); 9019 } 9020 } 9021 9022 // For each interleave group which is relevant for this (possibly trimmed) 9023 // Range, add it to the set of groups to be later applied to the VPlan and add 9024 // placeholders for its members' Recipes which we'll be replacing with a 9025 // single VPInterleaveRecipe.
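  // For illustration, assuming a loop that reads "A[2*i]" and "A[2*i + 1]":
  // the two loads form an interleave group with factor 2. The group is
  // recorded below only for the sub-range of VFs whose widening decision is
  // CM_Interleave (the query is skipped for VF == 1), and each member's
  // recipe is recorded so that it can later be replaced by a single
  // VPInterleaveRecipe.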
9026 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 9027 auto applyIG = [IG, this](ElementCount VF) -> bool { 9028 return (VF.isVector() && // Query is illegal for VF == 1 9029 CM.getWideningDecision(IG->getInsertPos(), VF) == 9030 LoopVectorizationCostModel::CM_Interleave); 9031 }; 9032 if (!getDecisionAndClampRange(applyIG, Range)) 9033 continue; 9034 InterleaveGroups.insert(IG); 9035 for (unsigned i = 0; i < IG->getFactor(); i++) 9036 if (Instruction *Member = IG->getMember(i)) 9037 RecipeBuilder.recordRecipeOf(Member); 9038 }; 9039 9040 // --------------------------------------------------------------------------- 9041 // Build initial VPlan: Scan the body of the loop in a topological order to 9042 // visit each basic block after having visited its predecessor basic blocks. 9043 // --------------------------------------------------------------------------- 9044 9045 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 9046 auto Plan = std::make_unique<VPlan>(); 9047 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 9048 Plan->setEntry(VPBB); 9049 9050 // Scan the body of the loop in a topological order to visit each basic block 9051 // after having visited its predecessor basic blocks. 9052 LoopBlocksDFS DFS(OrigLoop); 9053 DFS.perform(LI); 9054 9055 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 9056 // Relevant instructions from basic block BB will be grouped into VPRecipe 9057 // ingredients and fill a new VPBasicBlock. 9058 unsigned VPBBsForBB = 0; 9059 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 9060 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 9061 VPBB = FirstVPBBForBB; 9062 Builder.setInsertPoint(VPBB); 9063 9064 // Introduce each ingredient into VPlan. 9065 // TODO: Model and preserve debug instrinsics in VPlan. 9066 for (Instruction &I : BB->instructionsWithoutDebug()) { 9067 Instruction *Instr = &I; 9068 9069 // First filter out irrelevant instructions, to ensure no recipes are 9070 // built for them. 9071 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr)) 9072 continue; 9073 9074 SmallVector<VPValue *, 4> Operands; 9075 auto *Phi = dyn_cast<PHINode>(Instr); 9076 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 9077 Operands.push_back(Plan->getOrAddVPValue( 9078 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 9079 } else { 9080 auto OpRange = Plan->mapToVPValues(Instr->operands()); 9081 Operands = {OpRange.begin(), OpRange.end()}; 9082 } 9083 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 9084 Instr, Operands, Range, Plan)) { 9085 // If Instr can be simplified to an existing VPValue, use it. 9086 if (RecipeOrValue.is<VPValue *>()) { 9087 auto *VPV = RecipeOrValue.get<VPValue *>(); 9088 Plan->addVPValue(Instr, VPV); 9089 // If the re-used value is a recipe, register the recipe for the 9090 // instruction, in case the recipe for Instr needs to be recorded. 9091 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef())) 9092 RecipeBuilder.setRecipe(Instr, R); 9093 continue; 9094 } 9095 // Otherwise, add the new recipe. 9096 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>(); 9097 for (auto *Def : Recipe->definedValues()) { 9098 auto *UV = Def->getUnderlyingValue(); 9099 Plan->addVPValue(UV, Def); 9100 } 9101 9102 RecipeBuilder.setRecipe(Instr, Recipe); 9103 VPBB->appendRecipe(Recipe); 9104 continue; 9105 } 9106 9107 // Otherwise, if all widening options failed, Instruction is to be 9108 // replicated. This may create a successor for VPBB. 
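      // For a predicated instruction this produces a replicate region, e.g.
      // (sketch) a conditional "sdiv" becomes the triangle
      //   pred.sdiv.entry (BranchOnMask) -> pred.sdiv.if (scalar sdiv replica)
      //                                  -> pred.sdiv.continue (phi / packing)
      // built by createReplicateRegion, and later recipes for this IR basic
      // block are appended to a fresh successor VPBasicBlock.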
9109 VPBasicBlock *NextVPBB = 9110 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan); 9111 if (NextVPBB != VPBB) { 9112 VPBB = NextVPBB; 9113 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 9114 : ""); 9115 } 9116 } 9117 } 9118 9119 RecipeBuilder.fixHeaderPhis(); 9120 9121 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9122 // may also be empty, such as the last one VPBB, reflecting original 9123 // basic-blocks with no recipes. 9124 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9125 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9126 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9127 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9128 delete PreEntry; 9129 9130 // --------------------------------------------------------------------------- 9131 // Transform initial VPlan: Apply previously taken decisions, in order, to 9132 // bring the VPlan to its final state. 9133 // --------------------------------------------------------------------------- 9134 9135 // Apply Sink-After legal constraints. 9136 for (auto &Entry : SinkAfter) { 9137 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9138 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9139 9140 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9141 auto *Region = 9142 dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9143 if (Region && Region->isReplicator()) 9144 return Region; 9145 return nullptr; 9146 }; 9147 9148 // If the target is in a replication region, make sure to move Sink to the 9149 // block after it, not into the replication region itself. 9150 if (auto *TargetRegion = GetReplicateRegion(Target)) { 9151 assert(TargetRegion->getNumSuccessors() == 1 && "Expected SESE region!"); 9152 assert(!GetReplicateRegion(Sink) && 9153 "cannot sink a region into another region yet"); 9154 VPBasicBlock *NextBlock = 9155 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9156 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9157 continue; 9158 } 9159 9160 auto *SinkRegion = GetReplicateRegion(Sink); 9161 // Unless the sink source is in a replicate region, sink the recipe 9162 // directly. 9163 if (!SinkRegion) { 9164 Sink->moveAfter(Target); 9165 continue; 9166 } 9167 9168 // If the sink source is in a replicate region, we need to move the whole 9169 // replicate region, which should only contain a single recipe in the main 9170 // block. 9171 assert(Sink->getParent()->size() == 1 && 9172 "parent must be a replicator with a single recipe"); 9173 auto *SplitBlock = 9174 Target->getParent()->splitAt(std::next(Target->getIterator())); 9175 9176 auto *Pred = SinkRegion->getSinglePredecessor(); 9177 auto *Succ = SinkRegion->getSingleSuccessor(); 9178 VPBlockUtils::disconnectBlocks(Pred, SinkRegion); 9179 VPBlockUtils::disconnectBlocks(SinkRegion, Succ); 9180 VPBlockUtils::connectBlocks(Pred, Succ); 9181 9182 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9183 9184 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9185 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9186 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9187 if (VPBB == SplitPred) 9188 VPBB = SplitBlock; 9189 } 9190 9191 // Interleave memory: for each Interleave Group we marked earlier as relevant 9192 // for this VPlan, replace the Recipes widening its memory instructions with a 9193 // single VPInterleaveRecipe at its insertion point. 
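  // Sketch of the rewrite for a factor-2 group of loads L0 and L1 whose
  // insertion position is L0 (names illustrative):
  //   before: WIDEN load L0 ; WIDEN load L1
  //   after:  INTERLEAVE-GROUP {L0, L1} defining two VPValues, with all users
  //           of the original widened loads rewired to those VPValues and the
  //           per-member recipes erased.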
9194 for (auto IG : InterleaveGroups) { 9195 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9196 RecipeBuilder.getRecipe(IG->getInsertPos())); 9197 SmallVector<VPValue *, 4> StoredValues; 9198 for (unsigned i = 0; i < IG->getFactor(); ++i) 9199 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 9200 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 9201 9202 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9203 Recipe->getMask()); 9204 VPIG->insertBefore(Recipe); 9205 unsigned J = 0; 9206 for (unsigned i = 0; i < IG->getFactor(); ++i) 9207 if (Instruction *Member = IG->getMember(i)) { 9208 if (!Member->getType()->isVoidTy()) { 9209 VPValue *OriginalV = Plan->getVPValue(Member); 9210 Plan->removeVPValueFor(Member); 9211 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9212 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9213 J++; 9214 } 9215 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9216 } 9217 } 9218 9219 // Adjust the recipes for any inloop reductions. 9220 if (Range.Start.isVector()) 9221 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 9222 9223 // Finally, if tail is folded by masking, introduce selects between the phi 9224 // and the live-out instruction of each reduction, at the end of the latch. 9225 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 9226 Builder.setInsertPoint(VPBB); 9227 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9228 for (auto &Reduction : Legal->getReductionVars()) { 9229 if (CM.isInLoopReduction(Reduction.first)) 9230 continue; 9231 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 9232 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 9233 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 9234 } 9235 } 9236 9237 std::string PlanName; 9238 raw_string_ostream RSO(PlanName); 9239 ElementCount VF = Range.Start; 9240 Plan->addVF(VF); 9241 RSO << "Initial VPlan for VF={" << VF; 9242 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9243 Plan->addVF(VF); 9244 RSO << "," << VF; 9245 } 9246 RSO << "},UF>=1"; 9247 RSO.flush(); 9248 Plan->setName(PlanName); 9249 9250 return Plan; 9251 } 9252 9253 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9254 // Outer loop handling: They may require CFG and instruction level 9255 // transformations before even evaluating whether vectorization is profitable. 9256 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9257 // the vectorization pipeline. 9258 assert(!OrigLoop->isInnermost()); 9259 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9260 9261 // Create new empty VPlan 9262 auto Plan = std::make_unique<VPlan>(); 9263 9264 // Build hierarchical CFG 9265 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9266 HCFGBuilder.buildHierarchicalCFG(); 9267 9268 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9269 VF *= 2) 9270 Plan->addVF(VF); 9271 9272 if (EnableVPlanPredication) { 9273 VPlanPredicator VPP(*Plan); 9274 VPP.predicate(); 9275 9276 // Avoid running transformation to recipes until masked code generation in 9277 // VPlan-native path is in place. 9278 return Plan; 9279 } 9280 9281 SmallPtrSet<Instruction *, 1> DeadInstructions; 9282 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9283 Legal->getInductionVars(), 9284 DeadInstructions, *PSE.getSE()); 9285 return Plan; 9286 } 9287 9288 // Adjust the recipes for any inloop reductions. 
The chain of instructions 9289 // leading from the loop exit instr to the phi need to be converted to 9290 // reductions, with one operand being vector and the other being the scalar 9291 // reduction chain. 9292 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( 9293 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) { 9294 for (auto &Reduction : CM.getInLoopReductionChains()) { 9295 PHINode *Phi = Reduction.first; 9296 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 9297 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; 9298 9299 // ReductionOperations are orders top-down from the phi's use to the 9300 // LoopExitValue. We keep a track of the previous item (the Chain) to tell 9301 // which of the two operands will remain scalar and which will be reduced. 9302 // For minmax the chain will be the select instructions. 9303 Instruction *Chain = Phi; 9304 for (Instruction *R : ReductionOperations) { 9305 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R); 9306 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9307 9308 VPValue *ChainOp = Plan->getVPValue(Chain); 9309 unsigned FirstOpId; 9310 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9311 assert(isa<VPWidenSelectRecipe>(WidenRecipe) && 9312 "Expected to replace a VPWidenSelectSC"); 9313 FirstOpId = 1; 9314 } else { 9315 assert(isa<VPWidenRecipe>(WidenRecipe) && 9316 "Expected to replace a VPWidenSC"); 9317 FirstOpId = 0; 9318 } 9319 unsigned VecOpId = 9320 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId; 9321 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); 9322 9323 auto *CondOp = CM.foldTailByMasking() 9324 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan) 9325 : nullptr; 9326 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9327 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 9328 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9329 Plan->removeVPValueFor(R); 9330 Plan->addVPValue(R, RedRecipe); 9331 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 9332 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe); 9333 WidenRecipe->eraseFromParent(); 9334 9335 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9336 VPRecipeBase *CompareRecipe = 9337 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 9338 assert(isa<VPWidenRecipe>(CompareRecipe) && 9339 "Expected to replace a VPWidenSC"); 9340 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 9341 "Expected no remaining users"); 9342 CompareRecipe->eraseFromParent(); 9343 } 9344 Chain = R; 9345 } 9346 } 9347 } 9348 9349 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9350 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9351 VPSlotTracker &SlotTracker) const { 9352 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9353 IG->getInsertPos()->printAsOperand(O, false); 9354 O << ", "; 9355 getAddr()->printAsOperand(O, SlotTracker); 9356 VPValue *Mask = getMask(); 9357 if (Mask) { 9358 O << ", "; 9359 Mask->printAsOperand(O, SlotTracker); 9360 } 9361 for (unsigned i = 0; i < IG->getFactor(); ++i) 9362 if (Instruction *I = IG->getMember(i)) 9363 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; 9364 } 9365 #endif 9366 9367 void VPWidenCallRecipe::execute(VPTransformState &State) { 9368 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 9369 *this, State); 9370 } 9371 9372 void VPWidenSelectRecipe::execute(VPTransformState &State) { 9373 
State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9374 this, *this, InvariantCond, State); 9375 } 9376 9377 void VPWidenRecipe::execute(VPTransformState &State) { 9378 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9379 } 9380 9381 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9382 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9383 *this, State.UF, State.VF, IsPtrLoopInvariant, 9384 IsIndexLoopInvariant, State); 9385 } 9386 9387 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9388 assert(!State.Instance && "Int or FP induction being replicated."); 9389 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9390 getTruncInst(), getVPValue(0), 9391 getCastValue(), State); 9392 } 9393 9394 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9395 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9396 this, State); 9397 } 9398 9399 void VPBlendRecipe::execute(VPTransformState &State) { 9400 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9401 // We know that all PHIs in non-header blocks are converted into 9402 // selects, so we don't have to worry about the insertion order and we 9403 // can just use the builder. 9404 // At this point we generate the predication tree. There may be 9405 // duplications since this is a simple recursive scan, but future 9406 // optimizations will clean it up. 9407 9408 unsigned NumIncoming = getNumIncomingValues(); 9409 9410 // Generate a sequence of selects of the form: 9411 // SELECT(Mask3, In3, 9412 // SELECT(Mask2, In2, 9413 // SELECT(Mask1, In1, 9414 // In0))) 9415 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9416 // are essentially undef are taken from In0. 9417 InnerLoopVectorizer::VectorParts Entry(State.UF); 9418 for (unsigned In = 0; In < NumIncoming; ++In) { 9419 for (unsigned Part = 0; Part < State.UF; ++Part) { 9420 // We might have single edge PHIs (blocks) - use an identity 9421 // 'select' for the first PHI operand. 9422 Value *In0 = State.get(getIncomingValue(In), Part); 9423 if (In == 0) 9424 Entry[Part] = In0; // Initialize with the first incoming value. 9425 else { 9426 // Select between the current value and the previous incoming edge 9427 // based on the incoming mask. 
9428 Value *Cond = State.get(getMask(In), Part); 9429 Entry[Part] = 9430 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9431 } 9432 } 9433 } 9434 for (unsigned Part = 0; Part < State.UF; ++Part) 9435 State.set(this, Entry[Part], Part); 9436 } 9437 9438 void VPInterleaveRecipe::execute(VPTransformState &State) { 9439 assert(!State.Instance && "Interleave group being replicated."); 9440 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9441 getStoredValues(), getMask()); 9442 } 9443 9444 void VPReductionRecipe::execute(VPTransformState &State) { 9445 assert(!State.Instance && "Reduction being replicated."); 9446 Value *PrevInChain = State.get(getChainOp(), 0); 9447 for (unsigned Part = 0; Part < State.UF; ++Part) { 9448 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9449 bool IsOrdered = useOrderedReductions(*RdxDesc); 9450 Value *NewVecOp = State.get(getVecOp(), Part); 9451 if (VPValue *Cond = getCondOp()) { 9452 Value *NewCond = State.get(Cond, Part); 9453 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9454 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9455 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags()); 9456 Constant *IdenVec = 9457 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9458 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9459 NewVecOp = Select; 9460 } 9461 Value *NewRed; 9462 Value *NextInChain; 9463 if (IsOrdered) { 9464 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp, 9465 PrevInChain); 9466 PrevInChain = NewRed; 9467 } else { 9468 PrevInChain = State.get(getChainOp(), Part); 9469 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9470 } 9471 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9472 NextInChain = 9473 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9474 NewRed, PrevInChain); 9475 } else if (IsOrdered) 9476 NextInChain = NewRed; 9477 else { 9478 NextInChain = State.Builder.CreateBinOp( 9479 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9480 PrevInChain); 9481 } 9482 State.set(this, NextInChain, Part); 9483 } 9484 } 9485 9486 void VPReplicateRecipe::execute(VPTransformState &State) { 9487 if (State.Instance) { // Generate a single instance. 9488 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9489 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9490 *State.Instance, IsPredicated, State); 9491 // Insert scalar instance packing it into a vector. 9492 if (AlsoPack && State.VF.isVector()) { 9493 // If we're constructing lane 0, initialize to start from poison. 9494 if (State.Instance->Lane.isFirstLane()) { 9495 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9496 Value *Poison = PoisonValue::get( 9497 VectorType::get(getUnderlyingValue()->getType(), State.VF)); 9498 State.set(this, Poison, State.Instance->Part); 9499 } 9500 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State); 9501 } 9502 return; 9503 } 9504 9505 // Generate scalar instances for all VF lanes of all UF parts, unless the 9506 // instruction is uniform inwhich case generate only the first lane for each 9507 // of the UF parts. 9508 unsigned EndLane = IsUniform ? 
1 : State.VF.getKnownMinValue(); 9509 assert((!State.VF.isScalable() || IsUniform) && 9510 "Can't scalarize a scalable vector"); 9511 for (unsigned Part = 0; Part < State.UF; ++Part) 9512 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9513 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9514 VPIteration(Part, Lane), IsPredicated, 9515 State); 9516 } 9517 9518 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9519 assert(State.Instance && "Branch on Mask works only on single instance."); 9520 9521 unsigned Part = State.Instance->Part; 9522 unsigned Lane = State.Instance->Lane.getKnownLane(); 9523 9524 Value *ConditionBit = nullptr; 9525 VPValue *BlockInMask = getMask(); 9526 if (BlockInMask) { 9527 ConditionBit = State.get(BlockInMask, Part); 9528 if (ConditionBit->getType()->isVectorTy()) 9529 ConditionBit = State.Builder.CreateExtractElement( 9530 ConditionBit, State.Builder.getInt32(Lane)); 9531 } else // Block in mask is all-one. 9532 ConditionBit = State.Builder.getTrue(); 9533 9534 // Replace the temporary unreachable terminator with a new conditional branch, 9535 // whose two destinations will be set later when they are created. 9536 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9537 assert(isa<UnreachableInst>(CurrentTerminator) && 9538 "Expected to replace unreachable terminator with conditional branch."); 9539 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9540 CondBr->setSuccessor(0, nullptr); 9541 ReplaceInstWithInst(CurrentTerminator, CondBr); 9542 } 9543 9544 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9545 assert(State.Instance && "Predicated instruction PHI works per instance."); 9546 Instruction *ScalarPredInst = 9547 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9548 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9549 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9550 assert(PredicatingBB && "Predicated block has no single predecessor."); 9551 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9552 "operand must be VPReplicateRecipe"); 9553 9554 // By current pack/unpack logic we need to generate only a single phi node: if 9555 // a vector value for the predicated instruction exists at this point it means 9556 // the instruction has vector users only, and a phi for the vector value is 9557 // needed. In this case the recipe of the predicated instruction is marked to 9558 // also do that packing, thereby "hoisting" the insert-element sequence. 9559 // Otherwise, a phi node for the scalar value is needed. 9560 unsigned Part = State.Instance->Part; 9561 if (State.hasVectorValue(getOperand(0), Part)) { 9562 Value *VectorValue = State.get(getOperand(0), Part); 9563 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9564 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9565 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9566 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9567 if (State.hasVectorValue(this, Part)) 9568 State.reset(this, VPhi, Part); 9569 else 9570 State.set(this, VPhi, Part); 9571 // NOTE: Currently we need to update the value of the operand, so the next 9572 // predicated iteration inserts its generated value in the correct vector. 
9573 State.reset(getOperand(0), VPhi, Part); 9574 } else { 9575 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9576 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9577 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9578 PredicatingBB); 9579 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9580 if (State.hasScalarValue(this, *State.Instance)) 9581 State.reset(this, Phi, *State.Instance); 9582 else 9583 State.set(this, Phi, *State.Instance); 9584 // NOTE: Currently we need to update the value of the operand, so the next 9585 // predicated iteration inserts its generated value in the correct vector. 9586 State.reset(getOperand(0), Phi, *State.Instance); 9587 } 9588 } 9589 9590 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9591 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9592 State.ILV->vectorizeMemoryInstruction( 9593 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9594 StoredValue, getMask()); 9595 } 9596 9597 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9598 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9599 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9600 // for predication. 9601 static ScalarEpilogueLowering getScalarEpilogueLowering( 9602 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9603 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9604 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9605 LoopVectorizationLegality &LVL) { 9606 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9607 // don't look at hints or options, and don't request a scalar epilogue. 9608 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9609 // LoopAccessInfo (due to code dependency and not being able to reliably get 9610 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9611 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9612 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9613 // back to the old way and vectorize with versioning when forced. See D81345.) 9614 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9615 PGSOQueryType::IRPass) && 9616 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9617 return CM_ScalarEpilogueNotAllowedOptSize; 9618 9619 // 2) If set, obey the directives 9620 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9621 switch (PreferPredicateOverEpilogue) { 9622 case PreferPredicateTy::ScalarEpilogue: 9623 return CM_ScalarEpilogueAllowed; 9624 case PreferPredicateTy::PredicateElseScalarEpilogue: 9625 return CM_ScalarEpilogueNotNeededUsePredicate; 9626 case PreferPredicateTy::PredicateOrDontVectorize: 9627 return CM_ScalarEpilogueNotAllowedUsePredicate; 9628 }; 9629 } 9630 9631 // 3) If set, obey the hints 9632 switch (Hints.getPredicate()) { 9633 case LoopVectorizeHints::FK_Enabled: 9634 return CM_ScalarEpilogueNotNeededUsePredicate; 9635 case LoopVectorizeHints::FK_Disabled: 9636 return CM_ScalarEpilogueAllowed; 9637 }; 9638 9639 // 4) if the TTI hook indicates this is profitable, request predication. 
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def, return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));

  // Set the insert point after the last scalarized instruction. This
  // ensures the insertelement sequence will directly follow the scalar
  // definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP = std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from a poison vector.
    assert(!VF.isScalable() && "VF is assumed to be non-scalable.");
    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Undef, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
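// (As a reference point: this path is only reachable when the VPlan-native
// path is enabled, see EnableVPlanNativePath; outer loops are dispatched to it
// from LoopVectorizePass::processLoop() below.)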
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point, there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
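  // A minimal sketch of the kind of chain this walk is meant to catch, using
  // hypothetical value names (illustrative IR only):
  //   %w = fpext float %x to double
  //   %m = fmul double %w, %y
  //   %t = fptrunc double %m to float
  //   store float %t, float* %p
  // Walking up the operands from the float store reaches the fpext and
  // triggers the mixed-precision remark below.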
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
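  // (That is, minsize/optsize function attributes or profile-guided size
  // optimization for cold code; the precedence between these inputs is
  // documented above getScalarEpilogueLowering().)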
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check does not seem correct: what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!Requirements.canVectorizeFPMath(Hints)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
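  // (As a hypothetical example: accesses such as A[2*i] and A[2*i+1] made in
  // the same iteration form an interleave group with factor 2, which can later
  // be lowered to a single wide load or store plus shuffles.)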
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
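    // Report both missed opportunities (vectorization and interleaving) as
    // separate remarks so each decision is explained.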
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out not to
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
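        // Note that EPI.MainLoopVF/MainLoopUF are repointed at the epilogue
        // factors below, before the second executePlan() call.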
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
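  // (Loop-simplify form gives each loop a preheader, a single backedge and
  // dedicated exit blocks, which the checks performed below rely on.)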
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}