1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 10 // and generates target-independent LLVM-IR. 11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs 12 // of instructions in order to estimate the profitability of vectorization. 13 // 14 // The loop vectorizer combines consecutive loop iterations into a single 15 // 'wide' iteration. After this transformation the index is incremented 16 // by the SIMD vector width, and not by one. 17 // 18 // This pass has three parts: 19 // 1. The main loop pass that drives the different parts. 20 // 2. LoopVectorizationLegality - A unit that checks for the legality 21 // of the vectorization. 22 // 3. InnerLoopVectorizer - A unit that performs the actual 23 // widening of instructions. 24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 25 // of vectorization. It decides on the optimal vector width, which 26 // can be one, if vectorization is not profitable. 27 // 28 // There is a development effort going on to migrate loop vectorizer to the 29 // VPlan infrastructure and to introduce outer loop vectorization support (see 30 // docs/Proposal/VectorizationPlan.rst and 31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this 32 // purpose, we temporarily introduced the VPlan-native vectorization path: an 33 // alternative vectorization path that is natively implemented on top of the 34 // VPlan infrastructure. See EnableVPlanNativePath for enabling. 35 // 36 //===----------------------------------------------------------------------===// 37 // 38 // The reduction-variable vectorization is based on the paper: 39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40 // 41 // Variable uniformity checks are inspired by: 42 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 43 // 44 // The interleaved access vectorization is based on the paper: 45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46 // Data for SIMD 47 // 48 // Other ideas/concepts are from: 49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50 // 51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52 // Vectorizing Compilers. 
53 // 54 //===----------------------------------------------------------------------===// 55 56 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 57 #include "LoopVectorizationPlanner.h" 58 #include "VPRecipeBuilder.h" 59 #include "VPlan.h" 60 #include "VPlanHCFGBuilder.h" 61 #include "VPlanPredicator.h" 62 #include "VPlanTransforms.h" 63 #include "llvm/ADT/APInt.h" 64 #include "llvm/ADT/ArrayRef.h" 65 #include "llvm/ADT/DenseMap.h" 66 #include "llvm/ADT/DenseMapInfo.h" 67 #include "llvm/ADT/Hashing.h" 68 #include "llvm/ADT/MapVector.h" 69 #include "llvm/ADT/None.h" 70 #include "llvm/ADT/Optional.h" 71 #include "llvm/ADT/STLExtras.h" 72 #include "llvm/ADT/SmallPtrSet.h" 73 #include "llvm/ADT/SmallVector.h" 74 #include "llvm/ADT/Statistic.h" 75 #include "llvm/ADT/StringRef.h" 76 #include "llvm/ADT/Twine.h" 77 #include "llvm/ADT/iterator_range.h" 78 #include "llvm/Analysis/AssumptionCache.h" 79 #include "llvm/Analysis/BasicAliasAnalysis.h" 80 #include "llvm/Analysis/BlockFrequencyInfo.h" 81 #include "llvm/Analysis/CFG.h" 82 #include "llvm/Analysis/CodeMetrics.h" 83 #include "llvm/Analysis/DemandedBits.h" 84 #include "llvm/Analysis/GlobalsModRef.h" 85 #include "llvm/Analysis/LoopAccessAnalysis.h" 86 #include "llvm/Analysis/LoopAnalysisManager.h" 87 #include "llvm/Analysis/LoopInfo.h" 88 #include "llvm/Analysis/LoopIterator.h" 89 #include "llvm/Analysis/MemorySSA.h" 90 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 91 #include "llvm/Analysis/ProfileSummaryInfo.h" 92 #include "llvm/Analysis/ScalarEvolution.h" 93 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 94 #include "llvm/Analysis/TargetLibraryInfo.h" 95 #include "llvm/Analysis/TargetTransformInfo.h" 96 #include "llvm/Analysis/VectorUtils.h" 97 #include "llvm/IR/Attributes.h" 98 #include "llvm/IR/BasicBlock.h" 99 #include "llvm/IR/CFG.h" 100 #include "llvm/IR/Constant.h" 101 #include "llvm/IR/Constants.h" 102 #include "llvm/IR/DataLayout.h" 103 #include "llvm/IR/DebugInfoMetadata.h" 104 #include "llvm/IR/DebugLoc.h" 105 #include "llvm/IR/DerivedTypes.h" 106 #include "llvm/IR/DiagnosticInfo.h" 107 #include "llvm/IR/Dominators.h" 108 #include "llvm/IR/Function.h" 109 #include "llvm/IR/IRBuilder.h" 110 #include "llvm/IR/InstrTypes.h" 111 #include "llvm/IR/Instruction.h" 112 #include "llvm/IR/Instructions.h" 113 #include "llvm/IR/IntrinsicInst.h" 114 #include "llvm/IR/Intrinsics.h" 115 #include "llvm/IR/LLVMContext.h" 116 #include "llvm/IR/Metadata.h" 117 #include "llvm/IR/Module.h" 118 #include "llvm/IR/Operator.h" 119 #include "llvm/IR/PatternMatch.h" 120 #include "llvm/IR/Type.h" 121 #include "llvm/IR/Use.h" 122 #include "llvm/IR/User.h" 123 #include "llvm/IR/Value.h" 124 #include "llvm/IR/ValueHandle.h" 125 #include "llvm/IR/Verifier.h" 126 #include "llvm/InitializePasses.h" 127 #include "llvm/Pass.h" 128 #include "llvm/Support/Casting.h" 129 #include "llvm/Support/CommandLine.h" 130 #include "llvm/Support/Compiler.h" 131 #include "llvm/Support/Debug.h" 132 #include "llvm/Support/ErrorHandling.h" 133 #include "llvm/Support/InstructionCost.h" 134 #include "llvm/Support/MathExtras.h" 135 #include "llvm/Support/raw_ostream.h" 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 137 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 138 #include "llvm/Transforms/Utils/LoopSimplify.h" 139 #include "llvm/Transforms/Utils/LoopUtils.h" 140 #include "llvm/Transforms/Utils/LoopVersioning.h" 141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 142 #include "llvm/Transforms/Utils/SizeOpts.h" 143 #include 
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 202 "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 203 cl::desc("The maximum allowed number of runtime memory checks with a " 204 "vectorize(enable) pragma.")); 205 206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 207 // that predication is preferred, and this lists all options. I.e., the 208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 209 // and predicate the instructions accordingly. 
If tail-folding fails, there are 210 // different fallback strategies depending on these values: 211 namespace PreferPredicateTy { 212 enum Option { 213 ScalarEpilogue = 0, 214 PredicateElseScalarEpilogue, 215 PredicateOrDontVectorize 216 }; 217 } // namespace PreferPredicateTy 218 219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( 220 "prefer-predicate-over-epilogue", 221 cl::init(PreferPredicateTy::ScalarEpilogue), 222 cl::Hidden, 223 cl::desc("Tail-folding and predication preferences over creating a scalar " 224 "epilogue loop."), 225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, 226 "scalar-epilogue", 227 "Don't tail-predicate loops, create scalar epilogue"), 228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, 229 "predicate-else-scalar-epilogue", 230 "prefer tail-folding, create scalar epilogue if tail " 231 "folding fails."), 232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, 233 "predicate-dont-vectorize", 234 "prefers tail-folding, don't attempt vectorization if " 235 "tail-folding fails."))); 236 237 static cl::opt<bool> MaximizeBandwidth( 238 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 239 cl::desc("Maximize bandwidth when selecting vectorization factor which " 240 "will be determined by the smallest type in loop.")); 241 242 static cl::opt<bool> EnableInterleavedMemAccesses( 243 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 244 cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 245 246 /// An interleave-group may need masking if it resides in a block that needs 247 /// predication, or in order to mask away gaps. 248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses( 249 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, 250 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); 251 252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold( 253 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, 254 cl::desc("We don't interleave loops with a estimated constant trip count " 255 "below this number")); 256 257 static cl::opt<unsigned> ForceTargetNumScalarRegs( 258 "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 259 cl::desc("A flag that overrides the target's number of scalar registers.")); 260 261 static cl::opt<unsigned> ForceTargetNumVectorRegs( 262 "force-target-num-vector-regs", cl::init(0), cl::Hidden, 263 cl::desc("A flag that overrides the target's number of vector registers.")); 264 265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 266 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 267 cl::desc("A flag that overrides the target's max interleave factor for " 268 "scalar loops.")); 269 270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 271 "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 272 cl::desc("A flag that overrides the target's max interleave factor for " 273 "vectorized loops.")); 274 275 static cl::opt<unsigned> ForceTargetInstructionCost( 276 "force-target-instruction-cost", cl::init(0), cl::Hidden, 277 cl::desc("A flag that overrides the target's expected cost for " 278 "an instruction to a single constant value. 
Mostly " 279 "useful for getting consistent testing.")); 280 281 static cl::opt<bool> ForceTargetSupportsScalableVectors( 282 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 283 cl::desc( 284 "Pretend that scalable vectors are supported, even if the target does " 285 "not support them. This flag should only be used for testing.")); 286 287 static cl::opt<unsigned> SmallLoopCost( 288 "small-loop-cost", cl::init(20), cl::Hidden, 289 cl::desc( 290 "The cost of a loop that is considered 'small' by the interleaver.")); 291 292 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 293 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 294 cl::desc("Enable the use of the block frequency analysis to access PGO " 295 "heuristics minimizing code growth in cold regions and being more " 296 "aggressive in hot regions.")); 297 298 // Runtime interleave loops for load/store throughput. 299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 300 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 301 cl::desc( 302 "Enable runtime interleaving until load/store ports are saturated")); 303 304 /// Interleave small loops with scalar reductions. 305 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 306 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 307 cl::desc("Enable interleaving for loops with small iteration counts that " 308 "contain scalar reductions to expose ILP.")); 309 310 /// The number of stores in a loop that are allowed to need predication. 311 static cl::opt<unsigned> NumberOfStoresToPredicate( 312 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 313 cl::desc("Max number of stores to be predicated behind an if.")); 314 315 static cl::opt<bool> EnableIndVarRegisterHeur( 316 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 317 cl::desc("Count the induction variable only once when interleaving")); 318 319 static cl::opt<bool> EnableCondStoresVectorization( 320 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 321 cl::desc("Enable if predication of stores during vectorization.")); 322 323 static cl::opt<unsigned> MaxNestedScalarReductionIC( 324 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 325 cl::desc("The maximum interleave count to use when interleaving a scalar " 326 "reduction in a nested loop.")); 327 328 static cl::opt<bool> 329 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 330 cl::Hidden, 331 cl::desc("Prefer in-loop vector reductions, " 332 "overriding the targets preference.")); 333 334 cl::opt<bool> EnableStrictReductions( 335 "enable-strict-reductions", cl::init(false), cl::Hidden, 336 cl::desc("Enable the vectorisation of loops with in-order (strict) " 337 "FP reductions")); 338 339 static cl::opt<bool> PreferPredicatedReductionSelect( 340 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 341 cl::desc( 342 "Prefer predicating a reduction operation over an after loop select.")); 343 344 cl::opt<bool> EnableVPlanNativePath( 345 "enable-vplan-native-path", cl::init(false), cl::Hidden, 346 cl::desc("Enable VPlan-native vectorization path with " 347 "support for outer loop vectorization.")); 348 349 // FIXME: Remove this switch once we have divergence analysis. Currently we 350 // assume divergent non-backedge branches when this switch is true. 
351 cl::opt<bool> EnableVPlanPredication( 352 "enable-vplan-predication", cl::init(false), cl::Hidden, 353 cl::desc("Enable VPlan-native vectorization path predicator with " 354 "support for outer loop vectorization.")); 355 356 // This flag enables the stress testing of the VPlan H-CFG construction in the 357 // VPlan-native vectorization path. It must be used in conjuction with 358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the 359 // verification of the H-CFGs built. 360 static cl::opt<bool> VPlanBuildStressTest( 361 "vplan-build-stress-test", cl::init(false), cl::Hidden, 362 cl::desc( 363 "Build VPlan for every supported loop nest in the function and bail " 364 "out right after the build (stress test the VPlan H-CFG construction " 365 "in the VPlan-native vectorization path).")); 366 367 cl::opt<bool> llvm::EnableLoopInterleaving( 368 "interleave-loops", cl::init(true), cl::Hidden, 369 cl::desc("Enable loop interleaving in Loop vectorization passes")); 370 cl::opt<bool> llvm::EnableLoopVectorization( 371 "vectorize-loops", cl::init(true), cl::Hidden, 372 cl::desc("Run the Loop vectorization passes")); 373 374 cl::opt<bool> PrintVPlansInDotFormat( 375 "vplan-print-in-dot-format", cl::init(false), cl::Hidden, 376 cl::desc("Use dot format instead of plain text when dumping VPlans")); 377 378 /// A helper function that returns the type of loaded or stored value. 379 static Type *getMemInstValueType(Value *I) { 380 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 381 "Expected Load or Store instruction"); 382 if (auto *LI = dyn_cast<LoadInst>(I)) 383 return LI->getType(); 384 return cast<StoreInst>(I)->getValueOperand()->getType(); 385 } 386 387 /// A helper function that returns true if the given type is irregular. The 388 /// type is irregular if its allocated size doesn't equal the store size of an 389 /// element of the corresponding vector type. 390 static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 391 // Determine if an array of N elements of type Ty is "bitcast compatible" 392 // with a <N x Ty> vector. 393 // This is only true if there is no padding between the array elements. 394 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 395 } 396 397 /// A helper function that returns the reciprocal of the block probability of 398 /// predicated blocks. If we return X, we are assuming the predicated block 399 /// will execute once for every X iterations of the loop header. 400 /// 401 /// TODO: We should use actual block probability here, if available. Currently, 402 /// we always assume predicated blocks have a 50% chance of executing. 403 static unsigned getReciprocalPredBlockProb() { return 2; } 404 405 /// A helper function that returns an integer or floating-point constant with 406 /// value C. 407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 408 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 409 : ConstantFP::get(Ty, C); 410 } 411 412 /// Returns "best known" trip count for the specified loop \p L as defined by 413 /// the following procedure: 414 /// 1) Returns exact trip count if it is known. 415 /// 2) Returns expected trip count according to profile data if any. 416 /// 3) Returns upper bound estimate if it is known. 417 /// 4) Returns None if all of the above failed. 418 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { 419 // Check if exact trip count is known. 
420 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 421 return ExpectedTC; 422 423 // Check if there is an expected trip count available from profile data. 424 if (LoopVectorizeWithBlockFrequency) 425 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 426 return EstimatedTC; 427 428 // Check if upper bound estimate is known. 429 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 430 return ExpectedTC; 431 432 return None; 433 } 434 435 // Forward declare GeneratedRTChecks. 436 class GeneratedRTChecks; 437 438 namespace llvm { 439 440 /// InnerLoopVectorizer vectorizes loops which contain only one basic 441 /// block to a specified vectorization factor (VF). 442 /// This class performs the widening of scalars into vectors, or multiple 443 /// scalars. This class also implements the following features: 444 /// * It inserts an epilogue loop for handling loops that don't have iteration 445 /// counts that are known to be a multiple of the vectorization factor. 446 /// * It handles the code generation for reduction variables. 447 /// * Scalarization (implementation using scalars) of un-vectorizable 448 /// instructions. 449 /// InnerLoopVectorizer does not perform any vectorization-legality 450 /// checks, and relies on the caller to check for the different legality 451 /// aspects. The InnerLoopVectorizer relies on the 452 /// LoopVectorizationLegality class to provide information about the induction 453 /// and reduction variables that were found to a given vectorization factor. 454 class InnerLoopVectorizer { 455 public: 456 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 457 LoopInfo *LI, DominatorTree *DT, 458 const TargetLibraryInfo *TLI, 459 const TargetTransformInfo *TTI, AssumptionCache *AC, 460 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 461 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 462 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 463 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 464 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 465 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 466 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 467 PSI(PSI), RTChecks(RTChecks) { 468 // Query this against the original loop and save it here because the profile 469 // of the original loop header may change as the transformation happens. 470 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 471 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 472 } 473 474 virtual ~InnerLoopVectorizer() = default; 475 476 /// Create a new empty loop that will contain vectorized instructions later 477 /// on, while the old loop will be used as the scalar remainder. Control flow 478 /// is generated around the vectorized (and scalar epilogue) loops consisting 479 /// of various checks and bypasses. Return the pre-header block of the new 480 /// loop. 481 /// In the case of epilogue vectorization, this function is overriden to 482 /// handle the more complex control flow around the loops. 483 virtual BasicBlock *createVectorizedLoopSkeleton(); 484 485 /// Widen a single instruction within the innermost loop. 486 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, 487 VPTransformState &State); 488 489 /// Widen a single call instruction within the innermost loop. 490 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, 491 VPTransformState &State); 492 493 /// Widen a single select instruction within the innermost loop. 
494 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands, 495 bool InvariantCond, VPTransformState &State); 496 497 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 498 void fixVectorizedLoop(VPTransformState &State); 499 500 // Return true if any runtime check is added. 501 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 502 503 /// A type for vectorized values in the new loop. Each value from the 504 /// original loop, when vectorized, is represented by UF vector values in the 505 /// new unrolled loop, where UF is the unroll factor. 506 using VectorParts = SmallVector<Value *, 2>; 507 508 /// Vectorize a single GetElementPtrInst based on information gathered and 509 /// decisions taken during planning. 510 void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices, 511 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, 512 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State); 513 514 /// Vectorize a single PHINode in a block. This method handles the induction 515 /// variable canonicalization. It supports both VF = 1 for unrolled loops and 516 /// arbitrary length vectors. 517 void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc, 518 VPWidenPHIRecipe *PhiR, VPTransformState &State); 519 520 /// A helper function to scalarize a single Instruction in the innermost loop. 521 /// Generates a sequence of scalar instances for each lane between \p MinLane 522 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 523 /// inclusive. Uses the VPValue operands from \p Operands instead of \p 524 /// Instr's operands. 525 void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands, 526 const VPIteration &Instance, bool IfPredicateInstr, 527 VPTransformState &State); 528 529 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc 530 /// is provided, the integer induction variable will first be truncated to 531 /// the corresponding type. 532 void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, 533 VPValue *Def, VPValue *CastDef, 534 VPTransformState &State); 535 536 /// Construct the vector value of a scalarized value \p V one lane at a time. 537 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, 538 VPTransformState &State); 539 540 /// Try to vectorize interleaved access group \p Group with the base address 541 /// given in \p Addr, optionally masking the vector operations if \p 542 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 543 /// values in the vectorized loop. 544 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 545 ArrayRef<VPValue *> VPDefs, 546 VPTransformState &State, VPValue *Addr, 547 ArrayRef<VPValue *> StoredValues, 548 VPValue *BlockInMask = nullptr); 549 550 /// Vectorize Load and Store instructions with the base address given in \p 551 /// Addr, optionally masking the vector operations if \p BlockInMask is 552 /// non-null. Use \p State to translate given VPValues to IR values in the 553 /// vectorized loop. 554 void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, 555 VPValue *Def, VPValue *Addr, 556 VPValue *StoredValue, VPValue *BlockInMask); 557 558 /// Set the debug location in the builder using the debug location in 559 /// the instruction. 560 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); 561 562 /// Fix the non-induction PHIs in the OrigPHIsToFix vector. 
563 void fixNonInductionPHIs(VPTransformState &State); 564 565 /// Create a broadcast instruction. This method generates a broadcast 566 /// instruction (shuffle) for loop invariant values and for the induction 567 /// value. If this is the induction variable then we extend it to N, N+1, ... 568 /// this is needed because each iteration in the loop corresponds to a SIMD 569 /// element. 570 virtual Value *getBroadcastInstrs(Value *V); 571 572 protected: 573 friend class LoopVectorizationPlanner; 574 575 /// A small list of PHINodes. 576 using PhiVector = SmallVector<PHINode *, 4>; 577 578 /// A type for scalarized values in the new loop. Each value from the 579 /// original loop, when scalarized, is represented by UF x VF scalar values 580 /// in the new unrolled loop, where UF is the unroll factor and VF is the 581 /// vectorization factor. 582 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 583 584 /// Set up the values of the IVs correctly when exiting the vector loop. 585 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 586 Value *CountRoundDown, Value *EndValue, 587 BasicBlock *MiddleBlock); 588 589 /// Create a new induction variable inside L. 590 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 591 Value *Step, Instruction *DL); 592 593 /// Handle all cross-iteration phis in the header. 594 void fixCrossIterationPHIs(VPTransformState &State); 595 596 /// Fix a first-order recurrence. This is the second phase of vectorizing 597 /// this phi node. 598 void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State); 599 600 /// Fix a reduction cross-iteration phi. This is the second phase of 601 /// vectorizing this phi node. 602 void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State); 603 604 /// Clear NSW/NUW flags from reduction instructions if necessary. 605 void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 606 VPTransformState &State); 607 608 /// Fixup the LCSSA phi nodes in the unique exit block. This simply 609 /// means we need to add the appropriate incoming value from the middle 610 /// block as exiting edges from the scalar epilogue loop (if present) are 611 /// already in place, and we exit the vector loop exclusively to the middle 612 /// block. 613 void fixLCSSAPHIs(VPTransformState &State); 614 615 /// Iteratively sink the scalarized operands of a predicated instruction into 616 /// the block that was created for it. 617 void sinkScalarOperands(Instruction *PredInst); 618 619 /// Shrinks vector element sizes to the smallest bitwidth they can be legally 620 /// represented as. 621 void truncateToMinimalBitwidths(VPTransformState &State); 622 623 /// This function adds 624 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...) 625 /// to each vector element of Val. The sequence starts at StartIndex. 626 /// \p Opcode is relevant for FP induction variable. 627 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step, 628 Instruction::BinaryOps Opcode = 629 Instruction::BinaryOpsEnd); 630 631 /// Compute scalar induction steps. \p ScalarIV is the scalar induction 632 /// variable on which to base the steps, \p Step is the size of the step, and 633 /// \p EntryVal is the value from the original loop that maps to the steps. 634 /// Note that \p EntryVal doesn't have to be an induction variable - it 635 /// can also be a truncate instruction. 
636 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, 637 const InductionDescriptor &ID, VPValue *Def, 638 VPValue *CastDef, VPTransformState &State); 639 640 /// Create a vector induction phi node based on an existing scalar one. \p 641 /// EntryVal is the value from the original loop that maps to the vector phi 642 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a 643 /// truncate instruction, instead of widening the original IV, we widen a 644 /// version of the IV truncated to \p EntryVal's type. 645 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, 646 Value *Step, Value *Start, 647 Instruction *EntryVal, VPValue *Def, 648 VPValue *CastDef, 649 VPTransformState &State); 650 651 /// Returns true if an instruction \p I should be scalarized instead of 652 /// vectorized for the chosen vectorization factor. 653 bool shouldScalarizeInstruction(Instruction *I) const; 654 655 /// Returns true if we should generate a scalar version of \p IV. 656 bool needsScalarInduction(Instruction *IV) const; 657 658 /// If there is a cast involved in the induction variable \p ID, which should 659 /// be ignored in the vectorized loop body, this function records the 660 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 661 /// cast. We had already proved that the casted Phi is equal to the uncasted 662 /// Phi in the vectorized loop (under a runtime guard), and therefore 663 /// there is no need to vectorize the cast - the same value can be used in the 664 /// vector loop for both the Phi and the cast. 665 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, 666 /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 667 /// 668 /// \p EntryVal is the value from the original loop that maps to the vector 669 /// phi node and is used to distinguish what is the IV currently being 670 /// processed - original one (if \p EntryVal is a phi corresponding to the 671 /// original IV) or the "newly-created" one based on the proof mentioned above 672 /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the 673 /// latter case \p EntryVal is a TruncInst and we must not record anything for 674 /// that IV, but it's error-prone to expect callers of this routine to care 675 /// about that, hence this explicit parameter. 676 void recordVectorLoopValueForInductionCast( 677 const InductionDescriptor &ID, const Instruction *EntryVal, 678 Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, 679 unsigned Part, unsigned Lane = UINT_MAX); 680 681 /// Generate a shuffle sequence that will reverse the vector Vec. 682 virtual Value *reverseVector(Value *Vec); 683 684 /// Returns (and creates if needed) the original loop trip count. 685 Value *getOrCreateTripCount(Loop *NewLoop); 686 687 /// Returns (and creates if needed) the trip count of the widened loop. 688 Value *getOrCreateVectorTripCount(Loop *NewLoop); 689 690 /// Returns a bitcasted value to the requested vector type. 691 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 692 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 693 const DataLayout &DL); 694 695 /// Emit a bypass check to see if the vector trip count is zero, including if 696 /// it overflows. 697 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 698 699 /// Emit a bypass check to see if all of the SCEV assumptions we've 700 /// had to make are correct. 
Returns the block containing the checks or 701 /// nullptr if no checks have been added. 702 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); 703 704 /// Emit bypass checks to check any memory assumptions we may have made. 705 /// Returns the block containing the checks or nullptr if no checks have been 706 /// added. 707 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 708 709 /// Compute the transformed value of Index at offset StartValue using step 710 /// StepValue. 711 /// For integer induction, returns StartValue + Index * StepValue. 712 /// For pointer induction, returns StartValue[Index * StepValue]. 713 /// FIXME: The newly created binary instructions should contain nsw/nuw 714 /// flags, which can be found from the original scalar operations. 715 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, 716 const DataLayout &DL, 717 const InductionDescriptor &ID) const; 718 719 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 720 /// vector loop preheader, middle block and scalar preheader. Also 721 /// allocate a loop object for the new vector loop and return it. 722 Loop *createVectorLoopSkeleton(StringRef Prefix); 723 724 /// Create new phi nodes for the induction variables to resume iteration count 725 /// in the scalar epilogue, from where the vectorized loop left off (given by 726 /// \p VectorTripCount). 727 /// In cases where the loop skeleton is more complicated (eg. epilogue 728 /// vectorization) and the resume values can come from an additional bypass 729 /// block, the \p AdditionalBypass pair provides information about the bypass 730 /// block and the end value on the edge from bypass to this loop. 731 void createInductionResumeValues( 732 Loop *L, Value *VectorTripCount, 733 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 734 735 /// Complete the loop skeleton by adding debug MDs, creating appropriate 736 /// conditional branches in the middle block, preparing the builder and 737 /// running the verifier. Take in the vector loop \p L as argument, and return 738 /// the preheader of the completed vector loop. 739 BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); 740 741 /// Add additional metadata to \p To that was not present on \p Orig. 742 /// 743 /// Currently this is used to add the noalias annotations based on the 744 /// inserted memchecks. Use this for instructions that are *cloned* into the 745 /// vector loop. 746 void addNewMetadata(Instruction *To, const Instruction *Orig); 747 748 /// Add metadata from one instruction to another. 749 /// 750 /// This includes both the original MDs from \p From and additional ones (\see 751 /// addNewMetadata). Use this for *newly created* instructions in the vector 752 /// loop. 753 void addMetadata(Instruction *To, Instruction *From); 754 755 /// Similar to the previous function but it adds the metadata to a 756 /// vector of instructions. 757 void addMetadata(ArrayRef<Value *> To, Instruction *From); 758 759 /// Allow subclasses to override and print debug traces before/after vplan 760 /// execution, when trace information is requested. 761 virtual void printDebugTracesAtStart(){}; 762 virtual void printDebugTracesAtEnd(){}; 763 764 /// The original loop. 765 Loop *OrigLoop; 766 767 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 768 /// dynamic knowledge to simplify SCEV expressions and converts them to a 769 /// more usable form. 
770 PredicatedScalarEvolution &PSE; 771 772 /// Loop Info. 773 LoopInfo *LI; 774 775 /// Dominator Tree. 776 DominatorTree *DT; 777 778 /// Alias Analysis. 779 AAResults *AA; 780 781 /// Target Library Info. 782 const TargetLibraryInfo *TLI; 783 784 /// Target Transform Info. 785 const TargetTransformInfo *TTI; 786 787 /// Assumption Cache. 788 AssumptionCache *AC; 789 790 /// Interface to emit optimization remarks. 791 OptimizationRemarkEmitter *ORE; 792 793 /// LoopVersioning. It's only set up (non-null) if memchecks were 794 /// used. 795 /// 796 /// This is currently only used to add no-alias metadata based on the 797 /// memchecks. The actually versioning is performed manually. 798 std::unique_ptr<LoopVersioning> LVer; 799 800 /// The vectorization SIMD factor to use. Each vector will have this many 801 /// vector elements. 802 ElementCount VF; 803 804 /// The vectorization unroll factor to use. Each scalar is vectorized to this 805 /// many different vector instructions. 806 unsigned UF; 807 808 /// The builder that we use 809 IRBuilder<> Builder; 810 811 // --- Vectorization state --- 812 813 /// The vector-loop preheader. 814 BasicBlock *LoopVectorPreHeader; 815 816 /// The scalar-loop preheader. 817 BasicBlock *LoopScalarPreHeader; 818 819 /// Middle Block between the vector and the scalar. 820 BasicBlock *LoopMiddleBlock; 821 822 /// The (unique) ExitBlock of the scalar loop. Note that 823 /// there can be multiple exiting edges reaching this block. 824 BasicBlock *LoopExitBlock; 825 826 /// The vector loop body. 827 BasicBlock *LoopVectorBody; 828 829 /// The scalar loop body. 830 BasicBlock *LoopScalarBody; 831 832 /// A list of all bypass blocks. The first block is the entry of the loop. 833 SmallVector<BasicBlock *, 4> LoopBypassBlocks; 834 835 /// The new Induction variable which was added to the new block. 836 PHINode *Induction = nullptr; 837 838 /// The induction variable of the old basic block. 839 PHINode *OldInduction = nullptr; 840 841 /// Store instructions that were predicated. 842 SmallVector<Instruction *, 4> PredicatedInstructions; 843 844 /// Trip count of the original loop. 845 Value *TripCount = nullptr; 846 847 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 848 Value *VectorTripCount = nullptr; 849 850 /// The legality analysis. 851 LoopVectorizationLegality *Legal; 852 853 /// The profitablity analysis. 854 LoopVectorizationCostModel *Cost; 855 856 // Record whether runtime checks are added. 857 bool AddedSafetyChecks = false; 858 859 // Holds the end values for each induction variable. We save the end values 860 // so we can later fix-up the external users of the induction variables. 861 DenseMap<PHINode *, Value *> IVEndValues; 862 863 // Vector of original scalar PHIs whose corresponding widened PHIs need to be 864 // fixed up at the end of vector code generation. 865 SmallVector<PHINode *, 8> OrigPHIsToFix; 866 867 /// BFI and PSI are used to check for profile guided size optimizations. 868 BlockFrequencyInfo *BFI; 869 ProfileSummaryInfo *PSI; 870 871 // Whether this loop should be optimized for size based on profile guided size 872 // optimizatios. 873 bool OptForSizeBasedOnProfile; 874 875 /// Structure to hold information about generated runtime checks, responsible 876 /// for cleaning the checks, if vectorization turns out unprofitable. 
877 GeneratedRTChecks &RTChecks; 878 }; 879 880 class InnerLoopUnroller : public InnerLoopVectorizer { 881 public: 882 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 883 LoopInfo *LI, DominatorTree *DT, 884 const TargetLibraryInfo *TLI, 885 const TargetTransformInfo *TTI, AssumptionCache *AC, 886 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, 887 LoopVectorizationLegality *LVL, 888 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 889 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) 890 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 891 ElementCount::getFixed(1), UnrollFactor, LVL, CM, 892 BFI, PSI, Check) {} 893 894 private: 895 Value *getBroadcastInstrs(Value *V) override; 896 Value *getStepVector(Value *Val, int StartIdx, Value *Step, 897 Instruction::BinaryOps Opcode = 898 Instruction::BinaryOpsEnd) override; 899 Value *reverseVector(Value *Vec) override; 900 }; 901 902 /// Encapsulate information regarding vectorization of a loop and its epilogue. 903 /// This information is meant to be updated and used across two stages of 904 /// epilogue vectorization. 905 struct EpilogueLoopVectorizationInfo { 906 ElementCount MainLoopVF = ElementCount::getFixed(0); 907 unsigned MainLoopUF = 0; 908 ElementCount EpilogueVF = ElementCount::getFixed(0); 909 unsigned EpilogueUF = 0; 910 BasicBlock *MainLoopIterationCountCheck = nullptr; 911 BasicBlock *EpilogueIterationCountCheck = nullptr; 912 BasicBlock *SCEVSafetyCheck = nullptr; 913 BasicBlock *MemSafetyCheck = nullptr; 914 Value *TripCount = nullptr; 915 Value *VectorTripCount = nullptr; 916 917 EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF, 918 unsigned EUF) 919 : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF), 920 EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) { 921 assert(EUF == 1 && 922 "A high UF for the epilogue loop is likely not beneficial."); 923 } 924 }; 925 926 /// An extension of the inner loop vectorizer that creates a skeleton for a 927 /// vectorized loop that has its epilogue (residual) also vectorized. 928 /// The idea is to run the vplan on a given loop twice, firstly to setup the 929 /// skeleton and vectorize the main loop, and secondly to complete the skeleton 930 /// from the first step and vectorize the epilogue. This is achieved by 931 /// deriving two concrete strategy classes from this base class and invoking 932 /// them in succession from the loop vectorizer planner. 933 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { 934 public: 935 InnerLoopAndEpilogueVectorizer( 936 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 937 DominatorTree *DT, const TargetLibraryInfo *TLI, 938 const TargetTransformInfo *TTI, AssumptionCache *AC, 939 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 940 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 941 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 942 GeneratedRTChecks &Checks) 943 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 944 EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, 945 Checks), 946 EPI(EPI) {} 947 948 // Override this function to handle the more complex control flow around the 949 // three loops. 
950 BasicBlock *createVectorizedLoopSkeleton() final override { 951 return createEpilogueVectorizedLoopSkeleton(); 952 } 953 954 /// The interface for creating a vectorized skeleton using one of two 955 /// different strategies, each corresponding to one execution of the vplan 956 /// as described above. 957 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; 958 959 /// Holds and updates state information required to vectorize the main loop 960 /// and its epilogue in two separate passes. This setup helps us avoid 961 /// regenerating and recomputing runtime safety checks. It also helps us to 962 /// shorten the iteration-count-check path length for the cases where the 963 /// iteration count of the loop is so small that the main vector loop is 964 /// completely skipped. 965 EpilogueLoopVectorizationInfo &EPI; 966 }; 967 968 /// A specialized derived class of inner loop vectorizer that performs 969 /// vectorization of *main* loops in the process of vectorizing loops and their 970 /// epilogues. 971 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { 972 public: 973 EpilogueVectorizerMainLoop( 974 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 975 DominatorTree *DT, const TargetLibraryInfo *TLI, 976 const TargetTransformInfo *TTI, AssumptionCache *AC, 977 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 978 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 979 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 980 GeneratedRTChecks &Check) 981 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 982 EPI, LVL, CM, BFI, PSI, Check) {} 983 /// Implements the interface for creating a vectorized skeleton using the 984 /// *main loop* strategy (ie the first pass of vplan execution). 985 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 986 987 protected: 988 /// Emits an iteration count bypass check once for the main loop (when \p 989 /// ForEpilogue is false) and once for the epilogue loop (when \p 990 /// ForEpilogue is true). 991 BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass, 992 bool ForEpilogue); 993 void printDebugTracesAtStart() override; 994 void printDebugTracesAtEnd() override; 995 }; 996 997 // A specialized derived class of inner loop vectorizer that performs 998 // vectorization of *epilogue* loops in the process of vectorizing loops and 999 // their epilogues. 1000 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 1001 public: 1002 EpilogueVectorizerEpilogueLoop( 1003 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 1004 DominatorTree *DT, const TargetLibraryInfo *TLI, 1005 const TargetTransformInfo *TTI, AssumptionCache *AC, 1006 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 1007 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 1008 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 1009 GeneratedRTChecks &Checks) 1010 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1011 EPI, LVL, CM, BFI, PSI, Checks) {} 1012 /// Implements the interface for creating a vectorized skeleton using the 1013 /// *epilogue loop* strategy (ie the second pass of vplan execution). 
1014 BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; 1015 1016 protected: 1017 /// Emits an iteration count bypass check after the main vector loop has 1018 /// finished to see if there are any iterations left to execute by either 1019 /// the vector epilogue or the scalar epilogue. 1020 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L, 1021 BasicBlock *Bypass, 1022 BasicBlock *Insert); 1023 void printDebugTracesAtStart() override; 1024 void printDebugTracesAtEnd() override; 1025 }; 1026 } // end namespace llvm 1027 1028 /// Look for a meaningful debug location on the instruction or it's 1029 /// operands. 1030 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 1031 if (!I) 1032 return I; 1033 1034 DebugLoc Empty; 1035 if (I->getDebugLoc() != Empty) 1036 return I; 1037 1038 for (Use &Op : I->operands()) { 1039 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 1040 if (OpInst->getDebugLoc() != Empty) 1041 return OpInst; 1042 } 1043 1044 return I; 1045 } 1046 1047 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 1048 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 1049 const DILocation *DIL = Inst->getDebugLoc(); 1050 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 1051 !isa<DbgInfoIntrinsic>(Inst)) { 1052 assert(!VF.isScalable() && "scalable vectors not yet supported."); 1053 auto NewDIL = 1054 DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); 1055 if (NewDIL) 1056 B.SetCurrentDebugLocation(NewDIL.getValue()); 1057 else 1058 LLVM_DEBUG(dbgs() 1059 << "Failed to create new discriminator: " 1060 << DIL->getFilename() << " Line: " << DIL->getLine()); 1061 } 1062 else 1063 B.SetCurrentDebugLocation(DIL); 1064 } else 1065 B.SetCurrentDebugLocation(DebugLoc()); 1066 } 1067 1068 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 1069 /// is passed, the message relates to that particular instruction. 1070 #ifndef NDEBUG 1071 static void debugVectorizationMessage(const StringRef Prefix, 1072 const StringRef DebugMsg, 1073 Instruction *I) { 1074 dbgs() << "LV: " << Prefix << DebugMsg; 1075 if (I != nullptr) 1076 dbgs() << " " << *I; 1077 else 1078 dbgs() << '.'; 1079 dbgs() << '\n'; 1080 } 1081 #endif 1082 1083 /// Create an analysis remark that explains why vectorization failed 1084 /// 1085 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 1086 /// RemarkName is the identifier for the remark. If \p I is passed it is an 1087 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 1088 /// the location of the remark. \return the remark object that can be 1089 /// streamed to. 1090 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 1091 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 1092 Value *CodeRegion = TheLoop->getHeader(); 1093 DebugLoc DL = TheLoop->getStartLoc(); 1094 1095 if (I) { 1096 CodeRegion = I->getParent(); 1097 // If there is no debug location attached to the instruction, revert back to 1098 // using the loop's. 1099 if (I->getDebugLoc()) 1100 DL = I->getDebugLoc(); 1101 } 1102 1103 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 1104 } 1105 1106 /// Return a value for Step multiplied by VF. 
1107 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { 1108 assert(isa<ConstantInt>(Step) && "Expected an integer step"); 1109 Constant *StepVal = ConstantInt::get( 1110 Step->getType(), 1111 cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue()); 1112 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; 1113 } 1114 1115 namespace llvm { 1116 1117 /// Return the runtime value for VF. 1118 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) { 1119 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue()); 1120 return VF.isScalable() ? B.CreateVScale(EC) : EC; 1121 } 1122 1123 void reportVectorizationFailure(const StringRef DebugMsg, 1124 const StringRef OREMsg, const StringRef ORETag, 1125 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1126 Instruction *I) { 1127 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1128 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1129 ORE->emit( 1130 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1131 << "loop not vectorized: " << OREMsg); 1132 } 1133 1134 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1135 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1136 Instruction *I) { 1137 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1138 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1139 ORE->emit( 1140 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1141 << Msg); 1142 } 1143 1144 } // end namespace llvm 1145 1146 #ifndef NDEBUG 1147 /// \return string containing a file name and a line # for the given loop. 1148 static std::string getDebugLocString(const Loop *L) { 1149 std::string Result; 1150 if (L) { 1151 raw_string_ostream OS(Result); 1152 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1153 LoopDbgLoc.print(OS); 1154 else 1155 // Just print the module name. 1156 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1157 OS.flush(); 1158 } 1159 return Result; 1160 } 1161 #endif 1162 1163 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 1164 const Instruction *Orig) { 1165 // If the loop was versioned with memchecks, add the corresponding no-alias 1166 // metadata. 1167 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 1168 LVer->annotateInstWithNoAlias(To, Orig); 1169 } 1170 1171 void InnerLoopVectorizer::addMetadata(Instruction *To, 1172 Instruction *From) { 1173 propagateMetadata(To, From); 1174 addNewMetadata(To, From); 1175 } 1176 1177 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 1178 Instruction *From) { 1179 for (Value *V : To) { 1180 if (Instruction *I = dyn_cast<Instruction>(V)) 1181 addMetadata(I, From); 1182 } 1183 } 1184 1185 namespace llvm { 1186 1187 // Loop vectorization cost-model hints how the scalar epilogue loop should be 1188 // lowered. 1189 enum ScalarEpilogueLowering { 1190 1191 // The default: allowing scalar epilogues. 1192 CM_ScalarEpilogueAllowed, 1193 1194 // Vectorization with OptForSize: don't allow epilogues. 1195 CM_ScalarEpilogueNotAllowedOptSize, 1196 1197 // A special case of vectorisation with OptForSize: loops with a very small 1198 // trip count are considered for vectorization under OptForSize, thereby 1199 // making sure the cost of their loop body is dominant, free of runtime 1200 // guards and scalar iteration overheads. 1201 CM_ScalarEpilogueNotAllowedLowTripLoop, 1202 1203 // Loop hint predicate indicating an epilogue is undesired. 
1204 CM_ScalarEpilogueNotNeededUsePredicate, 1205 1206 // Directive indicating we must either tail fold or not vectorize 1207 CM_ScalarEpilogueNotAllowedUsePredicate 1208 }; 1209 1210 /// LoopVectorizationCostModel - estimates the expected speedups due to 1211 /// vectorization. 1212 /// In many cases vectorization is not profitable. This can happen because of 1213 /// a number of reasons. In this class we mainly attempt to predict the 1214 /// expected speedup/slowdowns due to the supported instruction set. We use the 1215 /// TargetTransformInfo to query the different backends for the cost of 1216 /// different operations. 1217 class LoopVectorizationCostModel { 1218 public: 1219 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, 1220 PredicatedScalarEvolution &PSE, LoopInfo *LI, 1221 LoopVectorizationLegality *Legal, 1222 const TargetTransformInfo &TTI, 1223 const TargetLibraryInfo *TLI, DemandedBits *DB, 1224 AssumptionCache *AC, 1225 OptimizationRemarkEmitter *ORE, const Function *F, 1226 const LoopVectorizeHints *Hints, 1227 InterleavedAccessInfo &IAI) 1228 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), 1229 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), 1230 Hints(Hints), InterleaveInfo(IAI) {} 1231 1232 /// \return An upper bound for the vectorization factor, or None if 1233 /// vectorization and interleaving should be avoided up front. 1234 Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC); 1235 1236 /// \return True if runtime checks are required for vectorization, and false 1237 /// otherwise. 1238 bool runtimeChecksRequired(); 1239 1240 /// \return The most profitable vectorization factor and the cost of that VF. 1241 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO 1242 /// then this vectorization factor will be selected if vectorization is 1243 /// possible. 1244 VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); 1245 VectorizationFactor 1246 selectEpilogueVectorizationFactor(const ElementCount MaxVF, 1247 const LoopVectorizationPlanner &LVP); 1248 1249 /// Setup cost-based decisions for user vectorization factor. 1250 void selectUserVectorizationFactor(ElementCount UserVF) { 1251 collectUniformsAndScalars(UserVF); 1252 collectInstsToScalarize(UserVF); 1253 } 1254 1255 /// \return The size (in bits) of the smallest and widest types in the code 1256 /// that needs to be vectorized. We ignore values that remain scalar such as 1257 /// 64 bit loop indices. 1258 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1259 1260 /// \return The desired interleave count. 1261 /// If interleave count has been specified by metadata it will be returned. 1262 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1263 /// are the selected vectorization factor and the cost of the selected VF. 1264 unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost); 1265 1266 /// Memory access instruction may be vectorized in more than one way. 1267 /// Form of instruction after vectorization depends on cost. 1268 /// This function takes cost-based decisions for Load/Store instructions 1269 /// and collects them in a map. This decisions map is used for building 1270 /// the lists of loop-uniform and loop-scalar instructions. 1271 /// The calculated cost is saved with widening decision in order to 1272 /// avoid redundant calculations. 
1273 void setCostBasedWideningDecision(ElementCount VF); 1274 1275 /// A struct that represents some properties of the register usage 1276 /// of a loop. 1277 struct RegisterUsage { 1278 /// Holds the number of loop invariant values that are used in the loop. 1279 /// The key is ClassID of target-provided register class. 1280 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1281 /// Holds the maximum number of concurrent live intervals in the loop. 1282 /// The key is ClassID of target-provided register class. 1283 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1284 }; 1285 1286 /// \return Returns information about the register usages of the loop for the 1287 /// given vectorization factors. 1288 SmallVector<RegisterUsage, 8> 1289 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1290 1291 /// Collect values we want to ignore in the cost model. 1292 void collectValuesToIgnore(); 1293 1294 /// Split reductions into those that happen in the loop, and those that happen 1295 /// outside. In loop reductions are collected into InLoopReductionChains. 1296 void collectInLoopReductions(); 1297 1298 /// \returns The smallest bitwidth each instruction can be represented with. 1299 /// The vector equivalents of these instructions should be truncated to this 1300 /// type. 1301 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1302 return MinBWs; 1303 } 1304 1305 /// \returns True if it is more profitable to scalarize instruction \p I for 1306 /// vectorization factor \p VF. 1307 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1308 assert(VF.isVector() && 1309 "Profitable to scalarize relevant only for VF > 1."); 1310 1311 // Cost model is not run in the VPlan-native path - return conservative 1312 // result until this changes. 1313 if (EnableVPlanNativePath) 1314 return false; 1315 1316 auto Scalars = InstsToScalarize.find(VF); 1317 assert(Scalars != InstsToScalarize.end() && 1318 "VF not yet analyzed for scalarization profitability"); 1319 return Scalars->second.find(I) != Scalars->second.end(); 1320 } 1321 1322 /// Returns true if \p I is known to be uniform after vectorization. 1323 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1324 if (VF.isScalar()) 1325 return true; 1326 1327 // Cost model is not run in the VPlan-native path - return conservative 1328 // result until this changes. 1329 if (EnableVPlanNativePath) 1330 return false; 1331 1332 auto UniformsPerVF = Uniforms.find(VF); 1333 assert(UniformsPerVF != Uniforms.end() && 1334 "VF not yet analyzed for uniformity"); 1335 return UniformsPerVF->second.count(I); 1336 } 1337 1338 /// Returns true if \p I is known to be scalar after vectorization. 1339 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1340 if (VF.isScalar()) 1341 return true; 1342 1343 // Cost model is not run in the VPlan-native path - return conservative 1344 // result until this changes. 1345 if (EnableVPlanNativePath) 1346 return false; 1347 1348 auto ScalarsPerVF = Scalars.find(VF); 1349 assert(ScalarsPerVF != Scalars.end() && 1350 "Scalar values are not calculated for VF"); 1351 return ScalarsPerVF->second.count(I); 1352 } 1353 1354 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1355 /// for vectorization factor \p VF. 
1356 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1357 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1358 !isProfitableToScalarize(I, VF) && 1359 !isScalarAfterVectorization(I, VF); 1360 } 1361 1362 /// Decision that was taken during cost calculation for memory instruction. 1363 enum InstWidening { 1364 CM_Unknown, 1365 CM_Widen, // For consecutive accesses with stride +1. 1366 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1367 CM_Interleave, 1368 CM_GatherScatter, 1369 CM_Scalarize 1370 }; 1371 1372 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1373 /// instruction \p I and vector width \p VF. 1374 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1375 InstructionCost Cost) { 1376 assert(VF.isVector() && "Expected VF >=2"); 1377 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1378 } 1379 1380 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1381 /// interleaving group \p Grp and vector width \p VF. 1382 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1383 ElementCount VF, InstWidening W, 1384 InstructionCost Cost) { 1385 assert(VF.isVector() && "Expected VF >=2"); 1386 // Broadcast this decision to all instructions inside the group. 1387 // But the cost will be assigned to one instruction only. 1388 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1389 if (auto *I = Grp->getMember(i)) { 1390 if (Grp->getInsertPos() == I) 1391 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1392 else 1393 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1394 } 1395 } 1396 } 1397 1398 /// Return the cost model decision for the given instruction \p I and vector 1399 /// width \p VF. Return CM_Unknown if this instruction did not pass 1400 /// through the cost modeling. 1401 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1402 assert(VF.isVector() && "Expected VF to be a vector VF"); 1403 // Cost model is not run in the VPlan-native path - return conservative 1404 // result until this changes. 1405 if (EnableVPlanNativePath) 1406 return CM_GatherScatter; 1407 1408 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1409 auto Itr = WideningDecisions.find(InstOnVF); 1410 if (Itr == WideningDecisions.end()) 1411 return CM_Unknown; 1412 return Itr->second.first; 1413 } 1414 1415 /// Return the vectorization cost for the given instruction \p I and vector 1416 /// width \p VF. 1417 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1418 assert(VF.isVector() && "Expected VF >=2"); 1419 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1420 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && 1421 "The cost is not calculated"); 1422 return WideningDecisions[InstOnVF].second; 1423 } 1424 1425 /// Return True if instruction \p I is an optimizable truncate whose operand 1426 /// is an induction variable. Such a truncate will be removed by adding a new 1427 /// induction variable with the destination type. 1428 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1429 // If the instruction is not a truncate, return false. 1430 auto *Trunc = dyn_cast<TruncInst>(I); 1431 if (!Trunc) 1432 return false; 1433 1434 // Get the source and destination types of the truncate.
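    // For illustration only (a hypothetical input, not checked here): for a
    // 'trunc i64 %iv to i32' with a fixed VF of 4, SrcTy below is <4 x i64>
    // and DestTy is <4 x i32>.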
1435 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1436 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1437 1438 // If the truncate is free for the given types, return false. Replacing a 1439 // free truncate with an induction variable would add an induction variable 1440 // update instruction to each iteration of the loop. We exclude from this 1441 // check the primary induction variable since it will need an update 1442 // instruction regardless. 1443 Value *Op = Trunc->getOperand(0); 1444 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1445 return false; 1446 1447 // If the truncated value is not an induction variable, return false. 1448 return Legal->isInductionPhi(Op); 1449 } 1450 1451 /// Collects the instructions to scalarize for each predicated instruction in 1452 /// the loop. 1453 void collectInstsToScalarize(ElementCount VF); 1454 1455 /// Collect Uniform and Scalar values for the given \p VF. 1456 /// The sets depend on CM decision for Load/Store instructions 1457 /// that may be vectorized as interleave, gather-scatter or scalarized. 1458 void collectUniformsAndScalars(ElementCount VF) { 1459 // Do the analysis once. 1460 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1461 return; 1462 setCostBasedWideningDecision(VF); 1463 collectLoopUniforms(VF); 1464 collectLoopScalars(VF); 1465 } 1466 1467 /// Returns true if the target machine supports masked store operation 1468 /// for the given \p DataType and kind of access to \p Ptr. 1469 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1470 return Legal->isConsecutivePtr(Ptr) && 1471 TTI.isLegalMaskedStore(DataType, Alignment); 1472 } 1473 1474 /// Returns true if the target machine supports masked load operation 1475 /// for the given \p DataType and kind of access to \p Ptr. 1476 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1477 return Legal->isConsecutivePtr(Ptr) && 1478 TTI.isLegalMaskedLoad(DataType, Alignment); 1479 } 1480 1481 /// Returns true if the target machine supports masked scatter operation 1482 /// for the given \p DataType. 1483 bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { 1484 return TTI.isLegalMaskedScatter(DataType, Alignment); 1485 } 1486 1487 /// Returns true if the target machine supports masked gather operation 1488 /// for the given \p DataType. 1489 bool isLegalMaskedGather(Type *DataType, Align Alignment) const { 1490 return TTI.isLegalMaskedGather(DataType, Alignment); 1491 } 1492 1493 /// Returns true if the target machine can represent \p V as a masked gather 1494 /// or scatter operation. 1495 bool isLegalGatherOrScatter(Value *V) { 1496 bool LI = isa<LoadInst>(V); 1497 bool SI = isa<StoreInst>(V); 1498 if (!LI && !SI) 1499 return false; 1500 auto *Ty = getMemInstValueType(V); 1501 Align Align = getLoadStoreAlignment(V); 1502 return (LI && isLegalMaskedGather(Ty, Align)) || 1503 (SI && isLegalMaskedScatter(Ty, Align)); 1504 } 1505 1506 /// Returns true if the target machine supports all of the reduction 1507 /// variables found for the given VF. 1508 bool canVectorizeReductions(ElementCount VF) { 1509 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1510 RecurrenceDescriptor RdxDesc = Reduction.second; 1511 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1512 })); 1513 } 1514 1515 /// Returns true if \p I is an instruction that will be scalarized with 1516 /// predication. 
Such instructions include conditional stores and 1517 /// instructions that may divide by zero. 1518 /// If a non-zero VF has been calculated, we check if I will be scalarized with 1519 /// predication for that VF. 1520 bool 1521 isScalarWithPredication(Instruction *I, 1522 ElementCount VF = ElementCount::getFixed(1)) const; 1523 1524 // Returns true if \p I is an instruction that will be predicated either 1525 // through scalar predication or masked load/store or masked gather/scatter. 1526 // Superset of instructions that return true for isScalarWithPredication. 1527 bool isPredicatedInst(Instruction *I, ElementCount VF) { 1528 if (!blockNeedsPredication(I->getParent())) 1529 return false; 1530 // Loads and stores that need some form of masked operation are predicated 1531 // instructions. 1532 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1533 return Legal->isMaskRequired(I); 1534 return isScalarWithPredication(I, VF); 1535 } 1536 1537 /// Returns true if \p I is a memory instruction with consecutive memory 1538 /// access that can be widened. 1539 bool 1540 memoryInstructionCanBeWidened(Instruction *I, 1541 ElementCount VF = ElementCount::getFixed(1)); 1542 1543 /// Returns true if \p I is a memory instruction in an interleaved-group 1544 /// of memory accesses that can be vectorized with wide vector loads/stores 1545 /// and shuffles. 1546 bool 1547 interleavedAccessCanBeWidened(Instruction *I, 1548 ElementCount VF = ElementCount::getFixed(1)); 1549 1550 /// Check if \p Instr belongs to any interleaved access group. 1551 bool isAccessInterleaved(Instruction *Instr) { 1552 return InterleaveInfo.isInterleaved(Instr); 1553 } 1554 1555 /// Get the interleaved access group that \p Instr belongs to. 1556 const InterleaveGroup<Instruction> * 1557 getInterleavedAccessGroup(Instruction *Instr) { 1558 return InterleaveInfo.getInterleaveGroup(Instr); 1559 } 1560 1561 /// Returns true if we're required to use a scalar epilogue for at least 1562 /// the final iteration of the original loop. 1563 bool requiresScalarEpilogue() const { 1564 if (!isScalarEpilogueAllowed()) 1565 return false; 1566 // If we might exit from anywhere but the latch, we must run the exiting 1567 // iteration in scalar form. 1568 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) 1569 return true; 1570 return InterleaveInfo.requiresScalarEpilogue(); 1571 } 1572 1573 /// Returns true if a scalar epilogue is allowed, i.e. it has not been 1574 /// disallowed due to optsize or a loop hint annotation. 1575 bool isScalarEpilogueAllowed() const { 1576 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; 1577 } 1578 1579 /// Returns true if all loop blocks should be masked to fold the tail of the loop. 1580 bool foldTailByMasking() const { return FoldTailByMasking; } 1581 1582 bool blockNeedsPredication(BasicBlock *BB) const { 1583 return foldTailByMasking() || Legal->blockNeedsPredication(BB); 1584 } 1585 1586 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi 1587 /// nodes to the chain of instructions representing the reductions. Uses a 1588 /// MapVector to ensure deterministic iteration order. 1589 using ReductionChainMap = 1590 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>; 1591 1592 /// Return the chain of instructions representing an inloop reduction. 1593 const ReductionChainMap &getInLoopReductionChains() const { 1594 return InLoopReductionChains; 1595 } 1596 1597 /// Returns true if the Phi is part of an inloop reduction.
1598 bool isInLoopReduction(PHINode *Phi) const { 1599 return InLoopReductionChains.count(Phi); 1600 } 1601 1602 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1603 /// with factor VF. Return the cost of the instruction, including 1604 /// scalarization overhead if it's needed. 1605 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; 1606 1607 /// Estimate cost of a call instruction CI if it were vectorized with factor 1608 /// VF. Return the cost of the instruction, including scalarization overhead 1609 /// if it's needed. The flag NeedToScalarize shows if the call needs to be 1610 /// scalarized - 1611 /// i.e. either a vector version isn't available, or it is too expensive. 1612 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, 1613 bool &NeedToScalarize) const; 1614 1615 /// Returns true if the per-lane cost of VectorizationFactor A is lower than 1616 /// that of B. 1617 bool isMoreProfitable(const VectorizationFactor &A, 1618 const VectorizationFactor &B) const; 1619 1620 /// Invalidates decisions already taken by the cost model. 1621 void invalidateCostModelingDecisions() { 1622 WideningDecisions.clear(); 1623 Uniforms.clear(); 1624 Scalars.clear(); 1625 } 1626 1627 private: 1628 unsigned NumPredStores = 0; 1629 1630 /// \return An upper bound for the vectorization factor, a power-of-2 larger 1631 /// than zero. One is returned if vectorization should best be avoided due 1632 /// to cost. 1633 ElementCount computeFeasibleMaxVF(unsigned ConstTripCount, 1634 ElementCount UserVF); 1635 1636 /// \return the maximized element count based on the target's vector 1637 /// registers and the loop trip-count, but limited to a maximum safe VF. 1638 /// This is a helper function of computeFeasibleMaxVF. 1639 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure 1640 /// issue that occurred on one of the buildbots which cannot be reproduced 1641 /// without having access to the proprietary compiler (see comments on 1642 /// D98509). The issue is currently under investigation and this workaround 1643 /// will be removed as soon as possible. 1644 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, 1645 unsigned SmallestType, 1646 unsigned WidestType, 1647 const ElementCount &MaxSafeVF); 1648 1649 /// \return the maximum legal scalable VF, based on the safe max number 1650 /// of elements. 1651 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); 1652 1653 /// The vectorization cost is a combination of the cost itself and a boolean 1654 /// indicating whether any of the contributing operations will actually 1655 /// operate on 1656 /// vector values after type legalization in the backend. If this latter value 1657 /// is 1658 /// false, then all operations will be scalarized (i.e. no vectorization has 1659 /// actually taken place). 1660 using VectorizationCostTy = std::pair<InstructionCost, bool>; 1661 1662 /// Returns the expected execution cost. The unit of the cost does 1663 /// not matter because we use the 'cost' units to compare different 1664 /// vector widths. The cost that is returned is *not* normalized by 1665 /// the factor width. 1666 VectorizationCostTy expectedCost(ElementCount VF); 1667 1668 /// Returns the execution time cost of an instruction for a given vector 1669 /// width. Vector width of one means scalar.
1670 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); 1671 1672 /// The cost-computation logic from getInstructionCost which provides 1673 /// the vector type as an output parameter. 1674 InstructionCost getInstructionCost(Instruction *I, ElementCount VF, 1675 Type *&VectorTy); 1676 1677 /// Return the cost of instructions in an inloop reduction pattern, if I is 1678 /// part of that pattern. 1679 InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF, 1680 Type *VectorTy, 1681 TTI::TargetCostKind CostKind); 1682 1683 /// Calculate vectorization cost of memory instruction \p I. 1684 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); 1685 1686 /// The cost computation for scalarized memory instruction. 1687 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); 1688 1689 /// The cost computation for interleaving group of memory instructions. 1690 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); 1691 1692 /// The cost computation for Gather/Scatter instruction. 1693 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); 1694 1695 /// The cost computation for widening instruction \p I with consecutive 1696 /// memory access. 1697 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); 1698 1699 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1700 /// Load: scalar load + broadcast. 1701 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1702 /// element) 1703 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); 1704 1705 /// Estimate the overhead of scalarizing an instruction. This is a 1706 /// convenience wrapper for the type-based getScalarizationOverhead API. 1707 InstructionCost getScalarizationOverhead(Instruction *I, 1708 ElementCount VF) const; 1709 1710 /// Returns whether the instruction is a load or store and will be emitted 1711 /// as a vector operation. 1712 bool isConsecutiveLoadOrStore(Instruction *I); 1713 1714 /// Returns true if an artificially high cost for emulated masked memrefs 1715 /// should be used. 1716 bool useEmulatedMaskMemRefHack(Instruction *I); 1717 1718 /// Map of scalar integer values to the smallest bitwidth they can be legally 1719 /// represented as. The vector equivalents of these values should be truncated 1720 /// to this type. 1721 MapVector<Instruction *, uint64_t> MinBWs; 1722 1723 /// A type representing the costs for instructions if they were to be 1724 /// scalarized rather than vectorized. The entries are Instruction-Cost 1725 /// pairs. 1726 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; 1727 1728 /// A set containing all BasicBlocks that are known to be present after 1729 /// vectorization as predicated blocks. 1730 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1731 1732 /// Records whether it is allowed to have the original scalar loop execute at 1733 /// least once. This may be needed as a fallback loop in case runtime 1734 /// aliasing/dependence checks fail, or to handle the tail/remainder 1735 /// iterations when the trip count is unknown or doesn't divide by the VF, 1736 /// or as a peel-loop to handle gaps in interleave-groups. 1737 /// Under optsize and when the trip count is very small we don't allow any 1738 /// iterations to execute in the scalar loop.
1739 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1740 1741 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1742 bool FoldTailByMasking = false; 1743 1744 /// A map holding scalar costs for different vectorization factors. The 1745 /// presence of a cost for an instruction in the mapping indicates that the 1746 /// instruction will be scalarized when vectorizing with the associated 1747 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1748 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; 1749 1750 /// Holds the instructions known to be uniform after vectorization. 1751 /// The data is collected per VF. 1752 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; 1753 1754 /// Holds the instructions known to be scalar after vectorization. 1755 /// The data is collected per VF. 1756 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; 1757 1758 /// Holds the instructions (address computations) that are forced to be 1759 /// scalarized. 1760 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1761 1762 /// PHINodes of the reductions that should be expanded in-loop along with 1763 /// their associated chains of reduction operations, in program order from top 1764 /// (PHI) to bottom 1765 ReductionChainMap InLoopReductionChains; 1766 1767 /// A Map of inloop reduction operations and their immediate chain operand. 1768 /// FIXME: This can be removed once reductions can be costed correctly in 1769 /// vplan. This was added to allow quick lookup to the inloop operations, 1770 /// without having to loop through InLoopReductionChains. 1771 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; 1772 1773 /// Returns the expected difference in cost from scalarizing the expression 1774 /// feeding a predicated instruction \p PredInst. The instructions to 1775 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1776 /// non-negative return value implies the expression will be scalarized. 1777 /// Currently, only single-use chains are considered for scalarization. 1778 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1779 ElementCount VF); 1780 1781 /// Collect the instructions that are uniform after vectorization. An 1782 /// instruction is uniform if we represent it with a single scalar value in 1783 /// the vectorized loop corresponding to each vector iteration. Examples of 1784 /// uniform instructions include pointer operands of consecutive or 1785 /// interleaved memory accesses. Note that although uniformity implies an 1786 /// instruction will be scalar, the reverse is not true. In general, a 1787 /// scalarized instruction will be represented by VF scalar values in the 1788 /// vectorized loop, each corresponding to an iteration of the original 1789 /// scalar loop. 1790 void collectLoopUniforms(ElementCount VF); 1791 1792 /// Collect the instructions that are scalar after vectorization. An 1793 /// instruction is scalar if it is known to be uniform or will be scalarized 1794 /// during vectorization. Non-uniform scalarized instructions will be 1795 /// represented by VF values in the vectorized loop, each corresponding to an 1796 /// iteration of the original scalar loop. 1797 void collectLoopScalars(ElementCount VF); 1798 1799 /// Keeps cost model vectorization decision and cost for instructions. 1800 /// Right now it is used for memory instructions only. 
1801 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1802 std::pair<InstWidening, InstructionCost>>; 1803 1804 DecisionList WideningDecisions; 1805 1806 /// Returns true if \p V is expected to be vectorized and it needs to be 1807 /// extracted. 1808 bool needsExtract(Value *V, ElementCount VF) const { 1809 Instruction *I = dyn_cast<Instruction>(V); 1810 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1811 TheLoop->isLoopInvariant(I)) 1812 return false; 1813 1814 // Assume we can vectorize V (and hence we need extraction) if the 1815 // scalars are not computed yet. This can happen, because it is called 1816 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1817 // the scalars are collected. That should be a safe assumption in most 1818 // cases, because we check if the operands have vectorizable types 1819 // beforehand in LoopVectorizationLegality. 1820 return Scalars.find(VF) == Scalars.end() || 1821 !isScalarAfterVectorization(I, VF); 1822 }; 1823 1824 /// Returns a range containing only operands needing to be extracted. 1825 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1826 ElementCount VF) const { 1827 return SmallVector<Value *, 4>(make_filter_range( 1828 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1829 } 1830 1831 /// Determines if we have the infrastructure to vectorize loop \p L and its 1832 /// epilogue, assuming the main loop is vectorized by \p VF. 1833 bool isCandidateForEpilogueVectorization(const Loop &L, 1834 const ElementCount VF) const; 1835 1836 /// Returns true if epilogue vectorization is considered profitable, and 1837 /// false otherwise. 1838 /// \p VF is the vectorization factor chosen for the original loop. 1839 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1840 1841 public: 1842 /// The loop that we evaluate. 1843 Loop *TheLoop; 1844 1845 /// Predicated scalar evolution analysis. 1846 PredicatedScalarEvolution &PSE; 1847 1848 /// Loop Info analysis. 1849 LoopInfo *LI; 1850 1851 /// Vectorization legality. 1852 LoopVectorizationLegality *Legal; 1853 1854 /// Vector target information. 1855 const TargetTransformInfo &TTI; 1856 1857 /// Target Library Info. 1858 const TargetLibraryInfo *TLI; 1859 1860 /// Demanded bits analysis. 1861 DemandedBits *DB; 1862 1863 /// Assumption cache. 1864 AssumptionCache *AC; 1865 1866 /// Interface to emit optimization remarks. 1867 OptimizationRemarkEmitter *ORE; 1868 1869 const Function *TheFunction; 1870 1871 /// Loop Vectorize Hint. 1872 const LoopVectorizeHints *Hints; 1873 1874 /// The interleave access information contains groups of interleaved accesses 1875 /// with the same stride and close to each other. 1876 InterleavedAccessInfo &InterleaveInfo; 1877 1878 /// Values to ignore in the cost model. 1879 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1880 1881 /// Values to ignore in the cost model when VF > 1. 1882 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1883 1884 /// Profitable vector factors. 1885 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1886 }; 1887 } // end namespace llvm 1888 1889 /// Helper struct to manage generating runtime checks for vectorization. 1890 /// 1891 /// The runtime checks are created up-front in temporary blocks to allow better 1892 /// estimating the cost and un-linked from the existing IR. After deciding to 1893 /// vectorize, the checks are moved back. If deciding not to vectorize, the 1894 /// temporary blocks are completely removed. 
1895 class GeneratedRTChecks { 1896 /// Basic block which contains the generated SCEV checks, if any. 1897 BasicBlock *SCEVCheckBlock = nullptr; 1898 1899 /// The value representing the result of the generated SCEV checks. If it is 1900 /// nullptr, either no SCEV checks have been generated or they have been used. 1901 Value *SCEVCheckCond = nullptr; 1902 1903 /// Basic block which contains the generated memory runtime checks, if any. 1904 BasicBlock *MemCheckBlock = nullptr; 1905 1906 /// The value representing the result of the generated memory runtime checks. 1907 /// If it is nullptr, either no memory runtime checks have been generated or 1908 /// they have been used. 1909 Instruction *MemRuntimeCheckCond = nullptr; 1910 1911 DominatorTree *DT; 1912 LoopInfo *LI; 1913 1914 SCEVExpander SCEVExp; 1915 SCEVExpander MemCheckExp; 1916 1917 public: 1918 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1919 const DataLayout &DL) 1920 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), 1921 MemCheckExp(SE, DL, "scev.check") {} 1922 1923 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1924 /// accurately estimate the cost of the runtime checks. The blocks are 1925 /// un-linked from the IR and are added back during vector code generation. If 1926 /// there is no vector code generation, the check blocks are removed 1927 /// completely. 1928 void Create(Loop *L, const LoopAccessInfo &LAI, 1929 const SCEVUnionPredicate &UnionPred) { 1930 1931 BasicBlock *LoopHeader = L->getHeader(); 1932 BasicBlock *Preheader = L->getLoopPreheader(); 1933 1934 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1935 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1936 // may be used by SCEVExpander. The blocks will be un-linked from their 1937 // predecessors and removed from LI & DT at the end of the function. 1938 if (!UnionPred.isAlwaysTrue()) { 1939 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1940 nullptr, "vector.scevcheck"); 1941 1942 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1943 &UnionPred, SCEVCheckBlock->getTerminator()); 1944 } 1945 1946 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 1947 if (RtPtrChecking.Need) { 1948 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; 1949 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 1950 "vector.memcheck"); 1951 1952 std::tie(std::ignore, MemRuntimeCheckCond) = 1953 addRuntimeChecks(MemCheckBlock->getTerminator(), L, 1954 RtPtrChecking.getChecks(), MemCheckExp); 1955 assert(MemRuntimeCheckCond && 1956 "no RT checks generated although RtPtrChecking " 1957 "claimed checks are required"); 1958 } 1959 1960 if (!MemCheckBlock && !SCEVCheckBlock) 1961 return; 1962 1963 // Unhook the temporary blocks with the checks and update various places 1964 // accordingly.
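    // Illustrative sketch (assuming both kinds of checks were created above):
    //   preheader -> vector.scevcheck -> vector.memcheck -> loop header
    // becomes
    //   preheader -> loop header
    // while vector.scevcheck and vector.memcheck are kept aside with
    // 'unreachable' terminators and dropped from LI/DT until they are either
    // re-linked during vector code generation or deleted.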
1965 if (SCEVCheckBlock) 1966 SCEVCheckBlock->replaceAllUsesWith(Preheader); 1967 if (MemCheckBlock) 1968 MemCheckBlock->replaceAllUsesWith(Preheader); 1969 1970 if (SCEVCheckBlock) { 1971 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1972 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 1973 Preheader->getTerminator()->eraseFromParent(); 1974 } 1975 if (MemCheckBlock) { 1976 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 1977 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 1978 Preheader->getTerminator()->eraseFromParent(); 1979 } 1980 1981 DT->changeImmediateDominator(LoopHeader, Preheader); 1982 if (MemCheckBlock) { 1983 DT->eraseNode(MemCheckBlock); 1984 LI->removeBlock(MemCheckBlock); 1985 } 1986 if (SCEVCheckBlock) { 1987 DT->eraseNode(SCEVCheckBlock); 1988 LI->removeBlock(SCEVCheckBlock); 1989 } 1990 } 1991 1992 /// Remove the created SCEV & memory runtime check blocks & instructions, if 1993 /// unused. 1994 ~GeneratedRTChecks() { 1995 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); 1996 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); 1997 if (!SCEVCheckCond) 1998 SCEVCleaner.markResultUsed(); 1999 2000 if (!MemRuntimeCheckCond) 2001 MemCheckCleaner.markResultUsed(); 2002 2003 if (MemRuntimeCheckCond) { 2004 auto &SE = *MemCheckExp.getSE(); 2005 // Memory runtime check generation creates compares that use expanded 2006 // values. Remove them before running the SCEVExpanderCleaners. 2007 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2008 if (MemCheckExp.isInsertedInstruction(&I)) 2009 continue; 2010 SE.forgetValue(&I); 2011 SE.eraseValueFromMap(&I); 2012 I.eraseFromParent(); 2013 } 2014 } 2015 MemCheckCleaner.cleanup(); 2016 SCEVCleaner.cleanup(); 2017 2018 if (SCEVCheckCond) 2019 SCEVCheckBlock->eraseFromParent(); 2020 if (MemRuntimeCheckCond) 2021 MemCheckBlock->eraseFromParent(); 2022 } 2023 2024 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2025 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2026 /// depending on the generated condition. 2027 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, 2028 BasicBlock *LoopVectorPreHeader, 2029 BasicBlock *LoopExitBlock) { 2030 if (!SCEVCheckCond) 2031 return nullptr; 2032 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) 2033 if (C->isZero()) 2034 return nullptr; 2035 2036 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2037 2038 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2039 // Create new preheader for vector loop. 2040 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2041 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2042 2043 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2044 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2045 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2046 SCEVCheckBlock); 2047 2048 DT->addNewBlock(SCEVCheckBlock, Pred); 2049 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2050 2051 ReplaceInstWithInst( 2052 SCEVCheckBlock->getTerminator(), 2053 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); 2054 // Mark the check as used, to prevent it from being removed during cleanup. 2055 SCEVCheckCond = nullptr; 2056 return SCEVCheckBlock; 2057 } 2058 2059 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2060 /// the branches to branch to the vector preheader or \p Bypass, depending on 2061 /// the generated condition. 
2062 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, 2063 BasicBlock *LoopVectorPreHeader) { 2064 // Check if we generated code that checks in runtime if arrays overlap. 2065 if (!MemRuntimeCheckCond) 2066 return nullptr; 2067 2068 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2069 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2070 MemCheckBlock); 2071 2072 DT->addNewBlock(MemCheckBlock, Pred); 2073 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2074 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2075 2076 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) 2077 PL->addBasicBlockToLoop(MemCheckBlock, *LI); 2078 2079 ReplaceInstWithInst( 2080 MemCheckBlock->getTerminator(), 2081 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); 2082 MemCheckBlock->getTerminator()->setDebugLoc( 2083 Pred->getTerminator()->getDebugLoc()); 2084 2085 // Mark the check as used, to prevent it from being removed during cleanup. 2086 MemRuntimeCheckCond = nullptr; 2087 return MemCheckBlock; 2088 } 2089 }; 2090 2091 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 2092 // vectorization. The loop needs to be annotated with #pragma omp simd 2093 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2094 // vector length information is not provided, vectorization is not considered 2095 // explicit. Interleave hints are not allowed either. These limitations will be 2096 // relaxed in the future. 2097 // Please, note that we are currently forced to abuse the pragma 'clang 2098 // vectorize' semantics. This pragma provides *auto-vectorization hints* 2099 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2100 // provides *explicit vectorization hints* (LV can bypass legal checks and 2101 // assume that vectorization is legal). However, both hints are implemented 2102 // using the same metadata (llvm.loop.vectorize, processed by 2103 // LoopVectorizeHints). This will be fixed in the future when the native IR 2104 // representation for pragma 'omp simd' is introduced. 2105 static bool isExplicitVecOuterLoop(Loop *OuterLp, 2106 OptimizationRemarkEmitter *ORE) { 2107 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2108 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2109 2110 // Only outer loops with an explicit vectorization hint are supported. 2111 // Unannotated outer loops are ignored. 2112 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2113 return false; 2114 2115 Function *Fn = OuterLp->getHeader()->getParent(); 2116 if (!Hints.allowVectorization(Fn, OuterLp, 2117 true /*VectorizeOnlyWhenForced*/)) { 2118 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2119 return false; 2120 } 2121 2122 if (Hints.getInterleave() > 1) { 2123 // TODO: Interleave support is future work. 2124 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2125 "outer loops.\n"); 2126 Hints.emitRemarkWithHints(); 2127 return false; 2128 } 2129 2130 return true; 2131 } 2132 2133 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2134 OptimizationRemarkEmitter *ORE, 2135 SmallVectorImpl<Loop *> &V) { 2136 // Collect inner loops and outer loops without irreducible control flow. For 2137 // now, only collect outer loops that have explicit vectorization hints. If we 2138 // are stress testing the VPlan H-CFG construction, we collect the outermost 2139 // loop of every loop nest. 
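// For example (illustrative only), with the VPlan-native path enabled an
// outer loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// is collected by the check below, while an unannotated outer loop is only
// traversed to find supported inner loops.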
2140 if (L.isInnermost() || VPlanBuildStressTest || 2141 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2142 LoopBlocksRPO RPOT(&L); 2143 RPOT.perform(LI); 2144 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2145 V.push_back(&L); 2146 // TODO: Collect inner loops inside marked outer loops in case 2147 // vectorization fails for the outer loop. Do not invoke 2148 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2149 // already known to be reducible. We can use an inherited attribute for 2150 // that. 2151 return; 2152 } 2153 } 2154 for (Loop *InnerL : L) 2155 collectSupportedLoops(*InnerL, LI, ORE, V); 2156 } 2157 2158 namespace { 2159 2160 /// The LoopVectorize Pass. 2161 struct LoopVectorize : public FunctionPass { 2162 /// Pass identification, replacement for typeid 2163 static char ID; 2164 2165 LoopVectorizePass Impl; 2166 2167 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 2168 bool VectorizeOnlyWhenForced = false) 2169 : FunctionPass(ID), 2170 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 2171 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 2172 } 2173 2174 bool runOnFunction(Function &F) override { 2175 if (skipFunction(F)) 2176 return false; 2177 2178 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2179 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2180 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 2181 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2182 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 2183 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 2184 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 2185 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2186 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 2187 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 2188 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 2189 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 2190 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 2191 2192 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 2193 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 2194 2195 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 2196 GetLAA, *ORE, PSI).MadeAnyChange; 2197 } 2198 2199 void getAnalysisUsage(AnalysisUsage &AU) const override { 2200 AU.addRequired<AssumptionCacheTracker>(); 2201 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 2202 AU.addRequired<DominatorTreeWrapperPass>(); 2203 AU.addRequired<LoopInfoWrapperPass>(); 2204 AU.addRequired<ScalarEvolutionWrapperPass>(); 2205 AU.addRequired<TargetTransformInfoWrapperPass>(); 2206 AU.addRequired<AAResultsWrapperPass>(); 2207 AU.addRequired<LoopAccessLegacyAnalysis>(); 2208 AU.addRequired<DemandedBitsWrapperPass>(); 2209 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 2210 AU.addRequired<InjectTLIMappingsLegacy>(); 2211 2212 // We currently do not preserve loopinfo/dominator analyses with outer loop 2213 // vectorization. Until this is addressed, mark these analyses as preserved 2214 // only for non-VPlan-native path. 2215 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
2216 if (!EnableVPlanNativePath) { 2217 AU.addPreserved<LoopInfoWrapperPass>(); 2218 AU.addPreserved<DominatorTreeWrapperPass>(); 2219 } 2220 2221 AU.addPreserved<BasicAAWrapperPass>(); 2222 AU.addPreserved<GlobalsAAWrapperPass>(); 2223 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 2224 } 2225 }; 2226 2227 } // end anonymous namespace 2228 2229 //===----------------------------------------------------------------------===// 2230 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2231 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2232 //===----------------------------------------------------------------------===// 2233 2234 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2235 // We need to place the broadcast of invariant variables outside the loop, 2236 // but only if it's proven safe to do so. Else, broadcast will be inside 2237 // vector loop body. 2238 Instruction *Instr = dyn_cast<Instruction>(V); 2239 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2240 (!Instr || 2241 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2242 // Place the code for broadcasting invariant variables in the new preheader. 2243 IRBuilder<>::InsertPointGuard Guard(Builder); 2244 if (SafeToHoist) 2245 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2246 2247 // Broadcast the scalar into all locations in the vector. 2248 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2249 2250 return Shuf; 2251 } 2252 2253 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2254 const InductionDescriptor &II, Value *Step, Value *Start, 2255 Instruction *EntryVal, VPValue *Def, VPValue *CastDef, 2256 VPTransformState &State) { 2257 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2258 "Expected either an induction phi-node or a truncate of it!"); 2259 2260 // Construct the initial value of the vector IV in the vector loop preheader 2261 auto CurrIP = Builder.saveIP(); 2262 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2263 if (isa<TruncInst>(EntryVal)) { 2264 assert(Start->getType()->isIntegerTy() && 2265 "Truncation requires an integer type"); 2266 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2267 Step = Builder.CreateTrunc(Step, TruncType); 2268 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2269 } 2270 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2271 Value *SteppedStart = 2272 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2273 2274 // We create vector phi nodes for both integer and floating-point induction 2275 // variables. Here, we determine the kind of arithmetic we will perform. 2276 Instruction::BinaryOps AddOp; 2277 Instruction::BinaryOps MulOp; 2278 if (Step->getType()->isIntegerTy()) { 2279 AddOp = Instruction::Add; 2280 MulOp = Instruction::Mul; 2281 } else { 2282 AddOp = II.getInductionOpcode(); 2283 MulOp = Instruction::FMul; 2284 } 2285 2286 // Multiply the vectorization factor by the step using integer or 2287 // floating-point arithmetic as appropriate. 2288 Type *StepType = Step->getType(); 2289 if (Step->getType()->isFloatingPointTy()) 2290 StepType = IntegerType::get(StepType->getContext(), 2291 StepType->getScalarSizeInBits()); 2292 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF); 2293 if (Step->getType()->isFloatingPointTy()) 2294 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType()); 2295 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); 2296 2297 // Create a vector splat to use in the induction update. 
2298 // 2299 // FIXME: If the step is non-constant, we create the vector splat with 2300 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2301 // handle a constant vector splat. 2302 Value *SplatVF = isa<Constant>(Mul) 2303 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2304 : Builder.CreateVectorSplat(VF, Mul); 2305 Builder.restoreIP(CurrIP); 2306 2307 // We may need to add the step a number of times, depending on the unroll 2308 // factor. The last of those goes into the PHI. 2309 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2310 &*LoopVectorBody->getFirstInsertionPt()); 2311 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2312 Instruction *LastInduction = VecInd; 2313 for (unsigned Part = 0; Part < UF; ++Part) { 2314 State.set(Def, LastInduction, Part); 2315 2316 if (isa<TruncInst>(EntryVal)) 2317 addMetadata(LastInduction, EntryVal); 2318 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, 2319 State, Part); 2320 2321 LastInduction = cast<Instruction>( 2322 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); 2323 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2324 } 2325 2326 // Move the last step to the end of the latch block. This ensures consistent 2327 // placement of all induction updates. 2328 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2329 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2330 auto *ICmp = cast<Instruction>(Br->getCondition()); 2331 LastInduction->moveBefore(ICmp); 2332 LastInduction->setName("vec.ind.next"); 2333 2334 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2335 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2336 } 2337 2338 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2339 return Cost->isScalarAfterVectorization(I, VF) || 2340 Cost->isProfitableToScalarize(I, VF); 2341 } 2342 2343 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2344 if (shouldScalarizeInstruction(IV)) 2345 return true; 2346 auto isScalarInst = [&](User *U) -> bool { 2347 auto *I = cast<Instruction>(U); 2348 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2349 }; 2350 return llvm::any_of(IV->users(), isScalarInst); 2351 } 2352 2353 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2354 const InductionDescriptor &ID, const Instruction *EntryVal, 2355 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, 2356 unsigned Part, unsigned Lane) { 2357 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2358 "Expected either an induction phi-node or a truncate of it!"); 2359 2360 // This induction variable is not the phi from the original loop but the 2361 // newly-created IV based on the proof that casted Phi is equal to the 2362 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2363 // re-uses the same InductionDescriptor that original IV uses but we don't 2364 // have to do any recording in this case - that is done when original IV is 2365 // processed. 2366 if (isa<TruncInst>(EntryVal)) 2367 return; 2368 2369 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2370 if (Casts.empty()) 2371 return; 2372 // Only the first Cast instruction in the Casts vector is of interest. 2373 // The rest of the Casts (if exist) have no uses outside the 2374 // induction update chain itself. 
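  // Illustrative shape of such a chain (hypothetical IR, not verified here):
  //   %iv      = phi i32 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %iv.cast = sext i32 %iv to i64   ; Casts[0], proven equal to %iv
  // The value already computed for the widened IV is simply recorded for the
  // cast's VPValue below.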
2375 if (Lane < UINT_MAX) 2376 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); 2377 else 2378 State.set(CastDef, VectorLoopVal, Part); 2379 } 2380 2381 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2382 TruncInst *Trunc, VPValue *Def, 2383 VPValue *CastDef, 2384 VPTransformState &State) { 2385 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2386 "Primary induction variable must have an integer type"); 2387 2388 auto II = Legal->getInductionVars().find(IV); 2389 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2390 2391 auto ID = II->second; 2392 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2393 2394 // The value from the original loop to which we are mapping the new induction 2395 // variable. 2396 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2397 2398 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2399 2400 // Generate code for the induction step. Note that induction steps are 2401 // required to be loop-invariant 2402 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2403 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2404 "Induction step should be loop invariant"); 2405 if (PSE.getSE()->isSCEVable(IV->getType())) { 2406 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2407 return Exp.expandCodeFor(Step, Step->getType(), 2408 LoopVectorPreHeader->getTerminator()); 2409 } 2410 return cast<SCEVUnknown>(Step)->getValue(); 2411 }; 2412 2413 // The scalar value to broadcast. This is derived from the canonical 2414 // induction variable. If a truncation type is given, truncate the canonical 2415 // induction variable and step. Otherwise, derive these values from the 2416 // induction descriptor. 2417 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2418 Value *ScalarIV = Induction; 2419 if (IV != OldInduction) { 2420 ScalarIV = IV->getType()->isIntegerTy() 2421 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2422 : Builder.CreateCast(Instruction::SIToFP, Induction, 2423 IV->getType()); 2424 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2425 ScalarIV->setName("offset.idx"); 2426 } 2427 if (Trunc) { 2428 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2429 assert(Step->getType()->isIntegerTy() && 2430 "Truncation requires an integer step"); 2431 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2432 Step = Builder.CreateTrunc(Step, TruncType); 2433 } 2434 return ScalarIV; 2435 }; 2436 2437 // Create the vector values from the scalar IV, in the absence of creating a 2438 // vector IV. 2439 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2440 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2441 for (unsigned Part = 0; Part < UF; ++Part) { 2442 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2443 Value *EntryPart = 2444 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2445 ID.getInductionOpcode()); 2446 State.set(Def, EntryPart, Part); 2447 if (Trunc) 2448 addMetadata(EntryPart, Trunc); 2449 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, 2450 State, Part); 2451 } 2452 }; 2453 2454 // Fast-math-flags propagate from the original induction instruction. 
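  // For example (illustrative), if the original FP induction update is
  //   %iv.next = fadd fast float %iv, %step
  // then the 'fast' flags set on the builder below end up on the FP adds and
  // multiplies created for the widened induction.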
2455 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 2456 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp())) 2457 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); 2458 2459 // Now do the actual transformations, and start with creating the step value. 2460 Value *Step = CreateStepValue(ID.getStep()); 2461 if (VF.isZero() || VF.isScalar()) { 2462 Value *ScalarIV = CreateScalarIV(Step); 2463 CreateSplatIV(ScalarIV, Step); 2464 return; 2465 } 2466 2467 // Determine if we want a scalar version of the induction variable. This is 2468 // true if the induction variable itself is not widened, or if it has at 2469 // least one user in the loop that is not widened. 2470 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2471 if (!NeedsScalarIV) { 2472 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2473 State); 2474 return; 2475 } 2476 2477 // Try to create a new independent vector induction variable. If we can't 2478 // create the phi node, we will splat the scalar induction variable in each 2479 // loop iteration. 2480 if (!shouldScalarizeInstruction(EntryVal)) { 2481 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, 2482 State); 2483 Value *ScalarIV = CreateScalarIV(Step); 2484 // Create scalar steps that can be used by instructions we will later 2485 // scalarize. Note that the addition of the scalar steps will not increase 2486 // the number of instructions in the loop in the common case prior to 2487 // InstCombine. We will be trading one vector extract for each scalar step. 2488 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2489 return; 2490 } 2491 2492 // All IV users are scalar instructions, so only emit a scalar IV, not a 2493 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2494 // predicate used by the masked loads/stores. 2495 Value *ScalarIV = CreateScalarIV(Step); 2496 if (!Cost->isScalarEpilogueAllowed()) 2497 CreateSplatIV(ScalarIV, Step); 2498 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); 2499 } 2500 2501 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2502 Instruction::BinaryOps BinOp) { 2503 // Create and check the types. 2504 auto *ValVTy = cast<VectorType>(Val->getType()); 2505 ElementCount VLen = ValVTy->getElementCount(); 2506 2507 Type *STy = Val->getType()->getScalarType(); 2508 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2509 "Induction Step must be an integer or FP"); 2510 assert(Step->getType() == STy && "Step has wrong type"); 2511 2512 SmallVector<Constant *, 8> Indices; 2513 2514 // Create a vector of consecutive numbers from zero to VF. 2515 VectorType *InitVecValVTy = ValVTy; 2516 Type *InitVecValSTy = STy; 2517 if (STy->isFloatingPointTy()) { 2518 InitVecValSTy = 2519 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits()); 2520 InitVecValVTy = VectorType::get(InitVecValSTy, VLen); 2521 } 2522 Value *InitVec = Builder.CreateStepVector(InitVecValVTy); 2523 2524 // Add on StartIdx 2525 Value *StartIdxSplat = Builder.CreateVectorSplat( 2526 VLen, ConstantInt::get(InitVecValSTy, StartIdx)); 2527 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat); 2528 2529 if (STy->isIntegerTy()) { 2530 Step = Builder.CreateVectorSplat(VLen, Step); 2531 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2532 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2533 // which can be found from the original scalar operations. 
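    // Illustrative result (fixed VF of 4, StartIdx == 0): with Val equal to
    // <i, i, i, i> and scalar step s, the multiply and add below produce
    // <i, i + s, i + 2*s, i + 3*s>.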
2534 Step = Builder.CreateMul(InitVec, Step); 2535 return Builder.CreateAdd(Val, Step, "induction"); 2536 } 2537 2538 // Floating point induction. 2539 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2540 "Binary Opcode should be specified for FP induction"); 2541 InitVec = Builder.CreateUIToFP(InitVec, ValVTy); 2542 Step = Builder.CreateVectorSplat(VLen, Step); 2543 Value *MulOp = Builder.CreateFMul(InitVec, Step); 2544 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2545 } 2546 2547 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2548 Instruction *EntryVal, 2549 const InductionDescriptor &ID, 2550 VPValue *Def, VPValue *CastDef, 2551 VPTransformState &State) { 2552 // We shouldn't have to build scalar steps if we aren't vectorizing. 2553 assert(VF.isVector() && "VF should be greater than one"); 2554 // Get the value type and ensure it and the step have the same integer type. 2555 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2556 assert(ScalarIVTy == Step->getType() && 2557 "Val and Step should have the same type"); 2558 2559 // We build scalar steps for both integer and floating-point induction 2560 // variables. Here, we determine the kind of arithmetic we will perform. 2561 Instruction::BinaryOps AddOp; 2562 Instruction::BinaryOps MulOp; 2563 if (ScalarIVTy->isIntegerTy()) { 2564 AddOp = Instruction::Add; 2565 MulOp = Instruction::Mul; 2566 } else { 2567 AddOp = ID.getInductionOpcode(); 2568 MulOp = Instruction::FMul; 2569 } 2570 2571 // Determine the number of scalars we need to generate for each unroll 2572 // iteration. If EntryVal is uniform, we only need to generate the first 2573 // lane. Otherwise, we generate all VF values. 2574 bool IsUniform = 2575 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); 2576 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); 2577 // Compute the scalar steps and save the results in State. 2578 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2579 ScalarIVTy->getScalarSizeInBits()); 2580 Type *VecIVTy = nullptr; 2581 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; 2582 if (!IsUniform && VF.isScalable()) { 2583 VecIVTy = VectorType::get(ScalarIVTy, VF); 2584 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); 2585 SplatStep = Builder.CreateVectorSplat(VF, Step); 2586 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); 2587 } 2588 2589 for (unsigned Part = 0; Part < UF; ++Part) { 2590 Value *StartIdx0 = 2591 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2592 2593 if (!IsUniform && VF.isScalable()) { 2594 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); 2595 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); 2596 if (ScalarIVTy->isFloatingPointTy()) 2597 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); 2598 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); 2599 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); 2600 State.set(Def, Add, Part); 2601 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2602 Part); 2603 // It's useful to record the lane values too for the known minimum number 2604 // of elements so we do those below. This improves the code quality when 2605 // trying to extract the first element, for example. 
2606 } 2607 2608 if (ScalarIVTy->isFloatingPointTy()) 2609 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy); 2610 2611 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2612 Value *StartIdx = Builder.CreateBinOp( 2613 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); 2614 // The step returned by `createStepForVF` is a runtime-evaluated value 2615 // when VF is scalable. Otherwise, it should be folded into a Constant. 2616 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2617 "Expected StartIdx to be folded to a constant when VF is not " 2618 "scalable"); 2619 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); 2620 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); 2621 State.set(Def, Add, VPIteration(Part, Lane)); 2622 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, 2623 Part, Lane); 2624 } 2625 } 2626 } 2627 2628 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def, 2629 const VPIteration &Instance, 2630 VPTransformState &State) { 2631 Value *ScalarInst = State.get(Def, Instance); 2632 Value *VectorValue = State.get(Def, Instance.Part); 2633 VectorValue = Builder.CreateInsertElement( 2634 VectorValue, ScalarInst, 2635 Instance.Lane.getAsRuntimeExpr(State.Builder, VF)); 2636 State.set(Def, VectorValue, Instance.Part); 2637 } 2638 2639 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2640 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2641 return Builder.CreateVectorReverse(Vec, "reverse"); 2642 } 2643 2644 // Return whether we allow using masked interleave-groups (for dealing with 2645 // strided loads/stores that reside in predicated blocks, or for dealing 2646 // with gaps). 2647 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2648 // If an override option has been passed in for interleaved accesses, use it. 2649 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2650 return EnableMaskedInterleavedMemAccesses; 2651 2652 return TTI.enableMaskedInterleavedAccessVectorization(); 2653 } 2654 2655 // Try to vectorize the interleave group that \p Instr belongs to. 2656 // 2657 // E.g. Translate following interleaved load group (factor = 3): 2658 // for (i = 0; i < N; i+=3) { 2659 // R = Pic[i]; // Member of index 0 2660 // G = Pic[i+1]; // Member of index 1 2661 // B = Pic[i+2]; // Member of index 2 2662 // ... // do something to R, G, B 2663 // } 2664 // To: 2665 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2666 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2667 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2668 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2669 // 2670 // Or translate following interleaved store group (factor = 3): 2671 // for (i = 0; i < N; i+=3) { 2672 // ... 
do something to R, G, B 2673 // Pic[i] = R; // Member of index 0 2674 // Pic[i+1] = G; // Member of index 1 2675 // Pic[i+2] = B; // Member of index 2 2676 // } 2677 // To: 2678 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2679 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2680 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2681 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2682 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2683 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2684 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2685 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2686 VPValue *BlockInMask) { 2687 Instruction *Instr = Group->getInsertPos(); 2688 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2689 2690 // Prepare for the vector type of the interleaved load/store. 2691 Type *ScalarTy = getMemInstValueType(Instr); 2692 unsigned InterleaveFactor = Group->getFactor(); 2693 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2694 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2695 2696 // Prepare for the new pointers. 2697 SmallVector<Value *, 2> AddrParts; 2698 unsigned Index = Group->getIndex(Instr); 2699 2700 // TODO: extend the masked interleaved-group support to reversed access. 2701 assert((!BlockInMask || !Group->isReverse()) && 2702 "Reversed masked interleave-group not supported."); 2703 2704 // If the group is reverse, adjust the index to refer to the last vector lane 2705 // instead of the first. We adjust the index from the first vector lane, 2706 // rather than directly getting the pointer for lane VF - 1, because the 2707 // pointer operand of the interleaved access is supposed to be uniform. For 2708 // uniform instructions, we're only required to generate a value for the 2709 // first vector lane in each unroll iteration. 2710 if (Group->isReverse()) 2711 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2712 2713 for (unsigned Part = 0; Part < UF; Part++) { 2714 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2715 setDebugLocFromInst(Builder, AddrPart); 2716 2717 // Notice current instruction could be any index. Need to adjust the address 2718 // to the member of index 0. 2719 // 2720 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2721 // b = A[i]; // Member of index 0 2722 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2723 // 2724 // E.g. A[i+1] = a; // Member of index 1 2725 // A[i] = b; // Member of index 0 2726 // A[i+2] = c; // Member of index 2 (Current instruction) 2727 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2728 2729 bool InBounds = false; 2730 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2731 InBounds = gep->isInBounds(); 2732 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2733 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2734 2735 // Cast to the vector pointer type. 
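    // For example (shorthand, assuming a fixed VF of 4 and factor 3 as in the
    // comment above this function):
    //   %gep = getelementptr i32, i32* %addr, i32 -Index
    //   %ptr = bitcast i32* %gep to <12 x i32>*
    // so a single wide access below covers all interleaved members at once.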
2736 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2737 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2738 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2739 } 2740 2741 setDebugLocFromInst(Builder, Instr); 2742 Value *PoisonVec = PoisonValue::get(VecTy); 2743 2744 Value *MaskForGaps = nullptr; 2745 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2746 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2747 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2748 } 2749 2750 // Vectorize the interleaved load group. 2751 if (isa<LoadInst>(Instr)) { 2752 // For each unroll part, create a wide load for the group. 2753 SmallVector<Value *, 2> NewLoads; 2754 for (unsigned Part = 0; Part < UF; Part++) { 2755 Instruction *NewLoad; 2756 if (BlockInMask || MaskForGaps) { 2757 assert(useMaskedInterleavedAccesses(*TTI) && 2758 "masked interleaved groups are not allowed."); 2759 Value *GroupMask = MaskForGaps; 2760 if (BlockInMask) { 2761 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2762 Value *ShuffledMask = Builder.CreateShuffleVector( 2763 BlockInMaskPart, 2764 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2765 "interleaved.mask"); 2766 GroupMask = MaskForGaps 2767 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2768 MaskForGaps) 2769 : ShuffledMask; 2770 } 2771 NewLoad = 2772 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2773 GroupMask, PoisonVec, "wide.masked.vec"); 2774 } 2775 else 2776 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2777 Group->getAlign(), "wide.vec"); 2778 Group->addMetadata(NewLoad); 2779 NewLoads.push_back(NewLoad); 2780 } 2781 2782 // For each member in the group, shuffle out the appropriate data from the 2783 // wide loads. 2784 unsigned J = 0; 2785 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2786 Instruction *Member = Group->getMember(I); 2787 2788 // Skip the gaps in the group. 2789 if (!Member) 2790 continue; 2791 2792 auto StrideMask = 2793 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2794 for (unsigned Part = 0; Part < UF; Part++) { 2795 Value *StridedVec = Builder.CreateShuffleVector( 2796 NewLoads[Part], StrideMask, "strided.vec"); 2797 2798 // If this member has different type, cast the result type. 2799 if (Member->getType() != ScalarTy) { 2800 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2801 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2802 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2803 } 2804 2805 if (Group->isReverse()) 2806 StridedVec = reverseVector(StridedVec); 2807 2808 State.set(VPDefs[J], StridedVec, Part); 2809 } 2810 ++J; 2811 } 2812 return; 2813 } 2814 2815 // The sub vector type for current instruction. 2816 auto *SubVT = VectorType::get(ScalarTy, VF); 2817 2818 // Vectorize the interleaved store group. 2819 for (unsigned Part = 0; Part < UF; Part++) { 2820 // Collect the stored vector from each member. 2821 SmallVector<Value *, 4> StoredVecs; 2822 for (unsigned i = 0; i < InterleaveFactor; i++) { 2823 // Interleaved store group doesn't allow a gap, so each index has a member 2824 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2825 2826 Value *StoredVec = State.get(StoredValues[i], Part); 2827 2828 if (Group->isReverse()) 2829 StoredVec = reverseVector(StoredVec); 2830 2831 // If this member has different type, cast it to a unified type. 
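      // (Illustrative assumption: with VF = 4, a <4 x float> member of an
      // i32-typed group would be bitcast to the unified <4 x i32> sub-vector
      // type here; any same-sized element type is handled the same way.)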
2832 2833 if (StoredVec->getType() != SubVT) 2834 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2835 2836 StoredVecs.push_back(StoredVec); 2837 } 2838 2839 // Concatenate all vectors into a wide vector. 2840 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2841 2842 // Interleave the elements in the wide vector. 2843 Value *IVec = Builder.CreateShuffleVector( 2844 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2845 "interleaved.vec"); 2846 2847 Instruction *NewStoreInstr; 2848 if (BlockInMask) { 2849 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2850 Value *ShuffledMask = Builder.CreateShuffleVector( 2851 BlockInMaskPart, 2852 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2853 "interleaved.mask"); 2854 NewStoreInstr = Builder.CreateMaskedStore( 2855 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2856 } 2857 else 2858 NewStoreInstr = 2859 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2860 2861 Group->addMetadata(NewStoreInstr); 2862 } 2863 } 2864 2865 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2866 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2867 VPValue *StoredValue, VPValue *BlockInMask) { 2868 // Attempt to issue a wide load. 2869 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2870 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2871 2872 assert((LI || SI) && "Invalid Load/Store instruction"); 2873 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2874 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2875 2876 LoopVectorizationCostModel::InstWidening Decision = 2877 Cost->getWideningDecision(Instr, VF); 2878 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2879 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2880 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2881 "CM decision is not to widen the memory instruction"); 2882 2883 Type *ScalarDataTy = getMemInstValueType(Instr); 2884 2885 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2886 const Align Alignment = getLoadStoreAlignment(Instr); 2887 2888 // Determine if the pointer operand of the access is either consecutive or 2889 // reverse consecutive. 2890 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2891 bool ConsecutiveStride = 2892 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2893 bool CreateGatherScatter = 2894 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2895 2896 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2897 // gather/scatter. Otherwise Decision should have been to Scalarize. 2898 assert((ConsecutiveStride || CreateGatherScatter) && 2899 "The instruction should be scalarized"); 2900 (void)ConsecutiveStride; 2901 2902 VectorParts BlockInMaskParts(UF); 2903 bool isMaskRequired = BlockInMask; 2904 if (isMaskRequired) 2905 for (unsigned Part = 0; Part < UF; ++Part) 2906 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2907 2908 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2909 // Calculate the pointer for the specific unroll-part. 2910 GetElementPtrInst *PartPtr = nullptr; 2911 2912 bool InBounds = false; 2913 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2914 InBounds = gep->isInBounds(); 2915 if (Reverse) { 2916 // If the address is consecutive but reversed, then the 2917 // wide store needs to start at the last vector element. 
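      // Worked example (illustration only): for a fixed VF of 4 and Part == 1,
      // the two GEPs below compute Ptr + (-1 * 4) + (1 - 4) == Ptr - 7, the
      // lowest address covered by the reversed second part.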
2918 // RunTimeVF = VScale * VF.getKnownMinValue() 2919 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() 2920 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2921 // NumElt = -Part * RunTimeVF 2922 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF); 2923 // LastLane = 1 - RunTimeVF 2924 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF); 2925 PartPtr = 2926 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt)); 2927 PartPtr->setIsInBounds(InBounds); 2928 PartPtr = cast<GetElementPtrInst>( 2929 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane)); 2930 PartPtr->setIsInBounds(InBounds); 2931 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2932 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2933 } else { 2934 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2935 PartPtr = cast<GetElementPtrInst>( 2936 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2937 PartPtr->setIsInBounds(InBounds); 2938 } 2939 2940 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2941 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2942 }; 2943 2944 // Handle Stores: 2945 if (SI) { 2946 setDebugLocFromInst(Builder, SI); 2947 2948 for (unsigned Part = 0; Part < UF; ++Part) { 2949 Instruction *NewSI = nullptr; 2950 Value *StoredVal = State.get(StoredValue, Part); 2951 if (CreateGatherScatter) { 2952 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2953 Value *VectorGep = State.get(Addr, Part); 2954 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2955 MaskPart); 2956 } else { 2957 if (Reverse) { 2958 // If we store to reverse consecutive memory locations, then we need 2959 // to reverse the order of elements in the stored value. 2960 StoredVal = reverseVector(StoredVal); 2961 // We don't want to update the value in the map as it might be used in 2962 // another expression. So don't call resetVectorValue(StoredVal). 2963 } 2964 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2965 if (isMaskRequired) 2966 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2967 BlockInMaskParts[Part]); 2968 else 2969 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2970 } 2971 addMetadata(NewSI, SI); 2972 } 2973 return; 2974 } 2975 2976 // Handle loads. 2977 assert(LI && "Must have a load instruction"); 2978 setDebugLocFromInst(Builder, LI); 2979 for (unsigned Part = 0; Part < UF; ++Part) { 2980 Value *NewLI; 2981 if (CreateGatherScatter) { 2982 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2983 Value *VectorGep = State.get(Addr, Part); 2984 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2985 nullptr, "wide.masked.gather"); 2986 addMetadata(NewLI, LI); 2987 } else { 2988 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); 2989 if (isMaskRequired) 2990 NewLI = Builder.CreateMaskedLoad( 2991 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2992 "wide.masked.load"); 2993 else 2994 NewLI = 2995 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2996 2997 // Add metadata to the load, but setVectorValue to the reverse shuffle. 
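      // Illustration (assumed VF of 4): a reverse access loads ascending
      // memory as <a[i-3], a[i-2], a[i-1], a[i]>, and the reverseVector
      // shuffle below restores the per-lane iteration order
      // <a[i], a[i-1], a[i-2], a[i-3]>.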
2998 addMetadata(NewLI, LI); 2999 if (Reverse) 3000 NewLI = reverseVector(NewLI); 3001 } 3002 3003 State.set(Def, NewLI, Part); 3004 } 3005 } 3006 3007 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def, 3008 VPUser &User, 3009 const VPIteration &Instance, 3010 bool IfPredicateInstr, 3011 VPTransformState &State) { 3012 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 3013 3014 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 3015 // the first lane and part. 3016 if (isa<NoAliasScopeDeclInst>(Instr)) 3017 if (!Instance.isFirstIteration()) 3018 return; 3019 3020 setDebugLocFromInst(Builder, Instr); 3021 3022 // Does this instruction return a value ? 3023 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 3024 3025 Instruction *Cloned = Instr->clone(); 3026 if (!IsVoidRetTy) 3027 Cloned->setName(Instr->getName() + ".cloned"); 3028 3029 State.Builder.SetInsertPoint(Builder.GetInsertBlock(), 3030 Builder.GetInsertPoint()); 3031 // Replace the operands of the cloned instructions with their scalar 3032 // equivalents in the new loop. 3033 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 3034 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 3035 auto InputInstance = Instance; 3036 if (!Operand || !OrigLoop->contains(Operand) || 3037 (Cost->isUniformAfterVectorization(Operand, State.VF))) 3038 InputInstance.Lane = VPLane::getFirstLane(); 3039 auto *NewOp = State.get(User.getOperand(op), InputInstance); 3040 Cloned->setOperand(op, NewOp); 3041 } 3042 addNewMetadata(Cloned, Instr); 3043 3044 // Place the cloned scalar in the new loop. 3045 Builder.Insert(Cloned); 3046 3047 State.set(Def, Cloned, Instance); 3048 3049 // If we just cloned a new assumption, add it the assumption cache. 3050 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 3051 AC->registerAssumption(II); 3052 3053 // End if-block. 3054 if (IfPredicateInstr) 3055 PredicatedInstructions.push_back(Cloned); 3056 } 3057 3058 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 3059 Value *End, Value *Step, 3060 Instruction *DL) { 3061 BasicBlock *Header = L->getHeader(); 3062 BasicBlock *Latch = L->getLoopLatch(); 3063 // As we're just creating this loop, it's possible no latch exists 3064 // yet. If so, use the header as this will be a single block loop. 3065 if (!Latch) 3066 Latch = Header; 3067 3068 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 3069 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 3070 setDebugLocFromInst(Builder, OldInst); 3071 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 3072 3073 Builder.SetInsertPoint(Latch->getTerminator()); 3074 setDebugLocFromInst(Builder, OldInst); 3075 3076 // Create i+1 and fill the PHINode. 3077 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 3078 Induction->addIncoming(Start, L->getLoopPreheader()); 3079 Induction->addIncoming(Next, Latch); 3080 // Create the compare. 3081 Value *ICmp = Builder.CreateICmpEQ(Next, End); 3082 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 3083 3084 // Now we have two terminators. Remove the old one from the block. 
3085 Latch->getTerminator()->eraseFromParent(); 3086 3087 return Induction; 3088 } 3089 3090 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 3091 if (TripCount) 3092 return TripCount; 3093 3094 assert(L && "Create Trip Count for null loop."); 3095 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3096 // Find the loop boundaries. 3097 ScalarEvolution *SE = PSE.getSE(); 3098 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 3099 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 3100 "Invalid loop count"); 3101 3102 Type *IdxTy = Legal->getWidestInductionType(); 3103 assert(IdxTy && "No type for induction"); 3104 3105 // The exit count might have the type of i64 while the phi is i32. This can 3106 // happen if we have an induction variable that is sign extended before the 3107 // compare. The only way that we get a backedge taken count is that the 3108 // induction variable was signed and as such will not overflow. In such a case 3109 // truncation is legal. 3110 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 3111 IdxTy->getPrimitiveSizeInBits()) 3112 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 3113 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 3114 3115 // Get the total trip count from the count by adding 1. 3116 const SCEV *ExitCount = SE->getAddExpr( 3117 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 3118 3119 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 3120 3121 // Expand the trip count and place the new instructions in the preheader. 3122 // Notice that the pre-header does not change, only the loop body. 3123 SCEVExpander Exp(*SE, DL, "induction"); 3124 3125 // Count holds the overall loop count (N). 3126 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 3127 L->getLoopPreheader()->getTerminator()); 3128 3129 if (TripCount->getType()->isPointerTy()) 3130 TripCount = 3131 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 3132 L->getLoopPreheader()->getTerminator()); 3133 3134 return TripCount; 3135 } 3136 3137 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3138 if (VectorTripCount) 3139 return VectorTripCount; 3140 3141 Value *TC = getOrCreateTripCount(L); 3142 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3143 3144 Type *Ty = TC->getType(); 3145 // This is where we can make the step a runtime constant. 3146 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3147 3148 // If the tail is to be folded by masking, round the number of iterations N 3149 // up to a multiple of Step instead of rounding down. This is done by first 3150 // adding Step-1 and then rounding down. Note that it's ok if this addition 3151 // overflows: the vector induction variable will eventually wrap to zero given 3152 // that it starts at zero and its Step is a power of two; the loop will then 3153 // exit, with the last early-exit vector comparison also producing all-true. 3154 if (Cost->foldTailByMasking()) { 3155 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3156 "VF*UF must be a power of 2 when folding tail by masking"); 3157 assert(!VF.isScalable() && 3158 "Tail folding not yet supported for scalable vectors"); 3159 TC = Builder.CreateAdd( 3160 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3161 } 3162 3163 // Now we need to generate the expression for the part of the loop that the 3164 // vectorized body will execute. 
This is equal to N - (N % Step) if scalar 3165 // iterations are not required for correctness, or N - Step, otherwise. Step 3166 // is equal to the vectorization factor (number of SIMD elements) times the 3167 // unroll factor (number of SIMD instructions). 3168 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3169 3170 // There are two cases where we need to ensure (at least) the last iteration 3171 // runs in the scalar remainder loop. Thus, if the step evenly divides 3172 // the trip count, we set the remainder to be equal to the step. If the step 3173 // does not evenly divide the trip count, no adjustment is necessary since 3174 // there will already be scalar iterations. Note that the minimum iterations 3175 // check ensures that N >= Step. The cases are: 3176 // 1) If there is a non-reversed interleaved group that may speculatively 3177 // access memory out-of-bounds. 3178 // 2) If any instruction may follow a conditionally taken exit. That is, if 3179 // the loop contains multiple exiting blocks, or a single exiting block 3180 // which is not the latch. 3181 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3182 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3183 R = Builder.CreateSelect(IsZero, Step, R); 3184 } 3185 3186 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3187 3188 return VectorTripCount; 3189 } 3190 3191 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3192 const DataLayout &DL) { 3193 // Verify that V is a vector type with same number of elements as DstVTy. 3194 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3195 unsigned VF = DstFVTy->getNumElements(); 3196 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3197 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3198 Type *SrcElemTy = SrcVecTy->getElementType(); 3199 Type *DstElemTy = DstFVTy->getElementType(); 3200 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3201 "Vector elements must have same size"); 3202 3203 // Do a direct cast if element types are castable. 3204 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3205 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3206 } 3207 // V cannot be directly casted to desired vector type. 3208 // May happen when V is a floating point vector but DstVTy is a vector of 3209 // pointers or vice-versa. Handle this using a two-step bitcast using an 3210 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3211 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3212 "Only one type should be a pointer type"); 3213 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3214 "Only one type should be a floating point type"); 3215 Type *IntTy = 3216 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3217 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3218 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3219 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3220 } 3221 3222 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3223 BasicBlock *Bypass) { 3224 Value *Count = getOrCreateTripCount(L); 3225 // Reuse existing vector loop preheader for TC checks. 3226 // Note that new preheader block is generated for vector loop. 
3227 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3228 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3229 3230 // Generate code to check if the loop's trip count is less than VF * UF, or 3231 // equal to it in case a scalar epilogue is required; this implies that the 3232 // vector trip count is zero. This check also covers the case where adding one 3233 // to the backedge-taken count overflowed leading to an incorrect trip count 3234 // of zero. In this case we will also jump to the scalar loop. 3235 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3236 : ICmpInst::ICMP_ULT; 3237 3238 // If tail is to be folded, vector loop takes care of all iterations. 3239 Value *CheckMinIters = Builder.getFalse(); 3240 if (!Cost->foldTailByMasking()) { 3241 Value *Step = 3242 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3243 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3244 } 3245 // Create new preheader for vector loop. 3246 LoopVectorPreHeader = 3247 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3248 "vector.ph"); 3249 3250 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3251 DT->getNode(Bypass)->getIDom()) && 3252 "TC check is expected to dominate Bypass"); 3253 3254 // Update dominator for Bypass & LoopExit. 3255 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3256 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3257 3258 ReplaceInstWithInst( 3259 TCCheckBlock->getTerminator(), 3260 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3261 LoopBypassBlocks.push_back(TCCheckBlock); 3262 } 3263 3264 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3265 3266 BasicBlock *const SCEVCheckBlock = 3267 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); 3268 if (!SCEVCheckBlock) 3269 return nullptr; 3270 3271 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3272 (OptForSizeBasedOnProfile && 3273 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3274 "Cannot SCEV check stride or overflow when optimizing for size"); 3275 3276 3277 // Update dominator only if this is first RT check. 3278 if (LoopBypassBlocks.empty()) { 3279 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3280 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3281 } 3282 3283 LoopBypassBlocks.push_back(SCEVCheckBlock); 3284 AddedSafetyChecks = true; 3285 return SCEVCheckBlock; 3286 } 3287 3288 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, 3289 BasicBlock *Bypass) { 3290 // VPlan-native path does not do any analysis for runtime checks currently. 3291 if (EnableVPlanNativePath) 3292 return nullptr; 3293 3294 BasicBlock *const MemCheckBlock = 3295 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); 3296 3297 // Check if we generated code that checks in runtime if arrays overlap. We put 3298 // the checks into a separate block to make the more common case of few 3299 // elements faster. 
3300 if (!MemCheckBlock) 3301 return nullptr; 3302 3303 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3304 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3305 "Cannot emit memory checks when optimizing for size, unless forced " 3306 "to vectorize."); 3307 ORE->emit([&]() { 3308 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3309 L->getStartLoc(), L->getHeader()) 3310 << "Code-size may be reduced by not forcing " 3311 "vectorization, or by source-code modifications " 3312 "eliminating the need for runtime checks " 3313 "(e.g., adding 'restrict')."; 3314 }); 3315 } 3316 3317 LoopBypassBlocks.push_back(MemCheckBlock); 3318 3319 AddedSafetyChecks = true; 3320 3321 // We currently don't use LoopVersioning for the actual loop cloning but we 3322 // still use it to add the noalias metadata. 3323 LVer = std::make_unique<LoopVersioning>( 3324 *Legal->getLAI(), 3325 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3326 DT, PSE.getSE()); 3327 LVer->prepareNoAliasMetadata(); 3328 return MemCheckBlock; 3329 } 3330 3331 Value *InnerLoopVectorizer::emitTransformedIndex( 3332 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3333 const InductionDescriptor &ID) const { 3334 3335 SCEVExpander Exp(*SE, DL, "induction"); 3336 auto Step = ID.getStep(); 3337 auto StartValue = ID.getStartValue(); 3338 assert(Index->getType() == Step->getType() && 3339 "Index type does not match StepValue type"); 3340 3341 // Note: the IR at this point is broken. We cannot use SE to create any new 3342 // SCEV and then expand it, hoping that SCEV's simplification will give us 3343 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3344 // lead to various SCEV crashes. So all we can do is to use builder and rely 3345 // on InstCombine for future simplifications. Here we handle some trivial 3346 // cases only. 3347 auto CreateAdd = [&B](Value *X, Value *Y) { 3348 assert(X->getType() == Y->getType() && "Types don't match!"); 3349 if (auto *CX = dyn_cast<ConstantInt>(X)) 3350 if (CX->isZero()) 3351 return Y; 3352 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3353 if (CY->isZero()) 3354 return X; 3355 return B.CreateAdd(X, Y); 3356 }; 3357 3358 auto CreateMul = [&B](Value *X, Value *Y) { 3359 assert(X->getType() == Y->getType() && "Types don't match!"); 3360 if (auto *CX = dyn_cast<ConstantInt>(X)) 3361 if (CX->isOne()) 3362 return Y; 3363 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3364 if (CY->isOne()) 3365 return X; 3366 return B.CreateMul(X, Y); 3367 }; 3368 3369 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3370 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3371 // the DomTree is not kept up-to-date for additional blocks generated in the 3372 // vector loop. By using the header as insertion point, we guarantee that the 3373 // expanded instructions dominate all their uses. 
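  // Worked example (assumed values): for an integer IV with StartValue 5 and
  // constant Step 2, the IK_IntInduction case below yields 5 + 2 * Index; a
  // Step of -1 instead folds to 5 - Index via the early CreateSub, and the
  // CreateAdd/CreateMul helpers above drop additions of 0 and multiplies by 1.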
3374 auto GetInsertPoint = [this, &B]() { 3375 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3376 if (InsertBB != LoopVectorBody && 3377 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3378 return LoopVectorBody->getTerminator(); 3379 return &*B.GetInsertPoint(); 3380 }; 3381 3382 switch (ID.getKind()) { 3383 case InductionDescriptor::IK_IntInduction: { 3384 assert(Index->getType() == StartValue->getType() && 3385 "Index type does not match StartValue type"); 3386 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3387 return B.CreateSub(StartValue, Index); 3388 auto *Offset = CreateMul( 3389 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3390 return CreateAdd(StartValue, Offset); 3391 } 3392 case InductionDescriptor::IK_PtrInduction: { 3393 assert(isa<SCEVConstant>(Step) && 3394 "Expected constant step for pointer induction"); 3395 return B.CreateGEP( 3396 StartValue->getType()->getPointerElementType(), StartValue, 3397 CreateMul(Index, 3398 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3399 } 3400 case InductionDescriptor::IK_FpInduction: { 3401 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3402 auto InductionBinOp = ID.getInductionBinOp(); 3403 assert(InductionBinOp && 3404 (InductionBinOp->getOpcode() == Instruction::FAdd || 3405 InductionBinOp->getOpcode() == Instruction::FSub) && 3406 "Original bin op should be defined for FP induction"); 3407 3408 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3409 Value *MulExp = B.CreateFMul(StepValue, Index); 3410 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3411 "induction"); 3412 } 3413 case InductionDescriptor::IK_NoInduction: 3414 return nullptr; 3415 } 3416 llvm_unreachable("invalid enum"); 3417 } 3418 3419 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3420 LoopScalarBody = OrigLoop->getHeader(); 3421 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3422 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3423 assert(LoopExitBlock && "Must have an exit block"); 3424 assert(LoopVectorPreHeader && "Invalid loop structure"); 3425 3426 LoopMiddleBlock = 3427 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3428 LI, nullptr, Twine(Prefix) + "middle.block"); 3429 LoopScalarPreHeader = 3430 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3431 nullptr, Twine(Prefix) + "scalar.ph"); 3432 3433 // Set up branch from middle block to the exit and scalar preheader blocks. 3434 // completeLoopSkeleton will update the condition to use an iteration check, 3435 // if required to decide whether to execute the remainder. 3436 BranchInst *BrInst = 3437 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3438 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3439 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3440 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3441 3442 // We intentionally don't let SplitBlock to update LoopInfo since 3443 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3444 // LoopVectorBody is explicitly added to the correct place few lines later. 3445 LoopVectorBody = 3446 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3447 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3448 3449 // Update dominator for loop exit. 
3450 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3451 3452 // Create and register the new vector loop. 3453 Loop *Lp = LI->AllocateLoop(); 3454 Loop *ParentLoop = OrigLoop->getParentLoop(); 3455 3456 // Insert the new loop into the loop nest and register the new basic blocks 3457 // before calling any utilities such as SCEV that require valid LoopInfo. 3458 if (ParentLoop) { 3459 ParentLoop->addChildLoop(Lp); 3460 } else { 3461 LI->addTopLevelLoop(Lp); 3462 } 3463 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3464 return Lp; 3465 } 3466 3467 void InnerLoopVectorizer::createInductionResumeValues( 3468 Loop *L, Value *VectorTripCount, 3469 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3470 assert(VectorTripCount && L && "Expected valid arguments"); 3471 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3472 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3473 "Inconsistent information about additional bypass."); 3474 // We are going to resume the execution of the scalar loop. 3475 // Go over all of the induction variables that we found and fix the 3476 // PHIs that are left in the scalar version of the loop. 3477 // The starting values of PHI nodes depend on the counter of the last 3478 // iteration in the vectorized loop. 3479 // If we come from a bypass edge then we need to start from the original 3480 // start value. 3481 for (auto &InductionEntry : Legal->getInductionVars()) { 3482 PHINode *OrigPhi = InductionEntry.first; 3483 InductionDescriptor II = InductionEntry.second; 3484 3485 // Create phi nodes to merge from the backedge-taken check block. 3486 PHINode *BCResumeVal = 3487 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3488 LoopScalarPreHeader->getTerminator()); 3489 // Copy original phi DL over to the new one. 3490 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3491 Value *&EndValue = IVEndValues[OrigPhi]; 3492 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3493 if (OrigPhi == OldInduction) { 3494 // We know what the end value is. 3495 EndValue = VectorTripCount; 3496 } else { 3497 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3498 3499 // Fast-math-flags propagate from the original induction instruction. 3500 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3501 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3502 3503 Type *StepType = II.getStep()->getType(); 3504 Instruction::CastOps CastOp = 3505 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3506 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3507 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3508 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3509 EndValue->setName("ind.end"); 3510 3511 // Compute the end value for the additional bypass (if applicable). 3512 if (AdditionalBypass.first) { 3513 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3514 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3515 StepType, true); 3516 CRD = 3517 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3518 EndValueFromAdditionalBypass = 3519 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3520 EndValueFromAdditionalBypass->setName("ind.end"); 3521 } 3522 } 3523 // The new PHI merges the original incoming value, in case of a bypass, 3524 // or the value at the end of the vectorized loop. 
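    // (Illustration, assuming no tail folding and no required scalar epilogue:
    // for the primary IV with VF * UF == 8 and trip count N, the middle block
    // contributes n.vec == N - N % 8 below, while every bypass block
    // contributes the original start value.)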
3525 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3526 3527 // Fix the scalar body counter (PHI node). 3528 // The old induction's phi node in the scalar body needs the truncated 3529 // value. 3530 for (BasicBlock *BB : LoopBypassBlocks) 3531 BCResumeVal->addIncoming(II.getStartValue(), BB); 3532 3533 if (AdditionalBypass.first) 3534 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3535 EndValueFromAdditionalBypass); 3536 3537 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3538 } 3539 } 3540 3541 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3542 MDNode *OrigLoopID) { 3543 assert(L && "Expected valid loop."); 3544 3545 // The trip counts should be cached by now. 3546 Value *Count = getOrCreateTripCount(L); 3547 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3548 3549 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3550 3551 // Add a check in the middle block to see if we have completed 3552 // all of the iterations in the first vector loop. 3553 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3554 // If tail is to be folded, we know we don't need to run the remainder. 3555 if (!Cost->foldTailByMasking()) { 3556 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3557 Count, VectorTripCount, "cmp.n", 3558 LoopMiddleBlock->getTerminator()); 3559 3560 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3561 // of the corresponding compare because they may have ended up with 3562 // different line numbers and we want to avoid awkward line stepping while 3563 // debugging. Eg. if the compare has got a line number inside the loop. 3564 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3565 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3566 } 3567 3568 // Get ready to start creating new instructions into the vectorized body. 3569 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3570 "Inconsistent vector loop preheader"); 3571 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3572 3573 Optional<MDNode *> VectorizedLoopID = 3574 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3575 LLVMLoopVectorizeFollowupVectorized}); 3576 if (VectorizedLoopID.hasValue()) { 3577 L->setLoopID(VectorizedLoopID.getValue()); 3578 3579 // Do not setAlreadyVectorized if loop attributes have been defined 3580 // explicitly. 3581 return LoopVectorPreHeader; 3582 } 3583 3584 // Keep all loop hints from the original loop on the vector loop (we'll 3585 // replace the vectorizer-specific hints below). 3586 if (MDNode *LID = OrigLoop->getLoopID()) 3587 L->setLoopID(LID); 3588 3589 LoopVectorizeHints Hints(L, true, *ORE); 3590 Hints.setAlreadyVectorized(); 3591 3592 #ifdef EXPENSIVE_CHECKS 3593 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3594 LI->verify(*DT); 3595 #endif 3596 3597 return LoopVectorPreHeader; 3598 } 3599 3600 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3601 /* 3602 In this function we generate a new loop. The new loop will contain 3603 the vectorized instructions while the old loop will continue to run the 3604 scalar remainder. 3605 3606 [ ] <-- loop iteration number check. 3607 / | 3608 / v 3609 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3610 | / | 3611 | / v 3612 || [ ] <-- vector pre header. 3613 |/ | 3614 | v 3615 | [ ] \ 3616 | [ ]_| <-- vector loop. 3617 | | 3618 | v 3619 | -[ ] <--- middle-block. 
3620 | / | 3621 | / v 3622 -|- >[ ] <--- new preheader. 3623 | | 3624 | v 3625 | [ ] \ 3626 | [ ]_| <-- old scalar loop to handle remainder. 3627 \ | 3628 \ v 3629 >[ ] <-- exit block. 3630 ... 3631 */ 3632 3633 // Get the metadata of the original loop before it gets modified. 3634 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3635 3636 // Workaround! Compute the trip count of the original loop and cache it 3637 // before we start modifying the CFG. This code has a systemic problem 3638 // wherein it tries to run analysis over partially constructed IR; this is 3639 // wrong, and not simply for SCEV. The trip count of the original loop 3640 // simply happens to be prone to hitting this in practice. In theory, we 3641 // can hit the same issue for any SCEV, or ValueTracking query done during 3642 // mutation. See PR49900. 3643 getOrCreateTripCount(OrigLoop); 3644 3645 // Create an empty vector loop, and prepare basic blocks for the runtime 3646 // checks. 3647 Loop *Lp = createVectorLoopSkeleton(""); 3648 3649 // Now, compare the new count to zero. If it is zero skip the vector loop and 3650 // jump to the scalar loop. This check also covers the case where the 3651 // backedge-taken count is uint##_max: adding one to it will overflow leading 3652 // to an incorrect trip count of zero. In this (rare) case we will also jump 3653 // to the scalar loop. 3654 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3655 3656 // Generate the code to check any assumptions that we've made for SCEV 3657 // expressions. 3658 emitSCEVChecks(Lp, LoopScalarPreHeader); 3659 3660 // Generate the code that checks in runtime if arrays overlap. We put the 3661 // checks into a separate block to make the more common case of few elements 3662 // faster. 3663 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3664 3665 // Some loops have a single integer induction variable, while other loops 3666 // don't. One example is c++ iterators that often have multiple pointer 3667 // induction variables. In the code below we also support a case where we 3668 // don't have a single induction variable. 3669 // 3670 // We try to obtain an induction variable from the original loop as hard 3671 // as possible. However if we don't find one that: 3672 // - is an integer 3673 // - counts from zero, stepping by one 3674 // - is the size of the widest induction variable type 3675 // then we create a new one. 3676 OldInduction = Legal->getPrimaryInduction(); 3677 Type *IdxTy = Legal->getWidestInductionType(); 3678 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3679 // The loop step is equal to the vectorization factor (num of SIMD elements) 3680 // times the unroll factor (num of SIMD instructions). 3681 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3682 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3683 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3684 Induction = 3685 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3686 getDebugLocFromInstOrOperands(OldInduction)); 3687 3688 // Emit phis for the new starting index of the scalar loop. 3689 createInductionResumeValues(Lp, CountRoundDown); 3690 3691 return completeLoopSkeleton(Lp, OrigLoopID); 3692 } 3693 3694 // Fix up external users of the induction variable. At this point, we are 3695 // in LCSSA form, with all external PHIs that use the IV having one input value, 3696 // coming from the remainder loop. We need those PHIs to also have a correct 3697 // value for the IV when arriving directly from the middle block. 
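// For illustration (assumed loop): in `for (i = 0; i < n; ++i)`, an LCSSA phi
// using the post-increment value receives the vector trip count from the
// middle block, while a phi using `i` itself receives the recomputed value
// start + step * (vector trip count - 1).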
3698 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3699 const InductionDescriptor &II, 3700 Value *CountRoundDown, Value *EndValue, 3701 BasicBlock *MiddleBlock) { 3702 // There are two kinds of external IV usages - those that use the value 3703 // computed in the last iteration (the PHI) and those that use the penultimate 3704 // value (the value that feeds into the phi from the loop latch). 3705 // We allow both, but they, obviously, have different values. 3706 3707 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3708 3709 DenseMap<Value *, Value *> MissingVals; 3710 3711 // An external user of the last iteration's value should see the value that 3712 // the remainder loop uses to initialize its own IV. 3713 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3714 for (User *U : PostInc->users()) { 3715 Instruction *UI = cast<Instruction>(U); 3716 if (!OrigLoop->contains(UI)) { 3717 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3718 MissingVals[UI] = EndValue; 3719 } 3720 } 3721 3722 // An external user of the penultimate value need to see EndValue - Step. 3723 // The simplest way to get this is to recompute it from the constituent SCEVs, 3724 // that is Start + (Step * (CRD - 1)). 3725 for (User *U : OrigPhi->users()) { 3726 auto *UI = cast<Instruction>(U); 3727 if (!OrigLoop->contains(UI)) { 3728 const DataLayout &DL = 3729 OrigLoop->getHeader()->getModule()->getDataLayout(); 3730 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3731 3732 IRBuilder<> B(MiddleBlock->getTerminator()); 3733 3734 // Fast-math-flags propagate from the original induction instruction. 3735 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3736 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3737 3738 Value *CountMinusOne = B.CreateSub( 3739 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3740 Value *CMO = 3741 !II.getStep()->getType()->isIntegerTy() 3742 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3743 II.getStep()->getType()) 3744 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3745 CMO->setName("cast.cmo"); 3746 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3747 Escape->setName("ind.escape"); 3748 MissingVals[UI] = Escape; 3749 } 3750 } 3751 3752 for (auto &I : MissingVals) { 3753 PHINode *PHI = cast<PHINode>(I.first); 3754 // One corner case we have to handle is two IVs "chasing" each-other, 3755 // that is %IV2 = phi [...], [ %IV1, %latch ] 3756 // In this case, if IV1 has an external use, we need to avoid adding both 3757 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3758 // don't already have an incoming value for the middle block. 
3759 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3760 PHI->addIncoming(I.second, MiddleBlock); 3761 } 3762 } 3763 3764 namespace { 3765 3766 struct CSEDenseMapInfo { 3767 static bool canHandle(const Instruction *I) { 3768 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3769 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3770 } 3771 3772 static inline Instruction *getEmptyKey() { 3773 return DenseMapInfo<Instruction *>::getEmptyKey(); 3774 } 3775 3776 static inline Instruction *getTombstoneKey() { 3777 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3778 } 3779 3780 static unsigned getHashValue(const Instruction *I) { 3781 assert(canHandle(I) && "Unknown instruction!"); 3782 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3783 I->value_op_end())); 3784 } 3785 3786 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3787 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3788 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3789 return LHS == RHS; 3790 return LHS->isIdenticalTo(RHS); 3791 } 3792 }; 3793 3794 } // end anonymous namespace 3795 3796 ///Perform cse of induction variable instructions. 3797 static void cse(BasicBlock *BB) { 3798 // Perform simple cse. 3799 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3800 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3801 Instruction *In = &*I++; 3802 3803 if (!CSEDenseMapInfo::canHandle(In)) 3804 continue; 3805 3806 // Check if we can replace this instruction with any of the 3807 // visited instructions. 3808 if (Instruction *V = CSEMap.lookup(In)) { 3809 In->replaceAllUsesWith(V); 3810 In->eraseFromParent(); 3811 continue; 3812 } 3813 3814 CSEMap[In] = In; 3815 } 3816 } 3817 3818 InstructionCost 3819 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3820 bool &NeedToScalarize) const { 3821 Function *F = CI->getCalledFunction(); 3822 Type *ScalarRetTy = CI->getType(); 3823 SmallVector<Type *, 4> Tys, ScalarTys; 3824 for (auto &ArgOp : CI->arg_operands()) 3825 ScalarTys.push_back(ArgOp->getType()); 3826 3827 // Estimate cost of scalarized vector call. The source operands are assumed 3828 // to be vectors, so we need to extract individual elements from there, 3829 // execute VF scalar calls, and then gather the result into the vector return 3830 // value. 3831 InstructionCost ScalarCallCost = 3832 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3833 if (VF.isScalar()) 3834 return ScalarCallCost; 3835 3836 // Compute corresponding vector type for return value and arguments. 3837 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3838 for (Type *ScalarTy : ScalarTys) 3839 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3840 3841 // Compute costs of unpacking argument values for the scalar calls and 3842 // packing the return values to a vector. 3843 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3844 3845 InstructionCost Cost = 3846 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3847 3848 // If we can't emit a vector call for this function, then the currently found 3849 // cost is the cost we need to return. 3850 NeedToScalarize = true; 3851 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3852 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3853 3854 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3855 return Cost; 3856 3857 // If the corresponding vector cost is cheaper, return its cost. 
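  // Worked example with assumed costs: for VF = 4, a scalar call cost of 10
  // and a scalarization overhead of 12, the scalarized cost above is
  // 10 * 4 + 12 == 52; if a vector variant is found below with cost 20, we
  // return 20 and clear NeedToScalarize.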
3858 InstructionCost VectorCallCost = 3859 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3860 if (VectorCallCost < Cost) { 3861 NeedToScalarize = false; 3862 Cost = VectorCallCost; 3863 } 3864 return Cost; 3865 } 3866 3867 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3868 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3869 return Elt; 3870 return VectorType::get(Elt, VF); 3871 } 3872 3873 InstructionCost 3874 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3875 ElementCount VF) const { 3876 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3877 assert(ID && "Expected intrinsic call!"); 3878 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3879 FastMathFlags FMF; 3880 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3881 FMF = FPMO->getFastMathFlags(); 3882 3883 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); 3884 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3885 SmallVector<Type *> ParamTys; 3886 std::transform(FTy->param_begin(), FTy->param_end(), 3887 std::back_inserter(ParamTys), 3888 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3889 3890 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3891 dyn_cast<IntrinsicInst>(CI)); 3892 return TTI.getIntrinsicInstrCost(CostAttrs, 3893 TargetTransformInfo::TCK_RecipThroughput); 3894 } 3895 3896 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3897 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3898 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3899 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3900 } 3901 3902 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3903 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3904 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3905 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3906 } 3907 3908 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) { 3909 // For every instruction `I` in MinBWs, truncate the operands, create a 3910 // truncated version of `I` and reextend its result. InstCombine runs 3911 // later and will remove any ext/trunc pairs. 3912 SmallPtrSet<Value *, 4> Erased; 3913 for (const auto &KV : Cost->getMinimalBitwidths()) { 3914 // If the value wasn't vectorized, we must maintain the original scalar 3915 // type. The absence of the value from State indicates that it 3916 // wasn't vectorized. 3917 VPValue *Def = State.Plan->getVPValue(KV.first); 3918 if (!State.hasAnyVectorValue(Def)) 3919 continue; 3920 for (unsigned Part = 0; Part < UF; ++Part) { 3921 Value *I = State.get(Def, Part); 3922 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3923 continue; 3924 Type *OriginalTy = I->getType(); 3925 Type *ScalarTruncatedTy = 3926 IntegerType::get(OriginalTy->getContext(), KV.second); 3927 auto *TruncatedTy = FixedVectorType::get( 3928 ScalarTruncatedTy, 3929 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3930 if (TruncatedTy == OriginalTy) 3931 continue; 3932 3933 IRBuilder<> B(cast<Instruction>(I)); 3934 auto ShrinkOperand = [&](Value *V) -> Value * { 3935 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3936 if (ZI->getSrcTy() == TruncatedTy) 3937 return ZI->getOperand(0); 3938 return B.CreateZExtOrTrunc(V, TruncatedTy); 3939 }; 3940 3941 // The actual instruction modification depends on the instruction type, 3942 // unfortunately. 
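      // Illustration (assumed bitwidth): if KV.second is 8 and I is a
      // <4 x i32> add, both operands are shrunk (or taken from existing zexts)
      // to <4 x i8>, the add is recreated at the narrow type without its wrap
      // flags, and the result is zero-extended back to <4 x i32> further down.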
3943 Value *NewI = nullptr; 3944 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3945 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3946 ShrinkOperand(BO->getOperand(1))); 3947 3948 // Any wrapping introduced by shrinking this operation shouldn't be 3949 // considered undefined behavior. So, we can't unconditionally copy 3950 // arithmetic wrapping flags to NewI. 3951 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3952 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3953 NewI = 3954 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3955 ShrinkOperand(CI->getOperand(1))); 3956 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3957 NewI = B.CreateSelect(SI->getCondition(), 3958 ShrinkOperand(SI->getTrueValue()), 3959 ShrinkOperand(SI->getFalseValue())); 3960 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3961 switch (CI->getOpcode()) { 3962 default: 3963 llvm_unreachable("Unhandled cast!"); 3964 case Instruction::Trunc: 3965 NewI = ShrinkOperand(CI->getOperand(0)); 3966 break; 3967 case Instruction::SExt: 3968 NewI = B.CreateSExtOrTrunc( 3969 CI->getOperand(0), 3970 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3971 break; 3972 case Instruction::ZExt: 3973 NewI = B.CreateZExtOrTrunc( 3974 CI->getOperand(0), 3975 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3976 break; 3977 } 3978 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3979 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3980 ->getNumElements(); 3981 auto *O0 = B.CreateZExtOrTrunc( 3982 SI->getOperand(0), 3983 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3984 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3985 ->getNumElements(); 3986 auto *O1 = B.CreateZExtOrTrunc( 3987 SI->getOperand(1), 3988 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3989 3990 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3991 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3992 // Don't do anything with the operands, just extend the result. 3993 continue; 3994 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3995 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3996 ->getNumElements(); 3997 auto *O0 = B.CreateZExtOrTrunc( 3998 IE->getOperand(0), 3999 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4000 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 4001 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 4002 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 4003 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 4004 ->getNumElements(); 4005 auto *O0 = B.CreateZExtOrTrunc( 4006 EE->getOperand(0), 4007 FixedVectorType::get(ScalarTruncatedTy, Elements)); 4008 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 4009 } else { 4010 // If we don't know what to do, be conservative and don't do anything. 4011 continue; 4012 } 4013 4014 // Lastly, extend the result. 4015 NewI->takeName(cast<Instruction>(I)); 4016 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 4017 I->replaceAllUsesWith(Res); 4018 cast<Instruction>(I)->eraseFromParent(); 4019 Erased.insert(I); 4020 State.reset(Def, Res, Part); 4021 } 4022 } 4023 4024 // We'll have created a bunch of ZExts that are now parentless. Clean up. 4025 for (const auto &KV : Cost->getMinimalBitwidths()) { 4026 // If the value wasn't vectorized, we must maintain the original scalar 4027 // type. The absence of the value from State indicates that it 4028 // wasn't vectorized. 
4029 VPValue *Def = State.Plan->getVPValue(KV.first); 4030 if (!State.hasAnyVectorValue(Def)) 4031 continue; 4032 for (unsigned Part = 0; Part < UF; ++Part) { 4033 Value *I = State.get(Def, Part); 4034 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 4035 if (Inst && Inst->use_empty()) { 4036 Value *NewI = Inst->getOperand(0); 4037 Inst->eraseFromParent(); 4038 State.reset(Def, NewI, Part); 4039 } 4040 } 4041 } 4042 } 4043 4044 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { 4045 // Insert truncates and extends for any truncated instructions as hints to 4046 // InstCombine. 4047 if (VF.isVector()) 4048 truncateToMinimalBitwidths(State); 4049 4050 // Fix widened non-induction PHIs by setting up the PHI operands. 4051 if (OrigPHIsToFix.size()) { 4052 assert(EnableVPlanNativePath && 4053 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 4054 fixNonInductionPHIs(State); 4055 } 4056 4057 // At this point every instruction in the original loop is widened to a 4058 // vector form. Now we need to fix the recurrences in the loop. These PHI 4059 // nodes are currently empty because we did not want to introduce cycles. 4060 // This is the second stage of vectorizing recurrences. 4061 fixCrossIterationPHIs(State); 4062 4063 // Forget the original basic block. 4064 PSE.getSE()->forgetLoop(OrigLoop); 4065 4066 // Fix-up external users of the induction variables. 4067 for (auto &Entry : Legal->getInductionVars()) 4068 fixupIVUsers(Entry.first, Entry.second, 4069 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 4070 IVEndValues[Entry.first], LoopMiddleBlock); 4071 4072 fixLCSSAPHIs(State); 4073 for (Instruction *PI : PredicatedInstructions) 4074 sinkScalarOperands(&*PI); 4075 4076 // Remove redundant induction instructions. 4077 cse(LoopVectorBody); 4078 4079 // Set/update profile weights for the vector and remainder loops as original 4080 // loop iterations are now distributed among them. Note that original loop 4081 // represented by LoopScalarBody becomes remainder loop after vectorization. 4082 // 4083 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 4084 // end up getting slightly roughened result but that should be OK since 4085 // profile is not inherently precise anyway. Note also possible bypass of 4086 // vector code caused by legality checks is ignored, assigning all the weight 4087 // to the vector loop, optimistically. 4088 // 4089 // For scalable vectorization we can't know at compile time how many iterations 4090 // of the loop are handled in one vector iteration, so instead assume a pessimistic 4091 // vscale of '1'. 4092 setProfileInfoAfterUnrolling( 4093 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 4094 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 4095 } 4096 4097 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) { 4098 // In order to support recurrences we need to be able to vectorize Phi nodes. 4099 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4100 // stage #2: We now need to fix the recurrences by adding incoming edges to 4101 // the currently empty PHI nodes. At this point every instruction in the 4102 // original loop is widened to a vector form so we can use them to construct 4103 // the incoming edges. 
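// For example (illustrative shorthand): a reduction phi created in stage #1
// as
//   %vec.phi = phi <4 x i32> [ %rdx.init, %vector.ph ]
// is completed here by adding the back-edge value produced by the widened
// loop body; a first-order recurrence phi is completed analogously by
// fixFirstOrderRecurrence below.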
4104 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock(); 4105 for (VPRecipeBase &R : Header->phis()) { 4106 auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R); 4107 if (!PhiR) 4108 continue; 4109 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 4110 if (PhiR->getRecurrenceDescriptor()) { 4111 fixReduction(PhiR, State); 4112 } else if (Legal->isFirstOrderRecurrence(OrigPhi)) 4113 fixFirstOrderRecurrence(OrigPhi, State); 4114 } 4115 } 4116 4117 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi, 4118 VPTransformState &State) { 4119 // This is the second phase of vectorizing first-order recurrences. An 4120 // overview of the transformation is described below. Suppose we have the 4121 // following loop. 4122 // 4123 // for (int i = 0; i < n; ++i) 4124 // b[i] = a[i] - a[i - 1]; 4125 // 4126 // There is a first-order recurrence on "a". For this loop, the shorthand 4127 // scalar IR looks like: 4128 // 4129 // scalar.ph: 4130 // s_init = a[-1] 4131 // br scalar.body 4132 // 4133 // scalar.body: 4134 // i = phi [0, scalar.ph], [i+1, scalar.body] 4135 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4136 // s2 = a[i] 4137 // b[i] = s2 - s1 4138 // br cond, scalar.body, ... 4139 // 4140 // In this example, s1 is a recurrence because it's value depends on the 4141 // previous iteration. In the first phase of vectorization, we created a 4142 // temporary value for s1. We now complete the vectorization and produce the 4143 // shorthand vector IR shown below (for VF = 4, UF = 1). 4144 // 4145 // vector.ph: 4146 // v_init = vector(..., ..., ..., a[-1]) 4147 // br vector.body 4148 // 4149 // vector.body 4150 // i = phi [0, vector.ph], [i+4, vector.body] 4151 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4152 // v2 = a[i, i+1, i+2, i+3]; 4153 // v3 = vector(v1(3), v2(0, 1, 2)) 4154 // b[i, i+1, i+2, i+3] = v2 - v3 4155 // br cond, vector.body, middle.block 4156 // 4157 // middle.block: 4158 // x = v2(3) 4159 // br scalar.ph 4160 // 4161 // scalar.ph: 4162 // s_init = phi [x, middle.block], [a[-1], otherwise] 4163 // br scalar.body 4164 // 4165 // After execution completes the vector loop, we extract the next value of 4166 // the recurrence (x) to use as the initial value in the scalar loop. 4167 4168 // Get the original loop preheader and single loop latch. 4169 auto *Preheader = OrigLoop->getLoopPreheader(); 4170 auto *Latch = OrigLoop->getLoopLatch(); 4171 4172 // Get the initial and previous values of the scalar recurrence. 4173 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4174 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4175 4176 auto *IdxTy = Builder.getInt32Ty(); 4177 auto *One = ConstantInt::get(IdxTy, 1); 4178 4179 // Create a vector from the initial value. 4180 auto *VectorInit = ScalarInit; 4181 if (VF.isVector()) { 4182 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4183 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4184 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4185 VectorInit = Builder.CreateInsertElement( 4186 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), 4187 VectorInit, LastIdx, "vector.recur.init"); 4188 } 4189 4190 VPValue *PhiDef = State.Plan->getVPValue(Phi); 4191 VPValue *PreviousDef = State.Plan->getVPValue(Previous); 4192 // We constructed a temporary phi node in the first phase of vectorization. 4193 // This phi node will eventually be deleted. 4194 Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0))); 4195 4196 // Create a phi node for the new recurrence. 
The current value will either be 4197 // the initial value inserted into a vector or loop-varying vector value. 4198 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4199 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4200 4201 // Get the vectorized previous value of the last part UF - 1. It appears last 4202 // among all unrolled iterations, due to the order of their construction. 4203 Value *PreviousLastPart = State.get(PreviousDef, UF - 1); 4204 4205 // Find and set the insertion point after the previous value if it is an 4206 // instruction. 4207 BasicBlock::iterator InsertPt; 4208 // Note that the previous value may have been constant-folded so it is not 4209 // guaranteed to be an instruction in the vector loop. 4210 // FIXME: Loop invariant values do not form recurrences. We should deal with 4211 // them earlier. 4212 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4213 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4214 else { 4215 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4216 if (isa<PHINode>(PreviousLastPart)) 4217 // If the previous value is a phi node, we should insert after all the phi 4218 // nodes in the block containing the PHI to avoid breaking basic block 4219 // verification. Note that the basic block may be different to 4220 // LoopVectorBody, in case we predicate the loop. 4221 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4222 else 4223 InsertPt = ++PreviousInst->getIterator(); 4224 } 4225 Builder.SetInsertPoint(&*InsertPt); 4226 4227 // The vector from which to take the initial value for the current iteration 4228 // (actual or unrolled). Initially, this is the vector phi node. 4229 Value *Incoming = VecPhi; 4230 4231 // Shuffle the current and previous vector and update the vector parts. 4232 for (unsigned Part = 0; Part < UF; ++Part) { 4233 Value *PreviousPart = State.get(PreviousDef, Part); 4234 Value *PhiPart = State.get(PhiDef, Part); 4235 auto *Shuffle = VF.isVector() 4236 ? Builder.CreateVectorSplice(Incoming, PreviousPart, -1) 4237 : Incoming; 4238 PhiPart->replaceAllUsesWith(Shuffle); 4239 cast<Instruction>(PhiPart)->eraseFromParent(); 4240 State.reset(PhiDef, Shuffle, Part); 4241 Incoming = PreviousPart; 4242 } 4243 4244 // Fix the latch value of the new recurrence in the vector loop. 4245 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4246 4247 // Extract the last vector element in the middle block. This will be the 4248 // initial value for the recurrence when jumping to the scalar loop. 4249 auto *ExtractForScalar = Incoming; 4250 if (VF.isVector()) { 4251 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4252 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 4253 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 4254 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx, 4255 "vector.recur.extract"); 4256 } 4257 // Extract the second last element in the middle block if the 4258 // Phi is used outside the loop. We need to extract the phi itself 4259 // and not the last element (the phi update in the current iteration). This 4260 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4261 // when the scalar loop is not run at all. 
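// Continuing the illustrative example above (VF = 4, UF = 1), the middle
// block ends up with two extracts:
//   vector.recur.extract         = v2(3)  ; feeds the scalar loop's phi
//   vector.recur.extract.for.phi = v2(2)  ; feeds LCSSA phis in the exit block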
4262 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4263 if (VF.isVector()) {
4264 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4265 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4266 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4267 Incoming, Idx, "vector.recur.extract.for.phi");
4268 } else if (UF > 1)
4269 // When the loop is unrolled without vectorizing, initialize
4270 // ExtractForPhiUsedOutsideLoop with the unrolled part just prior to the
4271 // final value of `Incoming`. This is analogous to the vectorized case
4272 // above: extracting the second-last element when VF > 1.
4273 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4274
4275 // Fix the initial value of the original recurrence in the scalar loop.
4276 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4277 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4278 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4279 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4280 Start->addIncoming(Incoming, BB);
4281 }
4282
4283 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4284 Phi->setName("scalar.recur");
4285
4286 // Finally, fix users of the recurrence outside the loop. The users will need
4287 // either the last value of the scalar recurrence or the last value of the
4288 // vector recurrence we extracted in the middle block. Since the loop is in
4289 // LCSSA form, we just need to find all the phi nodes for the original scalar
4290 // recurrence in the exit block, and then add an edge for the middle block.
4291 // Note that LCSSA does not imply single entry when the original scalar loop
4292 // had multiple exiting edges (as we always run the last iteration in the
4293 // scalar epilogue); in that case, the exiting path through middle will be
4294 // dynamically dead and the value picked for the phi doesn't matter.
4295 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4296 if (any_of(LCSSAPhi.incoming_values(),
4297 [Phi](Value *V) { return V == Phi; }))
4298 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4299 }
4300
4301 static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4302 return EnableStrictReductions && RdxDesc.isOrdered();
4303 }
4304
4305 void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
4306 VPTransformState &State) {
4307 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4308 // Get its reduction variable descriptor.
4309 assert(Legal->isReductionVariable(OrigPhi) &&
4310 "Unable to find the reduction variable");
4311 RecurrenceDescriptor RdxDesc = *PhiR->getRecurrenceDescriptor();
4312
4313 RecurKind RK = RdxDesc.getRecurrenceKind();
4314 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4315 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4316 setDebugLocFromInst(Builder, ReductionStartValue);
4317 bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi);
4318
4319 VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4320 // This is the vector-clone of the value that leaves the loop.
4321 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4322
4323 // Wrap flags are in general invalid after vectorization, clear them.
4324 clearReductionWrapFlags(RdxDesc, State);
4325
4326 // Fix the vector-loop phi.
4327
4328 // Reductions do not have to start at zero. They can start with
4329 // any loop invariant values.
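// Illustrative shorthand (assumed add reduction, VF = 4, UF = 1) of the IR
// this function is completing:
//   vector.ph:
//     %rdx.init = insertelement <4 x i32> zeroinitializer, i32 %start, i32 0
//   vector.body:
//     %vec.phi = phi <4 x i32> [ %rdx.init, %vector.ph ], [ %vec.add, %vector.body ]
//     %vec.add = add <4 x i32> %vec.phi, %wide.load
//   middle.block:
//     %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec.add)
// The scalar remainder loop and any LCSSA phis are then wired to %rdx.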
4330 BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4331 4332 bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi && 4333 useOrderedReductions(RdxDesc); 4334 4335 for (unsigned Part = 0; Part < UF; ++Part) { 4336 if (IsOrdered && Part > 0) 4337 break; 4338 Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part); 4339 Value *Val = State.get(PhiR->getBackedgeValue(), Part); 4340 if (IsOrdered) 4341 Val = State.get(PhiR->getBackedgeValue(), UF - 1); 4342 4343 cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch); 4344 } 4345 4346 // Before each round, move the insertion point right between 4347 // the PHIs and the values we are going to write. 4348 // This allows us to write both PHINodes and the extractelement 4349 // instructions. 4350 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4351 4352 setDebugLocFromInst(Builder, LoopExitInst); 4353 4354 Type *PhiTy = OrigPhi->getType(); 4355 // If tail is folded by masking, the vector value to leave the loop should be 4356 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4357 // instead of the former. For an inloop reduction the reduction will already 4358 // be predicated, and does not need to be handled here. 4359 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4360 for (unsigned Part = 0; Part < UF; ++Part) { 4361 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); 4362 Value *Sel = nullptr; 4363 for (User *U : VecLoopExitInst->users()) { 4364 if (isa<SelectInst>(U)) { 4365 assert(!Sel && "Reduction exit feeding two selects"); 4366 Sel = U; 4367 } else 4368 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4369 } 4370 assert(Sel && "Reduction exit feeds no select"); 4371 State.reset(LoopExitInstDef, Sel, Part); 4372 4373 // If the target can create a predicated operator for the reduction at no 4374 // extra cost in the loop (for example a predicated vadd), it can be 4375 // cheaper for the select to remain in the loop than be sunk out of it, 4376 // and so use the select value for the phi instead of the old 4377 // LoopExitValue. 4378 if (PreferPredicatedReductionSelect || 4379 TTI->preferPredicatedReductionSelect( 4380 RdxDesc.getOpcode(), PhiTy, 4381 TargetTransformInfo::ReductionFlags())) { 4382 auto *VecRdxPhi = 4383 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part)); 4384 VecRdxPhi->setIncomingValueForBlock( 4385 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4386 } 4387 } 4388 } 4389 4390 // If the vector reduction can be performed in a smaller type, we truncate 4391 // then extend the loop exit value to enable InstCombine to evaluate the 4392 // entire expression in the smaller type. 4393 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 4394 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4395 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4396 Builder.SetInsertPoint( 4397 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4398 VectorParts RdxParts(UF); 4399 for (unsigned Part = 0; Part < UF; ++Part) { 4400 RdxParts[Part] = State.get(LoopExitInstDef, Part); 4401 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4402 Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 4403 : Builder.CreateZExt(Trunc, VecTy); 4404 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4405 UI != RdxParts[Part]->user_end();) 4406 if (*UI != Trunc) { 4407 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4408 RdxParts[Part] = Extnd; 4409 } else { 4410 ++UI; 4411 } 4412 } 4413 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4414 for (unsigned Part = 0; Part < UF; ++Part) { 4415 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4416 State.reset(LoopExitInstDef, RdxParts[Part], Part); 4417 } 4418 } 4419 4420 // Reduce all of the unrolled parts into a single vector. 4421 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0); 4422 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4423 4424 // The middle block terminator has already been assigned a DebugLoc here (the 4425 // OrigLoop's single latch terminator). We want the whole middle block to 4426 // appear to execute on this line because: (a) it is all compiler generated, 4427 // (b) these instructions are always executed after evaluating the latch 4428 // conditional branch, and (c) other passes may add new predecessors which 4429 // terminate on this line. This is the easiest way to ensure we don't 4430 // accidentally cause an extra step back into the loop while debugging. 4431 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4432 if (IsOrdered) 4433 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1); 4434 else { 4435 // Floating-point operations should have some FMF to enable the reduction. 4436 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4437 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4438 for (unsigned Part = 1; Part < UF; ++Part) { 4439 Value *RdxPart = State.get(LoopExitInstDef, Part); 4440 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4441 ReducedPartRdx = Builder.CreateBinOp( 4442 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4443 } else { 4444 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4445 } 4446 } 4447 } 4448 4449 // Create the reduction after the loop. Note that inloop reductions create the 4450 // target reduction in the loop using a Reduction recipe. 4451 if (VF.isVector() && !IsInLoopReductionPhi) { 4452 ReducedPartRdx = 4453 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4454 // If the reduction can be performed in a smaller type, we need to extend 4455 // the reduction to the wider type before we branch to the original loop. 4456 if (PhiTy != RdxDesc.getRecurrenceType()) 4457 ReducedPartRdx = RdxDesc.isSigned() 4458 ? Builder.CreateSExt(ReducedPartRdx, PhiTy) 4459 : Builder.CreateZExt(ReducedPartRdx, PhiTy); 4460 } 4461 4462 // Create a phi node that merges control-flow from the backedge-taken check 4463 // block and the middle block. 4464 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx", 4465 LoopScalarPreHeader->getTerminator()); 4466 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4467 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4468 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4469 4470 // Now, we need to fix the users of the reduction variable 4471 // inside and outside of the scalar remainder loop. 4472 4473 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4474 // in the exit blocks. See comment on analogous loop in 4475 // fixFirstOrderRecurrence for a more complete explaination of the logic. 
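// For example (illustrative): an exit-block phi such as
//   %sum.lcssa = phi i32 [ %sum.next, %for.body ]
// receives an extra incoming value [ %rdx, %middle.block ] so that users
// outside the loop observe the reduced vector result whenever the vector
// loop was executed.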
4476 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4477 if (any_of(LCSSAPhi.incoming_values(), 4478 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4479 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4480 4481 // Fix the scalar loop reduction variable with the incoming reduction sum 4482 // from the vector body and from the backedge value. 4483 int IncomingEdgeBlockIdx = 4484 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4485 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4486 // Pick the other block. 4487 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4488 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4489 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4490 } 4491 4492 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc, 4493 VPTransformState &State) { 4494 RecurKind RK = RdxDesc.getRecurrenceKind(); 4495 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4496 return; 4497 4498 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4499 assert(LoopExitInstr && "null loop exit instruction"); 4500 SmallVector<Instruction *, 8> Worklist; 4501 SmallPtrSet<Instruction *, 8> Visited; 4502 Worklist.push_back(LoopExitInstr); 4503 Visited.insert(LoopExitInstr); 4504 4505 while (!Worklist.empty()) { 4506 Instruction *Cur = Worklist.pop_back_val(); 4507 if (isa<OverflowingBinaryOperator>(Cur)) 4508 for (unsigned Part = 0; Part < UF; ++Part) { 4509 Value *V = State.get(State.Plan->getVPValue(Cur), Part); 4510 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4511 } 4512 4513 for (User *U : Cur->users()) { 4514 Instruction *UI = cast<Instruction>(U); 4515 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4516 Visited.insert(UI).second) 4517 Worklist.push_back(UI); 4518 } 4519 } 4520 } 4521 4522 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) { 4523 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4524 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4525 // Some phis were already hand updated by the reduction and recurrence 4526 // code above, leave them alone. 4527 continue; 4528 4529 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4530 // Non-instruction incoming values will have only one value. 4531 4532 VPLane Lane = VPLane::getFirstLane(); 4533 if (isa<Instruction>(IncomingValue) && 4534 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue), 4535 VF)) 4536 Lane = VPLane::getLastLaneForVF(VF); 4537 4538 // Can be a loop invariant incoming value or the last scalar value to be 4539 // extracted from the vectorized loop. 4540 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4541 Value *lastIncomingValue = 4542 OrigLoop->isLoopInvariant(IncomingValue) 4543 ? IncomingValue 4544 : State.get(State.Plan->getVPValue(IncomingValue), 4545 VPIteration(UF - 1, Lane)); 4546 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4547 } 4548 } 4549 4550 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4551 // The basic block and loop containing the predicated instruction. 4552 auto *PredBB = PredInst->getParent(); 4553 auto *VectorLoop = LI->getLoopFor(PredBB); 4554 4555 // Initialize a worklist with the operands of the predicated instruction. 4556 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4557 4558 // Holds instructions that we need to analyze again. An instruction may be 4559 // reanalyzed if we don't yet know if we can sink it or not. 
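// For example (illustrative): an address computation used by the predicated
// store both directly and through a bitcast cannot be sunk until the bitcast
// has been sunk first; it is queued here and re-examined on the next pass
// over the worklist.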
4560 SmallVector<Instruction *, 8> InstsToReanalyze; 4561 4562 // Returns true if a given use occurs in the predicated block. Phi nodes use 4563 // their operands in their corresponding predecessor blocks. 4564 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4565 auto *I = cast<Instruction>(U.getUser()); 4566 BasicBlock *BB = I->getParent(); 4567 if (auto *Phi = dyn_cast<PHINode>(I)) 4568 BB = Phi->getIncomingBlock( 4569 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4570 return BB == PredBB; 4571 }; 4572 4573 // Iteratively sink the scalarized operands of the predicated instruction 4574 // into the block we created for it. When an instruction is sunk, it's 4575 // operands are then added to the worklist. The algorithm ends after one pass 4576 // through the worklist doesn't sink a single instruction. 4577 bool Changed; 4578 do { 4579 // Add the instructions that need to be reanalyzed to the worklist, and 4580 // reset the changed indicator. 4581 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4582 InstsToReanalyze.clear(); 4583 Changed = false; 4584 4585 while (!Worklist.empty()) { 4586 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4587 4588 // We can't sink an instruction if it is a phi node, is already in the 4589 // predicated block, is not in the loop, or may have side effects. 4590 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4591 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4592 continue; 4593 4594 // It's legal to sink the instruction if all its uses occur in the 4595 // predicated block. Otherwise, there's nothing to do yet, and we may 4596 // need to reanalyze the instruction. 4597 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4598 InstsToReanalyze.push_back(I); 4599 continue; 4600 } 4601 4602 // Move the instruction to the beginning of the predicated block, and add 4603 // it's operands to the worklist. 4604 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4605 Worklist.insert(I->op_begin(), I->op_end()); 4606 4607 // The sinking may have enabled other instructions to be sunk, so we will 4608 // need to iterate. 4609 Changed = true; 4610 } 4611 } while (Changed); 4612 } 4613 4614 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { 4615 for (PHINode *OrigPhi : OrigPHIsToFix) { 4616 VPWidenPHIRecipe *VPPhi = 4617 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi)); 4618 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 4619 // Make sure the builder has a valid insert point. 4620 Builder.SetInsertPoint(NewPhi); 4621 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 4622 VPValue *Inc = VPPhi->getIncomingValue(i); 4623 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 4624 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 4625 } 4626 } 4627 } 4628 4629 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4630 VPUser &Operands, unsigned UF, 4631 ElementCount VF, bool IsPtrLoopInvariant, 4632 SmallBitVector &IsIndexLoopInvariant, 4633 VPTransformState &State) { 4634 // Construct a vector GEP by widening the operands of the scalar GEP as 4635 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4636 // results in a vector of pointers when at least one operand of the GEP 4637 // is vector-typed. Thus, to keep the representation compact, we only use 4638 // vector-typed operands for loop-varying values. 
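// Illustrative sketch (assumed VF = 4): a scalar GEP
//   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
// with a loop-invariant %base and a loop-varying %iv is widened to
//   %vec.gep = getelementptr inbounds i32, i32* %base, <4 x i64> %vec.iv
// which already produces a <4 x i32*> vector of pointers without
// broadcasting %base.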
4639 4640 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4641 // If we are vectorizing, but the GEP has only loop-invariant operands, 4642 // the GEP we build (by only using vector-typed operands for 4643 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4644 // produce a vector of pointers, we need to either arbitrarily pick an 4645 // operand to broadcast, or broadcast a clone of the original GEP. 4646 // Here, we broadcast a clone of the original. 4647 // 4648 // TODO: If at some point we decide to scalarize instructions having 4649 // loop-invariant operands, this special case will no longer be 4650 // required. We would add the scalarization decision to 4651 // collectLoopScalars() and teach getVectorValue() to broadcast 4652 // the lane-zero scalar value. 4653 auto *Clone = Builder.Insert(GEP->clone()); 4654 for (unsigned Part = 0; Part < UF; ++Part) { 4655 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4656 State.set(VPDef, EntryPart, Part); 4657 addMetadata(EntryPart, GEP); 4658 } 4659 } else { 4660 // If the GEP has at least one loop-varying operand, we are sure to 4661 // produce a vector of pointers. But if we are only unrolling, we want 4662 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4663 // produce with the code below will be scalar (if VF == 1) or vector 4664 // (otherwise). Note that for the unroll-only case, we still maintain 4665 // values in the vector mapping with initVector, as we do for other 4666 // instructions. 4667 for (unsigned Part = 0; Part < UF; ++Part) { 4668 // The pointer operand of the new GEP. If it's loop-invariant, we 4669 // won't broadcast it. 4670 auto *Ptr = IsPtrLoopInvariant 4671 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 4672 : State.get(Operands.getOperand(0), Part); 4673 4674 // Collect all the indices for the new GEP. If any index is 4675 // loop-invariant, we won't broadcast it. 4676 SmallVector<Value *, 4> Indices; 4677 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) { 4678 VPValue *Operand = Operands.getOperand(I); 4679 if (IsIndexLoopInvariant[I - 1]) 4680 Indices.push_back(State.get(Operand, VPIteration(0, 0))); 4681 else 4682 Indices.push_back(State.get(Operand, Part)); 4683 } 4684 4685 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4686 // but it should be a vector, otherwise. 4687 auto *NewGEP = 4688 GEP->isInBounds() 4689 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4690 Indices) 4691 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4692 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) && 4693 "NewGEP is not a pointer vector"); 4694 State.set(VPDef, NewGEP, Part); 4695 addMetadata(NewGEP, GEP); 4696 } 4697 } 4698 } 4699 4700 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, 4701 RecurrenceDescriptor *RdxDesc, 4702 VPWidenPHIRecipe *PhiR, 4703 VPTransformState &State) { 4704 PHINode *P = cast<PHINode>(PN); 4705 if (EnableVPlanNativePath) { 4706 // Currently we enter here in the VPlan-native path for non-induction 4707 // PHIs where all control flow is uniform. We simply widen these PHIs. 4708 // Create a vector phi with no operands - the vector phi operands will be 4709 // set at the end of vector code generation. 4710 Type *VecTy = (State.VF.isScalar()) 4711 ? 
PN->getType()
4712 : VectorType::get(PN->getType(), State.VF);
4713 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4714 State.set(PhiR, VecPhi, 0);
4715 OrigPHIsToFix.push_back(P);
4716
4717 return;
4718 }
4719
4720 assert(PN->getParent() == OrigLoop->getHeader() &&
4721 "Non-header phis should have been handled elsewhere");
4722
4723 VPValue *StartVPV = PhiR->getStartValue();
4724 Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4725 // In order to support recurrences we need to be able to vectorize Phi nodes.
4726 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4727 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4728 // this value when we vectorize all of the instructions that use the PHI.
4729 if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4730 Value *Iden = nullptr;
4731 bool ScalarPHI =
4732 (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4733 Type *VecTy =
4734 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4735
4736 if (RdxDesc) {
4737 assert(Legal->isReductionVariable(P) && StartV &&
4738 "RdxDesc should only be set for reduction variables; in that case "
4739 "a StartV is also required");
4740 RecurKind RK = RdxDesc->getRecurrenceKind();
4741 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4742 // MinMax reductions have the start value as their identity.
4743 if (ScalarPHI) {
4744 Iden = StartV;
4745 } else {
4746 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4747 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4748 StartV = Iden =
4749 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4750 }
4751 } else {
4752 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4753 RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
4754 Iden = IdenC;
4755
4756 if (!ScalarPHI) {
4757 Iden = ConstantVector::getSplat(State.VF, IdenC);
4758 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4759 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4760 Constant *Zero = Builder.getInt32(0);
4761 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4762 }
4763 }
4764 }
4765
4766 bool IsOrdered = State.VF.isVector() &&
4767 Cost->isInLoopReduction(cast<PHINode>(PN)) &&
4768 useOrderedReductions(*RdxDesc);
4769
4770 for (unsigned Part = 0; Part < State.UF; ++Part) {
4771 // This is phase one of vectorizing PHIs.
4772 if (Part > 0 && IsOrdered)
4773 return;
4774 Value *EntryPart = PHINode::Create(
4775 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4776 State.set(PhiR, EntryPart, Part);
4777 if (StartV) {
4778 // Make sure to add the reduction start value only to the
4779 // first unroll part.
4780 Value *StartVal = (Part == 0) ? StartV : Iden;
4781 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4782 }
4783 }
4784 return;
4785 }
4786
4787 assert(!Legal->isReductionVariable(P) &&
4788 "reductions should be handled above");
4789
4790 setDebugLocFromInst(Builder, P);
4791
4792 // This PHINode must be an induction variable.
4793 // Make sure that we know about it.
4794 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4795
4796 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4797 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4798
4799 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4800 // which can be found from the original scalar operations.
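// Illustrative sketch (assumed pointer induction with step 1, VF = 4): for
//   for (int i = 0; i < n; ++i) *p++ = 0;
// the phi for p is either scalarized into per-lane "next.gep" GEPs when it is
// scalar after vectorization, or kept as a pointer phi that is stepped by
// VF * UF * step each vector iteration, with per-part vector offsets such as
// <0, step, 2*step, 3*step> used to form vector GEPs.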
4801 switch (II.getKind()) { 4802 case InductionDescriptor::IK_NoInduction: 4803 llvm_unreachable("Unknown induction"); 4804 case InductionDescriptor::IK_IntInduction: 4805 case InductionDescriptor::IK_FpInduction: 4806 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4807 case InductionDescriptor::IK_PtrInduction: { 4808 // Handle the pointer induction variable case. 4809 assert(P->getType()->isPointerTy() && "Unexpected type."); 4810 4811 if (Cost->isScalarAfterVectorization(P, State.VF)) { 4812 // This is the normalized GEP that starts counting at zero. 4813 Value *PtrInd = 4814 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4815 // Determine the number of scalars we need to generate for each unroll 4816 // iteration. If the instruction is uniform, we only need to generate the 4817 // first lane. Otherwise, we generate all VF values. 4818 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF); 4819 assert((IsUniform || !VF.isScalable()) && 4820 "Currently unsupported for scalable vectors"); 4821 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 4822 4823 for (unsigned Part = 0; Part < UF; ++Part) { 4824 Value *PartStart = createStepForVF( 4825 Builder, ConstantInt::get(PtrInd->getType(), Part), VF); 4826 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4827 Value *Idx = Builder.CreateAdd( 4828 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 4829 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4830 Value *SclrGep = 4831 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4832 SclrGep->setName("next.gep"); 4833 State.set(PhiR, SclrGep, VPIteration(Part, Lane)); 4834 } 4835 } 4836 return; 4837 } 4838 assert(isa<SCEVConstant>(II.getStep()) && 4839 "Induction step not a SCEV constant!"); 4840 Type *PhiType = II.getStep()->getType(); 4841 4842 // Build a pointer phi 4843 Value *ScalarStartValue = II.getStartValue(); 4844 Type *ScStValueType = ScalarStartValue->getType(); 4845 PHINode *NewPointerPhi = 4846 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4847 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4848 4849 // A pointer induction, performed by using a gep 4850 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4851 Instruction *InductionLoc = LoopLatch->getTerminator(); 4852 const SCEV *ScalarStep = II.getStep(); 4853 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4854 Value *ScalarStepValue = 4855 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4856 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF); 4857 Value *NumUnrolledElems = 4858 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 4859 Value *InductionGEP = GetElementPtrInst::Create( 4860 ScStValueType->getPointerElementType(), NewPointerPhi, 4861 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 4862 InductionLoc); 4863 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4864 4865 // Create UF many actual address geps that use the pointer 4866 // phi as base and a vectorized version of the step value 4867 // (<step*0, ..., step*N>) as offset. 4868 for (unsigned Part = 0; Part < State.UF; ++Part) { 4869 Type *VecPhiType = VectorType::get(PhiType, State.VF); 4870 Value *StartOffsetScalar = 4871 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 4872 Value *StartOffset = 4873 Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 4874 // Create a vector of consecutive numbers from zero to VF. 
4875 StartOffset = 4876 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType)); 4877 4878 Value *GEP = Builder.CreateGEP( 4879 ScStValueType->getPointerElementType(), NewPointerPhi, 4880 Builder.CreateMul( 4881 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue), 4882 "vector.gep")); 4883 State.set(PhiR, GEP, Part); 4884 } 4885 } 4886 } 4887 } 4888 4889 /// A helper function for checking whether an integer division-related 4890 /// instruction may divide by zero (in which case it must be predicated if 4891 /// executed conditionally in the scalar code). 4892 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4893 /// Non-zero divisors that are non compile-time constants will not be 4894 /// converted into multiplication, so we will still end up scalarizing 4895 /// the division, but can do so w/o predication. 4896 static bool mayDivideByZero(Instruction &I) { 4897 assert((I.getOpcode() == Instruction::UDiv || 4898 I.getOpcode() == Instruction::SDiv || 4899 I.getOpcode() == Instruction::URem || 4900 I.getOpcode() == Instruction::SRem) && 4901 "Unexpected instruction"); 4902 Value *Divisor = I.getOperand(1); 4903 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4904 return !CInt || CInt->isZero(); 4905 } 4906 4907 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4908 VPUser &User, 4909 VPTransformState &State) { 4910 switch (I.getOpcode()) { 4911 case Instruction::Call: 4912 case Instruction::Br: 4913 case Instruction::PHI: 4914 case Instruction::GetElementPtr: 4915 case Instruction::Select: 4916 llvm_unreachable("This instruction is handled by a different recipe."); 4917 case Instruction::UDiv: 4918 case Instruction::SDiv: 4919 case Instruction::SRem: 4920 case Instruction::URem: 4921 case Instruction::Add: 4922 case Instruction::FAdd: 4923 case Instruction::Sub: 4924 case Instruction::FSub: 4925 case Instruction::FNeg: 4926 case Instruction::Mul: 4927 case Instruction::FMul: 4928 case Instruction::FDiv: 4929 case Instruction::FRem: 4930 case Instruction::Shl: 4931 case Instruction::LShr: 4932 case Instruction::AShr: 4933 case Instruction::And: 4934 case Instruction::Or: 4935 case Instruction::Xor: { 4936 // Just widen unops and binops. 4937 setDebugLocFromInst(Builder, &I); 4938 4939 for (unsigned Part = 0; Part < UF; ++Part) { 4940 SmallVector<Value *, 2> Ops; 4941 for (VPValue *VPOp : User.operands()) 4942 Ops.push_back(State.get(VPOp, Part)); 4943 4944 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4945 4946 if (auto *VecOp = dyn_cast<Instruction>(V)) 4947 VecOp->copyIRFlags(&I); 4948 4949 // Use this vector value for all users of the original instruction. 4950 State.set(Def, V, Part); 4951 addMetadata(V, &I); 4952 } 4953 4954 break; 4955 } 4956 case Instruction::ICmp: 4957 case Instruction::FCmp: { 4958 // Widen compares. Generate vector compares. 4959 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4960 auto *Cmp = cast<CmpInst>(&I); 4961 setDebugLocFromInst(Builder, Cmp); 4962 for (unsigned Part = 0; Part < UF; ++Part) { 4963 Value *A = State.get(User.getOperand(0), Part); 4964 Value *B = State.get(User.getOperand(1), Part); 4965 Value *C = nullptr; 4966 if (FCmp) { 4967 // Propagate fast math flags. 
4968 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4969 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4970 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4971 } else { 4972 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4973 } 4974 State.set(Def, C, Part); 4975 addMetadata(C, &I); 4976 } 4977 4978 break; 4979 } 4980 4981 case Instruction::ZExt: 4982 case Instruction::SExt: 4983 case Instruction::FPToUI: 4984 case Instruction::FPToSI: 4985 case Instruction::FPExt: 4986 case Instruction::PtrToInt: 4987 case Instruction::IntToPtr: 4988 case Instruction::SIToFP: 4989 case Instruction::UIToFP: 4990 case Instruction::Trunc: 4991 case Instruction::FPTrunc: 4992 case Instruction::BitCast: { 4993 auto *CI = cast<CastInst>(&I); 4994 setDebugLocFromInst(Builder, CI); 4995 4996 /// Vectorize casts. 4997 Type *DestTy = 4998 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); 4999 5000 for (unsigned Part = 0; Part < UF; ++Part) { 5001 Value *A = State.get(User.getOperand(0), Part); 5002 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 5003 State.set(Def, Cast, Part); 5004 addMetadata(Cast, &I); 5005 } 5006 break; 5007 } 5008 default: 5009 // This instruction is not vectorized by simple widening. 5010 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 5011 llvm_unreachable("Unhandled instruction!"); 5012 } // end of switch. 5013 } 5014 5015 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 5016 VPUser &ArgOperands, 5017 VPTransformState &State) { 5018 assert(!isa<DbgInfoIntrinsic>(I) && 5019 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 5020 setDebugLocFromInst(Builder, &I); 5021 5022 Module *M = I.getParent()->getParent()->getParent(); 5023 auto *CI = cast<CallInst>(&I); 5024 5025 SmallVector<Type *, 4> Tys; 5026 for (Value *ArgOperand : CI->arg_operands()) 5027 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 5028 5029 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 5030 5031 // The flag shows whether we use Intrinsic or a usual Call for vectorized 5032 // version of the instruction. 5033 // Is it beneficial to perform intrinsic call compared to lib call? 5034 bool NeedToScalarize = false; 5035 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 5036 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 5037 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 5038 assert((UseVectorIntrinsic || !NeedToScalarize) && 5039 "Instruction should be scalarized elsewhere."); 5040 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 5041 "Either the intrinsic cost or vector call cost must be valid"); 5042 5043 for (unsigned Part = 0; Part < UF; ++Part) { 5044 SmallVector<Value *, 4> Args; 5045 for (auto &I : enumerate(ArgOperands.operands())) { 5046 // Some intrinsics have a scalar argument - don't replace it with a 5047 // vector. 5048 Value *Arg; 5049 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 5050 Arg = State.get(I.value(), Part); 5051 else 5052 Arg = State.get(I.value(), VPIteration(0, 0)); 5053 Args.push_back(Arg); 5054 } 5055 5056 Function *VectorF; 5057 if (UseVectorIntrinsic) { 5058 // Use vector version of the intrinsic. 
5059 Type *TysForDecl[] = {CI->getType()}; 5060 if (VF.isVector()) 5061 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 5062 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 5063 assert(VectorF && "Can't retrieve vector intrinsic."); 5064 } else { 5065 // Use vector version of the function call. 5066 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 5067 #ifndef NDEBUG 5068 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 5069 "Can't create vector function."); 5070 #endif 5071 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 5072 } 5073 SmallVector<OperandBundleDef, 1> OpBundles; 5074 CI->getOperandBundlesAsDefs(OpBundles); 5075 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 5076 5077 if (isa<FPMathOperator>(V)) 5078 V->copyFastMathFlags(CI); 5079 5080 State.set(Def, V, Part); 5081 addMetadata(V, &I); 5082 } 5083 } 5084 5085 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 5086 VPUser &Operands, 5087 bool InvariantCond, 5088 VPTransformState &State) { 5089 setDebugLocFromInst(Builder, &I); 5090 5091 // The condition can be loop invariant but still defined inside the 5092 // loop. This means that we can't just use the original 'cond' value. 5093 // We have to take the 'vectorized' value and pick the first lane. 5094 // Instcombine will make this a no-op. 5095 auto *InvarCond = InvariantCond 5096 ? State.get(Operands.getOperand(0), VPIteration(0, 0)) 5097 : nullptr; 5098 5099 for (unsigned Part = 0; Part < UF; ++Part) { 5100 Value *Cond = 5101 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 5102 Value *Op0 = State.get(Operands.getOperand(1), Part); 5103 Value *Op1 = State.get(Operands.getOperand(2), Part); 5104 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 5105 State.set(VPDef, Sel, Part); 5106 addMetadata(Sel, &I); 5107 } 5108 } 5109 5110 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 5111 // We should not collect Scalars more than once per VF. Right now, this 5112 // function is called from collectUniformsAndScalars(), which already does 5113 // this check. Collecting Scalars for VF=1 does not make any sense. 5114 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 5115 "This function should not be visited twice for the same VF"); 5116 5117 SmallSetVector<Instruction *, 8> Worklist; 5118 5119 // These sets are used to seed the analysis with pointers used by memory 5120 // accesses that will remain scalar. 5121 SmallSetVector<Instruction *, 8> ScalarPtrs; 5122 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 5123 auto *Latch = TheLoop->getLoopLatch(); 5124 5125 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 5126 // The pointer operands of loads and stores will be scalar as long as the 5127 // memory access is not a gather or scatter operation. The value operand of a 5128 // store will remain scalar if the store is scalarized. 
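// For example (illustrative): in
//   for (int i = 0; i < n; ++i) a[i] = b[i];
// the consecutive loads and stores are widened, so the GEPs computing their
// addresses never need a vector of pointers and are collected as scalars,
// whereas a gather such as a[idx[i]] keeps a vector-typed pointer operand.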
5129 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5130 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5131 assert(WideningDecision != CM_Unknown &&
5132 "Widening decision should be ready at this moment");
5133 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5134 if (Ptr == Store->getValueOperand())
5135 return WideningDecision == CM_Scalarize;
5136 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5137 "Ptr is neither a value nor a pointer operand");
5138 return WideningDecision != CM_GatherScatter;
5139 };
5140
5141 // A helper that returns true if the given value is a bitcast or
5142 // getelementptr instruction contained in the loop.
5143 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5144 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5145 isa<GetElementPtrInst>(V)) &&
5146 !TheLoop->isLoopInvariant(V);
5147 };
5148
5149 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5150 if (!isa<PHINode>(Ptr) ||
5151 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5152 return false;
5153 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5154 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5155 return false;
5156 return isScalarUse(MemAccess, Ptr);
5157 };
5158
5159 // A helper that evaluates a memory access's use of a pointer. If the
5160 // pointer is the loop's pointer induction, it is inserted into Worklist.
5161 // If the use will be a scalar use, and the pointer is only used by memory
5162 // accesses, we place the pointer in ScalarPtrs. Otherwise, the pointer is
5163 // placed in PossibleNonScalarPtrs.
5164 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5165 if (isScalarPtrInduction(MemAccess, Ptr)) {
5166 Worklist.insert(cast<Instruction>(Ptr));
5167 Instruction *Update = cast<Instruction>(
5168 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5169 Worklist.insert(Update);
5170 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5171 << "\n");
5172 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5173 << "\n");
5174 return;
5175 }
5176 // We only care about bitcast and getelementptr instructions contained in
5177 // the loop.
5178 if (!isLoopVaryingBitCastOrGEP(Ptr))
5179 return;
5180
5181 // If the pointer has already been identified as scalar (e.g., if it was
5182 // also identified as uniform), there's nothing to do.
5183 auto *I = cast<Instruction>(Ptr);
5184 if (Worklist.count(I))
5185 return;
5186
5187 // If the use of the pointer will be a scalar use, and all users of the
5188 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5189 // place the pointer in PossibleNonScalarPtrs.
5190 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5191 return isa<LoadInst>(U) || isa<StoreInst>(U);
5192 }))
5193 ScalarPtrs.insert(I);
5194 else
5195 PossibleNonScalarPtrs.insert(I);
5196 };
5197
5198 // We seed the scalars analysis with two classes of instructions: (1)
5199 // instructions marked uniform-after-vectorization and (2) bitcast,
5200 // getelementptr and (pointer) phi instructions used by memory accesses
5201 // requiring a scalar use.
5202 //
5203 // (1) Add to the worklist all instructions that have been identified as
5204 // uniform-after-vectorization.
5205 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5206 5207 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5208 // memory accesses requiring a scalar use. The pointer operands of loads and 5209 // stores will be scalar as long as the memory accesses is not a gather or 5210 // scatter operation. The value operand of a store will remain scalar if the 5211 // store is scalarized. 5212 for (auto *BB : TheLoop->blocks()) 5213 for (auto &I : *BB) { 5214 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5215 evaluatePtrUse(Load, Load->getPointerOperand()); 5216 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5217 evaluatePtrUse(Store, Store->getPointerOperand()); 5218 evaluatePtrUse(Store, Store->getValueOperand()); 5219 } 5220 } 5221 for (auto *I : ScalarPtrs) 5222 if (!PossibleNonScalarPtrs.count(I)) { 5223 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5224 Worklist.insert(I); 5225 } 5226 5227 // Insert the forced scalars. 5228 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5229 // induction variable when the PHI user is scalarized. 5230 auto ForcedScalar = ForcedScalars.find(VF); 5231 if (ForcedScalar != ForcedScalars.end()) 5232 for (auto *I : ForcedScalar->second) 5233 Worklist.insert(I); 5234 5235 // Expand the worklist by looking through any bitcasts and getelementptr 5236 // instructions we've already identified as scalar. This is similar to the 5237 // expansion step in collectLoopUniforms(); however, here we're only 5238 // expanding to include additional bitcasts and getelementptr instructions. 5239 unsigned Idx = 0; 5240 while (Idx != Worklist.size()) { 5241 Instruction *Dst = Worklist[Idx++]; 5242 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5243 continue; 5244 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5245 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5246 auto *J = cast<Instruction>(U); 5247 return !TheLoop->contains(J) || Worklist.count(J) || 5248 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5249 isScalarUse(J, Src)); 5250 })) { 5251 Worklist.insert(Src); 5252 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5253 } 5254 } 5255 5256 // An induction variable will remain scalar if all users of the induction 5257 // variable and induction variable update remain scalar. 5258 for (auto &Induction : Legal->getInductionVars()) { 5259 auto *Ind = Induction.first; 5260 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5261 5262 // If tail-folding is applied, the primary induction variable will be used 5263 // to feed a vector compare. 5264 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5265 continue; 5266 5267 // Determine if all users of the induction variable are scalar after 5268 // vectorization. 5269 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5270 auto *I = cast<Instruction>(U); 5271 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5272 }); 5273 if (!ScalarInd) 5274 continue; 5275 5276 // Determine if all users of the induction variable update instruction are 5277 // scalar after vectorization. 5278 auto ScalarIndUpdate = 5279 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5280 auto *I = cast<Instruction>(U); 5281 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5282 }); 5283 if (!ScalarIndUpdate) 5284 continue; 5285 5286 // The induction variable and its update instruction will remain scalar. 
5287 Worklist.insert(Ind); 5288 Worklist.insert(IndUpdate); 5289 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5290 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5291 << "\n"); 5292 } 5293 5294 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5295 } 5296 5297 bool LoopVectorizationCostModel::isScalarWithPredication( 5298 Instruction *I, ElementCount VF) const { 5299 if (!blockNeedsPredication(I->getParent())) 5300 return false; 5301 switch(I->getOpcode()) { 5302 default: 5303 break; 5304 case Instruction::Load: 5305 case Instruction::Store: { 5306 if (!Legal->isMaskRequired(I)) 5307 return false; 5308 auto *Ptr = getLoadStorePointerOperand(I); 5309 auto *Ty = getMemInstValueType(I); 5310 // We have already decided how to vectorize this instruction, get that 5311 // result. 5312 if (VF.isVector()) { 5313 InstWidening WideningDecision = getWideningDecision(I, VF); 5314 assert(WideningDecision != CM_Unknown && 5315 "Widening decision should be ready at this moment"); 5316 return WideningDecision == CM_Scalarize; 5317 } 5318 const Align Alignment = getLoadStoreAlignment(I); 5319 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5320 isLegalMaskedGather(Ty, Alignment)) 5321 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5322 isLegalMaskedScatter(Ty, Alignment)); 5323 } 5324 case Instruction::UDiv: 5325 case Instruction::SDiv: 5326 case Instruction::SRem: 5327 case Instruction::URem: 5328 return mayDivideByZero(*I); 5329 } 5330 return false; 5331 } 5332 5333 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5334 Instruction *I, ElementCount VF) { 5335 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5336 assert(getWideningDecision(I, VF) == CM_Unknown && 5337 "Decision should not be set yet."); 5338 auto *Group = getInterleavedAccessGroup(I); 5339 assert(Group && "Must have a group."); 5340 5341 // If the instruction's allocated size doesn't equal it's type size, it 5342 // requires padding and will be scalarized. 5343 auto &DL = I->getModule()->getDataLayout(); 5344 auto *ScalarTy = getMemInstValueType(I); 5345 if (hasIrregularType(ScalarTy, DL)) 5346 return false; 5347 5348 // Check if masking is required. 5349 // A Group may need masking for one of two reasons: it resides in a block that 5350 // needs predication, or it was decided to use masking to deal with gaps. 5351 bool PredicatedAccessRequiresMasking = 5352 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5353 bool AccessWithGapsRequiresMasking = 5354 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5355 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5356 return true; 5357 5358 // If masked interleaving is required, we expect that the user/target had 5359 // enabled it, because otherwise it either wouldn't have been created or 5360 // it should have been invalidated by the CostModel. 5361 assert(useMaskedInterleavedAccesses(TTI) && 5362 "Masked interleave-groups for predicated accesses are not enabled."); 5363 5364 auto *Ty = getMemInstValueType(I); 5365 const Align Alignment = getLoadStoreAlignment(I); 5366 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5367 : TTI.isLegalMaskedStore(Ty, Alignment); 5368 } 5369 5370 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5371 Instruction *I, ElementCount VF) { 5372 // Get and ensure we have a valid memory instruction. 
5373 LoadInst *LI = dyn_cast<LoadInst>(I); 5374 StoreInst *SI = dyn_cast<StoreInst>(I); 5375 assert((LI || SI) && "Invalid memory instruction"); 5376 5377 auto *Ptr = getLoadStorePointerOperand(I); 5378 5379 // In order to be widened, the pointer should be consecutive, first of all. 5380 if (!Legal->isConsecutivePtr(Ptr)) 5381 return false; 5382 5383 // If the instruction is a store located in a predicated block, it will be 5384 // scalarized. 5385 if (isScalarWithPredication(I)) 5386 return false; 5387 5388 // If the instruction's allocated size doesn't equal it's type size, it 5389 // requires padding and will be scalarized. 5390 auto &DL = I->getModule()->getDataLayout(); 5391 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5392 if (hasIrregularType(ScalarTy, DL)) 5393 return false; 5394 5395 return true; 5396 } 5397 5398 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5399 // We should not collect Uniforms more than once per VF. Right now, 5400 // this function is called from collectUniformsAndScalars(), which 5401 // already does this check. Collecting Uniforms for VF=1 does not make any 5402 // sense. 5403 5404 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5405 "This function should not be visited twice for the same VF"); 5406 5407 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5408 // not analyze again. Uniforms.count(VF) will return 1. 5409 Uniforms[VF].clear(); 5410 5411 // We now know that the loop is vectorizable! 5412 // Collect instructions inside the loop that will remain uniform after 5413 // vectorization. 5414 5415 // Global values, params and instructions outside of current loop are out of 5416 // scope. 5417 auto isOutOfScope = [&](Value *V) -> bool { 5418 Instruction *I = dyn_cast<Instruction>(V); 5419 return (!I || !TheLoop->contains(I)); 5420 }; 5421 5422 SetVector<Instruction *> Worklist; 5423 BasicBlock *Latch = TheLoop->getLoopLatch(); 5424 5425 // Instructions that are scalar with predication must not be considered 5426 // uniform after vectorization, because that would create an erroneous 5427 // replicating region where only a single instance out of VF should be formed. 5428 // TODO: optimize such seldom cases if found important, see PR40816. 5429 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5430 if (isOutOfScope(I)) { 5431 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5432 << *I << "\n"); 5433 return; 5434 } 5435 if (isScalarWithPredication(I, VF)) { 5436 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5437 << *I << "\n"); 5438 return; 5439 } 5440 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5441 Worklist.insert(I); 5442 }; 5443 5444 // Start with the conditional branch. If the branch condition is an 5445 // instruction contained in the loop that is only used by the branch, it is 5446 // uniform. 5447 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5448 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5449 addToWorklistIfAllowed(Cmp); 5450 5451 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5452 InstWidening WideningDecision = getWideningDecision(I, VF); 5453 assert(WideningDecision != CM_Unknown && 5454 "Widening decision should be ready at this moment"); 5455 5456 // A uniform memory op is itself uniform. We exclude uniform stores 5457 // here as they demand the last lane, not the first one. 
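// For example, a load from a loop-invariant address produces the same value
// for every lane, so only lane 0 is needed; a store to a loop-invariant
// address must instead keep the value of the last lane, which is why such
// stores are excluded here.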
5458 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5459 assert(WideningDecision == CM_Scalarize); 5460 return true; 5461 } 5462 5463 return (WideningDecision == CM_Widen || 5464 WideningDecision == CM_Widen_Reverse || 5465 WideningDecision == CM_Interleave); 5466 }; 5467 5468 5469 // Returns true if Ptr is the pointer operand of a memory access instruction 5470 // I, and I is known to not require scalarization. 5471 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5472 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5473 }; 5474 5475 // Holds a list of values which are known to have at least one uniform use. 5476 // Note that there may be other uses which aren't uniform. A "uniform use" 5477 // here is something which only demands lane 0 of the unrolled iterations; 5478 // it does not imply that all lanes produce the same value (e.g. this is not 5479 // the usual meaning of uniform) 5480 SetVector<Value *> HasUniformUse; 5481 5482 // Scan the loop for instructions which are either a) known to have only 5483 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5484 for (auto *BB : TheLoop->blocks()) 5485 for (auto &I : *BB) { 5486 // If there's no pointer operand, there's nothing to do. 5487 auto *Ptr = getLoadStorePointerOperand(&I); 5488 if (!Ptr) 5489 continue; 5490 5491 // A uniform memory op is itself uniform. We exclude uniform stores 5492 // here as they demand the last lane, not the first one. 5493 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5494 addToWorklistIfAllowed(&I); 5495 5496 if (isUniformDecision(&I, VF)) { 5497 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5498 HasUniformUse.insert(Ptr); 5499 } 5500 } 5501 5502 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5503 // demanding) users. Since loops are assumed to be in LCSSA form, this 5504 // disallows uses outside the loop as well. 5505 for (auto *V : HasUniformUse) { 5506 if (isOutOfScope(V)) 5507 continue; 5508 auto *I = cast<Instruction>(V); 5509 auto UsersAreMemAccesses = 5510 llvm::all_of(I->users(), [&](User *U) -> bool { 5511 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5512 }); 5513 if (UsersAreMemAccesses) 5514 addToWorklistIfAllowed(I); 5515 } 5516 5517 // Expand Worklist in topological order: whenever a new instruction 5518 // is added , its users should be already inside Worklist. It ensures 5519 // a uniform instruction will only be used by uniform instructions. 5520 unsigned idx = 0; 5521 while (idx != Worklist.size()) { 5522 Instruction *I = Worklist[idx++]; 5523 5524 for (auto OV : I->operand_values()) { 5525 // isOutOfScope operands cannot be uniform instructions. 5526 if (isOutOfScope(OV)) 5527 continue; 5528 // First order recurrence Phi's should typically be considered 5529 // non-uniform. 5530 auto *OP = dyn_cast<PHINode>(OV); 5531 if (OP && Legal->isFirstOrderRecurrence(OP)) 5532 continue; 5533 // If all the users of the operand are uniform, then add the 5534 // operand into the uniform worklist. 5535 auto *OI = cast<Instruction>(OV); 5536 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5537 auto *J = cast<Instruction>(U); 5538 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5539 })) 5540 addToWorklistIfAllowed(OI); 5541 } 5542 } 5543 5544 // For an instruction to be added into Worklist above, all its users inside 5545 // the loop should also be in Worklist. 
However, this condition cannot be 5546 // true for phi nodes that form a cyclic dependence. We must process phi 5547 // nodes separately. An induction variable will remain uniform if all users 5548 // of the induction variable and induction variable update remain uniform. 5549 // The code below handles both pointer and non-pointer induction variables. 5550 for (auto &Induction : Legal->getInductionVars()) { 5551 auto *Ind = Induction.first; 5552 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5553 5554 // Determine if all users of the induction variable are uniform after 5555 // vectorization. 5556 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5557 auto *I = cast<Instruction>(U); 5558 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5559 isVectorizedMemAccessUse(I, Ind); 5560 }); 5561 if (!UniformInd) 5562 continue; 5563 5564 // Determine if all users of the induction variable update instruction are 5565 // uniform after vectorization. 5566 auto UniformIndUpdate = 5567 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5568 auto *I = cast<Instruction>(U); 5569 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5570 isVectorizedMemAccessUse(I, IndUpdate); 5571 }); 5572 if (!UniformIndUpdate) 5573 continue; 5574 5575 // The induction variable and its update instruction will remain uniform. 5576 addToWorklistIfAllowed(Ind); 5577 addToWorklistIfAllowed(IndUpdate); 5578 } 5579 5580 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5581 } 5582 5583 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5584 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5585 5586 if (Legal->getRuntimePointerChecking()->Need) { 5587 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5588 "runtime pointer checks needed. Enable vectorization of this " 5589 "loop with '#pragma clang loop vectorize(enable)' when " 5590 "compiling with -Os/-Oz", 5591 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5592 return true; 5593 } 5594 5595 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5596 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5597 "runtime SCEV checks needed. Enable vectorization of this " 5598 "loop with '#pragma clang loop vectorize(enable)' when " 5599 "compiling with -Os/-Oz", 5600 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5601 return true; 5602 } 5603 5604 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5605 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5606 reportVectorizationFailure("Runtime stride check for small trip count", 5607 "runtime stride == 1 checks needed. Enable vectorization of " 5608 "this loop without such check by compiling with -Os/-Oz", 5609 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5610 return true; 5611 } 5612 5613 return false; 5614 } 5615 5616 ElementCount 5617 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { 5618 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { 5619 reportVectorizationInfo( 5620 "Disabling scalable vectorization, because target does not " 5621 "support scalable vectors.", 5622 "ScalableVectorsUnsupported", ORE, TheLoop); 5623 return ElementCount::getScalable(0); 5624 } 5625 5626 auto MaxScalableVF = ElementCount::getScalable( 5627 std::numeric_limits<ElementCount::ScalarTy>::max()); 5628 5629 // Disable scalable vectorization if the loop contains unsupported reductions. 
5630 // Test that the loop-vectorizer can legalize all operations for this MaxVF. 5631 // FIXME: While for scalable vectors this is currently sufficient, this should 5632 // be replaced by a more detailed mechanism that filters out specific VFs, 5633 // instead of invalidating vectorization for a whole set of VFs based on the 5634 // MaxVF. 5635 if (!canVectorizeReductions(MaxScalableVF)) { 5636 reportVectorizationInfo( 5637 "Scalable vectorization not supported for the reduction " 5638 "operations found in this loop.", 5639 "ScalableVFUnfeasible", ORE, TheLoop); 5640 return ElementCount::getScalable(0); 5641 } 5642 5643 if (Legal->isSafeForAnyVectorWidth()) 5644 return MaxScalableVF; 5645 5646 // Limit MaxScalableVF by the maximum safe dependence distance. 5647 Optional<unsigned> MaxVScale = TTI.getMaxVScale(); 5648 MaxScalableVF = ElementCount::getScalable( 5649 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0); 5650 if (!MaxScalableVF) 5651 reportVectorizationInfo( 5652 "Max legal vector width too small, scalable vectorization " 5653 "unfeasible.", 5654 "ScalableVFUnfeasible", ORE, TheLoop); 5655 5656 return MaxScalableVF; 5657 } 5658 5659 ElementCount 5660 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5661 ElementCount UserVF) { 5662 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5663 unsigned SmallestType, WidestType; 5664 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5665 5666 // Get the maximum safe dependence distance in bits computed by LAA. 5667 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5668 // the memory accesses that is most restrictive (involved in the smallest 5669 // dependence distance). 5670 unsigned MaxSafeElements = 5671 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); 5672 5673 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); 5674 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); 5675 5676 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF 5677 << ".\n"); 5678 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF 5679 << ".\n"); 5680 5681 // First analyze the UserVF, fall back if the UserVF should be ignored. 5682 if (UserVF) { 5683 auto MaxSafeUserVF = 5684 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; 5685 5686 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) 5687 return UserVF; 5688 5689 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); 5690 5691 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 5692 // is better to ignore the hint and let the compiler choose a suitable VF. 5693 if (!UserVF.isScalable()) { 5694 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5695 << " is unsafe, clamping to max safe VF=" 5696 << MaxSafeFixedVF << ".\n"); 5697 ORE->emit([&]() { 5698 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", 5699 TheLoop->getStartLoc(), 5700 TheLoop->getHeader()) 5701 << "User-specified vectorization factor " 5702 << ore::NV("UserVectorizationFactor", UserVF) 5703 << " is unsafe, clamping to maximum safe vectorization factor " 5704 << ore::NV("VectorizationFactor", MaxSafeFixedVF); 5705 }); 5706 return MaxSafeFixedVF; 5707 } 5708 5709 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF 5710 << " is unsafe. 
Ignoring scalable UserVF.\n");
5711 ORE->emit([&]() {
5712 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5713 TheLoop->getStartLoc(),
5714 TheLoop->getHeader())
5715 << "User-specified vectorization factor "
5716 << ore::NV("UserVectorizationFactor", UserVF)
5717 << " is unsafe. Ignoring the hint to let the compiler pick a "
5718 "suitable VF.";
5719 });
5720 }
5721
5722 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5723 << " / " << WidestType << " bits.\n");
5724
5725 ElementCount MaxFixedVF = ElementCount::getFixed(1);
5726 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5727 WidestType, MaxSafeFixedVF))
5728 MaxFixedVF = MaxVF;
5729
5730 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5731 WidestType, MaxSafeScalableVF))
5732 // FIXME: Return scalable VF as well (to be added in future patch).
5733 if (MaxVF.isScalable())
5734 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5735 << "\n");
5736
5737 return MaxFixedVF;
5738 }
5739
5740 Optional<ElementCount>
5741 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5742 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5743 // TODO: It may be useful to do this, since it's still likely to be dynamically
5744 // uniform if the target can skip it.
5745 reportVectorizationFailure(
5746 "Not inserting runtime ptr check for divergent target",
5747 "runtime pointer checks needed. Not enabled for divergent target",
5748 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5749 return None;
5750 }
5751
5752 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5753 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5754 if (TC == 1) {
5755 reportVectorizationFailure("Single iteration (non) loop",
5756 "loop trip count is one, irrelevant for vectorization",
5757 "SingleIterationLoop", ORE, TheLoop);
5758 return None;
5759 }
5760
5761 switch (ScalarEpilogueStatus) {
5762 case CM_ScalarEpilogueAllowed:
5763 return computeFeasibleMaxVF(TC, UserVF);
5764 case CM_ScalarEpilogueNotAllowedUsePredicate:
5765 LLVM_FALLTHROUGH;
5766 case CM_ScalarEpilogueNotNeededUsePredicate:
5767 LLVM_DEBUG(
5768 dbgs() << "LV: vector predicate hint/switch found.\n"
5769 << "LV: Not allowing scalar epilogue, creating predicated "
5770 << "vector loop.\n");
5771 break;
5772 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5773 // fallthrough as a special case of OptForSize
5774 case CM_ScalarEpilogueNotAllowedOptSize:
5775 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5776 LLVM_DEBUG(
5777 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5778 else
5779 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5780 << "count.\n");
5781
5782 // Bail if runtime checks are required, which are not good when optimising
5783 // for size.
5784 if (runtimeChecksRequired())
5785 return None;
5786
5787 break;
5788 }
5789
5790 // The only loops we can vectorize without a scalar epilogue are loops with
5791 // a bottom-test and a single exiting block. We'd have to handle the fact
5792 // that not every instruction executes on the last iteration. This will
5793 // require a lane mask which varies through the vector loop body. (TODO)
5794 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5795 // If there was a tail-folding hint/switch, but we can't fold the tail by
5796 // masking, fall back to a vectorization with a scalar epilogue.
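// This mirrors the fallback further below for the case where
// prepareToFoldTailByMasking() fails; here tail folding cannot even be
// attempted because the loop lacks the required bottom-test / single
// exiting block structure.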
5797 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5798 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5799 "scalar epilogue instead.\n");
5800 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5801 return computeFeasibleMaxVF(TC, UserVF);
5802 }
5803 return None;
5804 }
5805
5806 // Now try the tail folding
5807
5808 // Invalidate interleave groups that require an epilogue if we can't mask
5809 // the interleave-group.
5810 if (!useMaskedInterleavedAccesses(TTI)) {
5811 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5812 "No decisions should have been taken at this point");
5813 // Note: There is no need to invalidate any cost modeling decisions here, as
5814 // none were taken so far.
5815 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5816 }
5817
5818 ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5819 assert(!MaxVF.isScalable() &&
5820 "Scalable vectors do not yet support tail folding");
5821 assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5822 "MaxVF must be a power of 2");
5823 unsigned MaxVFtimesIC =
5824 UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5825 // Avoid tail folding if the trip count is known to be a multiple of any VF we
5826 // chose.
5827 ScalarEvolution *SE = PSE.getSE();
5828 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5829 const SCEV *ExitCount = SE->getAddExpr(
5830 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5831 const SCEV *Rem = SE->getURemExpr(
5832 SE->applyLoopGuards(ExitCount, TheLoop),
5833 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5834 if (Rem->isZero()) {
5835 // Accept MaxVF if we do not have a tail.
5836 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5837 return MaxVF;
5838 }
5839
5840 // If we don't know the precise trip count, or if the trip count that we
5841 // found modulo the vectorization factor is not zero, try to fold the tail
5842 // by masking.
5843 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5844 if (Legal->prepareToFoldTailByMasking()) {
5845 FoldTailByMasking = true;
5846 return MaxVF;
5847 }
5848
5849 // If there was a tail-folding hint/switch, but we can't fold the tail by
5850 // masking, fall back to a vectorization with a scalar epilogue.
5851 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5852 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5853 "scalar epilogue instead.\n");
5854 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5855 return MaxVF;
5856 }
5857
5858 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5859 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5860 return None;
5861 }
5862
5863 if (TC == 0) {
5864 reportVectorizationFailure(
5865 "Unable to calculate the loop count due to complex control flow",
5866 "unable to calculate the loop count due to complex control flow",
5867 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5868 return None;
5869 }
5870
5871 reportVectorizationFailure(
5872 "Cannot optimize for size and vectorize at the same time.",
5873 "cannot optimize for size and vectorize at the same time. 
" 5874 "Enable vectorization of this loop with '#pragma clang loop " 5875 "vectorize(enable)' when compiling with -Os/-Oz", 5876 "NoTailLoopWithOptForSize", ORE, TheLoop); 5877 return None; 5878 } 5879 5880 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 5881 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, 5882 const ElementCount &MaxSafeVF) { 5883 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 5884 TypeSize WidestRegister = TTI.getRegisterBitWidth( 5885 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 5886 : TargetTransformInfo::RGK_FixedWidthVector); 5887 5888 // Convenience function to return the minimum of two ElementCounts. 5889 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 5890 assert((LHS.isScalable() == RHS.isScalable()) && 5891 "Scalable flags must match"); 5892 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; 5893 }; 5894 5895 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 5896 // Note that both WidestRegister and WidestType may not be a powers of 2. 5897 auto MaxVectorElementCount = ElementCount::get( 5898 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType), 5899 ComputeScalableMaxVF); 5900 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 5901 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5902 << (MaxVectorElementCount * WidestType) << " bits.\n"); 5903 5904 if (!MaxVectorElementCount) { 5905 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5906 return ElementCount::getFixed(1); 5907 } 5908 5909 const auto TripCountEC = ElementCount::getFixed(ConstTripCount); 5910 if (ConstTripCount && 5911 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && 5912 isPowerOf2_32(ConstTripCount)) { 5913 // We need to clamp the VF to be the ConstTripCount. There is no point in 5914 // choosing a higher viable VF as done in the loop below. If 5915 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when 5916 // the TC is less than or equal to the known number of lanes. 5917 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5918 << ConstTripCount << "\n"); 5919 return TripCountEC; 5920 } 5921 5922 ElementCount MaxVF = MaxVectorElementCount; 5923 if (TTI.shouldMaximizeVectorBandwidth() || 5924 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5925 auto MaxVectorElementCountMaxBW = ElementCount::get( 5926 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), 5927 ComputeScalableMaxVF); 5928 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 5929 5930 // Collect all viable vectorization factors larger than the default MaxVF 5931 // (i.e. MaxVectorElementCount). 5932 SmallVector<ElementCount, 8> VFs; 5933 for (ElementCount VS = MaxVectorElementCount * 2; 5934 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 5935 VFs.push_back(VS); 5936 5937 // For each VF calculate its register usage. 5938 auto RUs = calculateRegisterUsage(VFs); 5939 5940 // Select the largest VF which doesn't require more registers than existing 5941 // ones. 
5942 for (int i = RUs.size() - 1; i >= 0; --i) {
5943 bool Selected = true;
5944 for (auto &pair : RUs[i].MaxLocalUsers) {
5945 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5946 if (pair.second > TargetNumRegisters)
5947 Selected = false;
5948 }
5949 if (Selected) {
5950 MaxVF = VFs[i];
5951 break;
5952 }
5953 }
5954 if (ElementCount MinVF =
5955 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5956 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5957 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5958 << ") with target's minimum: " << MinVF << '\n');
5959 MaxVF = MinVF;
5960 }
5961 }
5962 }
5963 return MaxVF;
5964 }
5965
5966 bool LoopVectorizationCostModel::isMoreProfitable(
5967 const VectorizationFactor &A, const VectorizationFactor &B) const {
5968 InstructionCost::CostType CostA = *A.Cost.getValue();
5969 InstructionCost::CostType CostB = *B.Cost.getValue();
5970
5971 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5972
5973 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5974 MaxTripCount) {
5975 // If we are folding the tail and the trip count is a known (possibly small)
5976 // constant, the trip count will be rounded up to an integer number of
5977 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5978 // which we compare directly. When not folding the tail, the total cost will
5979 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5980 // approximated with the per-lane cost below instead of using the trip count
5981 // as here.
5982 int64_t RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5983 int64_t RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5984 return RTCostA < RTCostB;
5985 }
5986
5987 // To avoid the need for FP division:
5988 // (CostA / A.Width) < (CostB / B.Width)
5989 // <=> (CostA * B.Width) < (CostB * A.Width)
5990 return (CostA * B.Width.getKnownMinValue()) <
5991 (CostB * A.Width.getKnownMinValue());
5992 }
5993
5994 VectorizationFactor
5995 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5996 // FIXME: This can be fixed for scalable vectors later, because at this stage
5997 // the LoopVectorizer will only consider vectorizing a loop with scalable
5998 // vectors when the loop has a hint to enable vectorization for a given VF.
5999 assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
6000
6001 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
6002 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
6003 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
6004
6005 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
6006 VectorizationFactor ChosenFactor = ScalarCost;
6007
6008 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
6009 if (ForceVectorization && MaxVF.isVector()) {
6010 // Ignore scalar width, because the user explicitly wants vectorization.
6011 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
6012 // evaluation.
6013 ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
6014 }
6015
6016 for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
6017 i *= 2) {
6018 // Notice that the vector loop needs to be executed fewer times, so
6019 // we need to divide the cost of the vector loop by the width of
6020 // the vector elements.
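// For illustration: with a scalar iteration cost of 8, a candidate of width 4
// costing 20 per vector iteration has a per-lane cost of 5, so
// isMoreProfitable() will prefer it over the scalar loop.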
6021 VectorizationCostTy C = expectedCost(i); 6022 6023 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 6024 VectorizationFactor Candidate(i, C.first); 6025 LLVM_DEBUG( 6026 dbgs() << "LV: Vector loop of width " << i << " costs: " 6027 << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue()) 6028 << ".\n"); 6029 6030 if (!C.second && !ForceVectorization) { 6031 LLVM_DEBUG( 6032 dbgs() << "LV: Not considering vector loop of width " << i 6033 << " because it will not generate any vector instructions.\n"); 6034 continue; 6035 } 6036 6037 // If profitable add it to ProfitableVF list. 6038 if (isMoreProfitable(Candidate, ScalarCost)) 6039 ProfitableVFs.push_back(Candidate); 6040 6041 if (isMoreProfitable(Candidate, ChosenFactor)) 6042 ChosenFactor = Candidate; 6043 } 6044 6045 if (!EnableCondStoresVectorization && NumPredStores) { 6046 reportVectorizationFailure("There are conditional stores.", 6047 "store that is conditionally executed prevents vectorization", 6048 "ConditionalStore", ORE, TheLoop); 6049 ChosenFactor = ScalarCost; 6050 } 6051 6052 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 6053 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue()) 6054 dbgs() 6055 << "LV: Vectorization seems to be not beneficial, " 6056 << "but was forced by a user.\n"); 6057 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 6058 return ChosenFactor; 6059 } 6060 6061 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 6062 const Loop &L, ElementCount VF) const { 6063 // Cross iteration phis such as reductions need special handling and are 6064 // currently unsupported. 6065 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 6066 return Legal->isFirstOrderRecurrence(&Phi) || 6067 Legal->isReductionVariable(&Phi); 6068 })) 6069 return false; 6070 6071 // Phis with uses outside of the loop require special handling and are 6072 // currently unsupported. 6073 for (auto &Entry : Legal->getInductionVars()) { 6074 // Look for uses of the value of the induction at the last iteration. 6075 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 6076 for (User *U : PostInc->users()) 6077 if (!L.contains(cast<Instruction>(U))) 6078 return false; 6079 // Look for uses of penultimate value of the induction. 6080 for (User *U : Entry.first->users()) 6081 if (!L.contains(cast<Instruction>(U))) 6082 return false; 6083 } 6084 6085 // Induction variables that are widened require special handling that is 6086 // currently not supported. 6087 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 6088 return !(this->isScalarAfterVectorization(Entry.first, VF) || 6089 this->isProfitableToScalarize(Entry.first, VF)); 6090 })) 6091 return false; 6092 6093 return true; 6094 } 6095 6096 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 6097 const ElementCount VF) const { 6098 // FIXME: We need a much better cost-model to take different parameters such 6099 // as register pressure, code size increase and cost of extra branches into 6100 // account. For now we apply a very crude heuristic and only consider loops 6101 // with vectorization factors larger than a certain value. 6102 // We also consider epilogue vectorization unprofitable for targets that don't 6103 // consider interleaving beneficial (eg. MVE). 
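// For example, if EpilogueVectorizationMinVF were 16, a main loop VF of 8
// would be rejected below while VF=16 would be accepted, provided the target
// reports a max interleave factor greater than 1.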
6104 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 6105 return false; 6106 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 6107 return true; 6108 return false; 6109 } 6110 6111 VectorizationFactor 6112 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 6113 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 6114 VectorizationFactor Result = VectorizationFactor::Disabled(); 6115 if (!EnableEpilogueVectorization) { 6116 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 6117 return Result; 6118 } 6119 6120 if (!isScalarEpilogueAllowed()) { 6121 LLVM_DEBUG( 6122 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 6123 "allowed.\n";); 6124 return Result; 6125 } 6126 6127 // FIXME: This can be fixed for scalable vectors later, because at this stage 6128 // the LoopVectorizer will only consider vectorizing a loop with scalable 6129 // vectors when the loop has a hint to enable vectorization for a given VF. 6130 if (MainLoopVF.isScalable()) { 6131 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 6132 "yet supported.\n"); 6133 return Result; 6134 } 6135 6136 // Not really a cost consideration, but check for unsupported cases here to 6137 // simplify the logic. 6138 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 6139 LLVM_DEBUG( 6140 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 6141 "not a supported candidate.\n";); 6142 return Result; 6143 } 6144 6145 if (EpilogueVectorizationForceVF > 1) { 6146 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 6147 if (LVP.hasPlanWithVFs( 6148 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 6149 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 6150 else { 6151 LLVM_DEBUG( 6152 dbgs() 6153 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 6154 return Result; 6155 } 6156 } 6157 6158 if (TheLoop->getHeader()->getParent()->hasOptSize() || 6159 TheLoop->getHeader()->getParent()->hasMinSize()) { 6160 LLVM_DEBUG( 6161 dbgs() 6162 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 6163 return Result; 6164 } 6165 6166 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 6167 return Result; 6168 6169 for (auto &NextVF : ProfitableVFs) 6170 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 6171 (Result.Width.getFixedValue() == 1 || 6172 isMoreProfitable(NextVF, Result)) && 6173 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 6174 Result = NextVF; 6175 6176 if (Result != VectorizationFactor::Disabled()) 6177 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 6178 << Result.Width.getFixedValue() << "\n";); 6179 return Result; 6180 } 6181 6182 std::pair<unsigned, unsigned> 6183 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 6184 unsigned MinWidth = -1U; 6185 unsigned MaxWidth = 8; 6186 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 6187 6188 // For each block. 6189 for (BasicBlock *BB : TheLoop->blocks()) { 6190 // For each instruction in the loop. 6191 for (Instruction &I : BB->instructionsWithoutDebug()) { 6192 Type *T = I.getType(); 6193 6194 // Skip ignored values. 6195 if (ValuesToIgnore.count(&I)) 6196 continue; 6197 6198 // Only examine Loads, Stores and PHINodes. 6199 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 6200 continue; 6201 6202 // Examine PHI nodes that are reduction variables. Update the type to 6203 // account for the recurrence type. 
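// The recurrence type may be narrower than the PHI's type when the reduction
// can be performed in a smaller width (e.g. an i8 sum accumulated through an
// i32 PHI); using it here keeps the computed width range from being inflated
// by the wider accumulator.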
6204 if (auto *PN = dyn_cast<PHINode>(&I)) { 6205 if (!Legal->isReductionVariable(PN)) 6206 continue; 6207 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 6208 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 6209 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 6210 RdxDesc.getRecurrenceType(), 6211 TargetTransformInfo::ReductionFlags())) 6212 continue; 6213 T = RdxDesc.getRecurrenceType(); 6214 } 6215 6216 // Examine the stored values. 6217 if (auto *ST = dyn_cast<StoreInst>(&I)) 6218 T = ST->getValueOperand()->getType(); 6219 6220 // Ignore loaded pointer types and stored pointer types that are not 6221 // vectorizable. 6222 // 6223 // FIXME: The check here attempts to predict whether a load or store will 6224 // be vectorized. We only know this for certain after a VF has 6225 // been selected. Here, we assume that if an access can be 6226 // vectorized, it will be. We should also look at extending this 6227 // optimization to non-pointer types. 6228 // 6229 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 6230 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 6231 continue; 6232 6233 MinWidth = std::min(MinWidth, 6234 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6235 MaxWidth = std::max(MaxWidth, 6236 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 6237 } 6238 } 6239 6240 return {MinWidth, MaxWidth}; 6241 } 6242 6243 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 6244 unsigned LoopCost) { 6245 // -- The interleave heuristics -- 6246 // We interleave the loop in order to expose ILP and reduce the loop overhead. 6247 // There are many micro-architectural considerations that we can't predict 6248 // at this level. For example, frontend pressure (on decode or fetch) due to 6249 // code size, or the number and capabilities of the execution ports. 6250 // 6251 // We use the following heuristics to select the interleave count: 6252 // 1. If the code has reductions, then we interleave to break the cross 6253 // iteration dependency. 6254 // 2. If the loop is really small, then we interleave to reduce the loop 6255 // overhead. 6256 // 3. We don't interleave if we think that we will spill registers to memory 6257 // due to the increased register pressure. 6258 6259 if (!isScalarEpilogueAllowed()) 6260 return 1; 6261 6262 // We used the distance for the interleave count. 6263 if (Legal->getMaxSafeDepDistBytes() != -1U) 6264 return 1; 6265 6266 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 6267 const bool HasReductions = !Legal->getReductionVars().empty(); 6268 // Do not interleave loops with a relatively small known or estimated trip 6269 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 6270 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 6271 // because with the above conditions interleaving can expose ILP and break 6272 // cross iteration dependences for reductions. 6273 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6274 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6275 return 1; 6276 6277 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6278 // We divide by these constants so assume that we have at least one 6279 // instruction that uses at least one register. 6280 for (auto& pair : R.MaxLocalUsers) { 6281 pair.second = std::max(pair.second, 1U); 6282 } 6283 6284 // We calculate the interleave count using the following formula. 
6285 // Subtract the number of loop invariants from the number of available 6286 // registers. These registers are used by all of the interleaved instances. 6287 // Next, divide the remaining registers by the number of registers that is 6288 // required by the loop, in order to estimate how many parallel instances 6289 // fit without causing spills. All of this is rounded down if necessary to be 6290 // a power of two. We want power of two interleave count to simplify any 6291 // addressing operations or alignment considerations. 6292 // We also want power of two interleave counts to ensure that the induction 6293 // variable of the vector loop wraps to zero, when tail is folded by masking; 6294 // this currently happens when OptForSize, in which case IC is set to 1 above. 6295 unsigned IC = UINT_MAX; 6296 6297 for (auto& pair : R.MaxLocalUsers) { 6298 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6299 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6300 << " registers of " 6301 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6302 if (VF.isScalar()) { 6303 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6304 TargetNumRegisters = ForceTargetNumScalarRegs; 6305 } else { 6306 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6307 TargetNumRegisters = ForceTargetNumVectorRegs; 6308 } 6309 unsigned MaxLocalUsers = pair.second; 6310 unsigned LoopInvariantRegs = 0; 6311 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6312 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6313 6314 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6315 // Don't count the induction variable as interleaved. 6316 if (EnableIndVarRegisterHeur) { 6317 TmpIC = 6318 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6319 std::max(1U, (MaxLocalUsers - 1))); 6320 } 6321 6322 IC = std::min(IC, TmpIC); 6323 } 6324 6325 // Clamp the interleave ranges to reasonable counts. 6326 unsigned MaxInterleaveCount = 6327 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6328 6329 // Check if the user has overridden the max. 6330 if (VF.isScalar()) { 6331 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6332 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6333 } else { 6334 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6335 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6336 } 6337 6338 // If trip count is known or estimated compile time constant, limit the 6339 // interleave count to be less than the trip count divided by VF, provided it 6340 // is at least 1. 6341 // 6342 // For scalable vectors we can't know if interleaving is beneficial. It may 6343 // not be beneficial for small loops if none of the lanes in the second vector 6344 // iterations is enabled. However, for larger loops, there is likely to be a 6345 // similar benefit as for fixed-width vectors. For now, we choose to leave 6346 // the InterleaveCount as if vscale is '1', although if some information about 6347 // the vector is known (e.g. min vector size), we can make a better decision. 6348 if (BestKnownTC) { 6349 MaxInterleaveCount = 6350 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6351 // Make sure MaxInterleaveCount is greater than 0. 
6352 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6353 } 6354 6355 assert(MaxInterleaveCount > 0 && 6356 "Maximum interleave count must be greater than 0"); 6357 6358 // Clamp the calculated IC to be between the 1 and the max interleave count 6359 // that the target and trip count allows. 6360 if (IC > MaxInterleaveCount) 6361 IC = MaxInterleaveCount; 6362 else 6363 // Make sure IC is greater than 0. 6364 IC = std::max(1u, IC); 6365 6366 assert(IC > 0 && "Interleave count must be greater than 0."); 6367 6368 // If we did not calculate the cost for VF (because the user selected the VF) 6369 // then we calculate the cost of VF here. 6370 if (LoopCost == 0) { 6371 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6372 LoopCost = *expectedCost(VF).first.getValue(); 6373 } 6374 6375 assert(LoopCost && "Non-zero loop cost expected"); 6376 6377 // Interleave if we vectorized this loop and there is a reduction that could 6378 // benefit from interleaving. 6379 if (VF.isVector() && HasReductions) { 6380 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6381 return IC; 6382 } 6383 6384 // Note that if we've already vectorized the loop we will have done the 6385 // runtime check and so interleaving won't require further checks. 6386 bool InterleavingRequiresRuntimePointerCheck = 6387 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6388 6389 // We want to interleave small loops in order to reduce the loop overhead and 6390 // potentially expose ILP opportunities. 6391 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6392 << "LV: IC is " << IC << '\n' 6393 << "LV: VF is " << VF << '\n'); 6394 const bool AggressivelyInterleaveReductions = 6395 TTI.enableAggressiveInterleaving(HasReductions); 6396 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6397 // We assume that the cost overhead is 1 and we use the cost model 6398 // to estimate the cost of the loop and interleave until the cost of the 6399 // loop overhead is about 5% of the cost of the loop. 6400 unsigned SmallIC = 6401 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6402 6403 // Interleave until store/load ports (estimated by max interleave count) are 6404 // saturated. 6405 unsigned NumStores = Legal->getNumStores(); 6406 unsigned NumLoads = Legal->getNumLoads(); 6407 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6408 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6409 6410 // If we have a scalar reduction (vector reductions are already dealt with 6411 // by this point), we can increase the critical path length if the loop 6412 // we're interleaving is inside another loop. Limit, by default to 2, so the 6413 // critical path only gets increased by one reduction operation. 6414 if (HasReductions && TheLoop->getLoopDepth() > 1) { 6415 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 6416 SmallIC = std::min(SmallIC, F); 6417 StoresIC = std::min(StoresIC, F); 6418 LoadsIC = std::min(LoadsIC, F); 6419 } 6420 6421 if (EnableLoadStoreRuntimeInterleave && 6422 std::max(StoresIC, LoadsIC) > SmallIC) { 6423 LLVM_DEBUG( 6424 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 6425 return std::max(StoresIC, LoadsIC); 6426 } 6427 6428 // If there are scalar reductions and TTI has enabled aggressive 6429 // interleaving for reductions, we will interleave to expose ILP. 
6430 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6431 AggressivelyInterleaveReductions) {
6432 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6433 // Interleave no less than SmallIC but not as aggressively as the normal IC
6434 // to satisfy the rare situation when resources are too limited.
6435 return std::max(IC / 2, SmallIC);
6436 } else {
6437 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6438 return SmallIC;
6439 }
6440 }
6441
6442 // Interleave if this is a large loop (small loops are already dealt with by
6443 // this point) that could benefit from interleaving.
6444 if (AggressivelyInterleaveReductions) {
6445 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6446 return IC;
6447 }
6448
6449 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6450 return 1;
6451 }
6452
6453 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6454 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6455 // This function calculates the register usage by measuring the highest number
6456 // of values that are alive at a single location. Obviously, this is a very
6457 // rough estimation. We scan the loop in topological order and
6458 // assign a number to each instruction. We use RPO to ensure that defs are
6459 // met before their users. We assume that each instruction that has in-loop
6460 // users starts an interval. We record every time that an in-loop value is
6461 // used, so we have a list of the first and last occurrences of each
6462 // instruction. Next, we transpose this data structure into a multi-map that
6463 // holds the list of intervals that *end* at a specific location. This multi-
6464 // map allows us to perform a linear search. We scan the instructions linearly
6465 // and record each time that a new interval starts, by placing it in a set.
6466 // If we find this value in the multi-map then we remove it from the set.
6467 // The max register usage is the maximum size of the set.
6468 // We also search for instructions that are defined outside the loop, but are
6469 // used inside the loop. We need this number separately from the max-interval
6470 // usage number because, when we unroll, loop-invariant values do not take
6471 // up more registers.
6472 LoopBlocksDFS DFS(TheLoop);
6473 DFS.perform(LI);
6474
6475 RegisterUsage RU;
6476
6477 // Each 'key' in the map opens a new interval. The values
6478 // of the map are the index of the 'last seen' usage of the
6479 // instruction that is the key.
6480 using IntervalMap = DenseMap<Instruction *, unsigned>;
6481
6482 // Maps instruction to its index.
6483 SmallVector<Instruction *, 64> IdxToInstr;
6484 // Marks the end of each interval.
6485 IntervalMap EndPoint;
6486 // Saves the list of instructions that are used in the loop.
6487 SmallPtrSet<Instruction *, 8> Ends;
6488 // Saves the list of values that are used in the loop but are
6489 // defined outside the loop, such as arguments and constants.
6490 SmallPtrSet<Value *, 8> LoopInvariants;
6491
6492 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6493 for (Instruction &I : BB->instructionsWithoutDebug()) {
6494 IdxToInstr.push_back(&I);
6495
6496 // Save the end location of each USE.
6497 for (Value *U : I.operands()) {
6498 auto *Instr = dyn_cast<Instruction>(U);
6499
6500 // Ignore non-instruction values such as arguments, constants, etc.
6501 if (!Instr)
6502 continue;
6503
6504 // If this instruction is outside the loop then record it and continue.
6505 if (!TheLoop->contains(Instr)) { 6506 LoopInvariants.insert(Instr); 6507 continue; 6508 } 6509 6510 // Overwrite previous end points. 6511 EndPoint[Instr] = IdxToInstr.size(); 6512 Ends.insert(Instr); 6513 } 6514 } 6515 } 6516 6517 // Saves the list of intervals that end with the index in 'key'. 6518 using InstrList = SmallVector<Instruction *, 2>; 6519 DenseMap<unsigned, InstrList> TransposeEnds; 6520 6521 // Transpose the EndPoints to a list of values that end at each index. 6522 for (auto &Interval : EndPoint) 6523 TransposeEnds[Interval.second].push_back(Interval.first); 6524 6525 SmallPtrSet<Instruction *, 8> OpenIntervals; 6526 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6527 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6528 6529 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6530 6531 // A lambda that gets the register usage for the given type and VF. 6532 const auto &TTICapture = TTI; 6533 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6534 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6535 return 0U; 6536 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6537 }; 6538 6539 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6540 Instruction *I = IdxToInstr[i]; 6541 6542 // Remove all of the instructions that end at this location. 6543 InstrList &List = TransposeEnds[i]; 6544 for (Instruction *ToRemove : List) 6545 OpenIntervals.erase(ToRemove); 6546 6547 // Ignore instructions that are never used within the loop. 6548 if (!Ends.count(I)) 6549 continue; 6550 6551 // Skip ignored values. 6552 if (ValuesToIgnore.count(I)) 6553 continue; 6554 6555 // For each VF find the maximum usage of registers. 6556 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6557 // Count the number of live intervals. 6558 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6559 6560 if (VFs[j].isScalar()) { 6561 for (auto Inst : OpenIntervals) { 6562 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6563 if (RegUsage.find(ClassID) == RegUsage.end()) 6564 RegUsage[ClassID] = 1; 6565 else 6566 RegUsage[ClassID] += 1; 6567 } 6568 } else { 6569 collectUniformsAndScalars(VFs[j]); 6570 for (auto Inst : OpenIntervals) { 6571 // Skip ignored values for VF > 1. 6572 if (VecValuesToIgnore.count(Inst)) 6573 continue; 6574 if (isScalarAfterVectorization(Inst, VFs[j])) { 6575 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6576 if (RegUsage.find(ClassID) == RegUsage.end()) 6577 RegUsage[ClassID] = 1; 6578 else 6579 RegUsage[ClassID] += 1; 6580 } else { 6581 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6582 if (RegUsage.find(ClassID) == RegUsage.end()) 6583 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6584 else 6585 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6586 } 6587 } 6588 } 6589 6590 for (auto& pair : RegUsage) { 6591 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6592 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6593 else 6594 MaxUsages[j][pair.first] = pair.second; 6595 } 6596 } 6597 6598 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6599 << OpenIntervals.size() << '\n'); 6600 6601 // Add the current instruction to the list of open intervals. 
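// Note that I is only inserted after the usage above has been recorded, so an
// instruction's own interval does not count against the estimate at the point
// where it is defined.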
6602 OpenIntervals.insert(I); 6603 } 6604 6605 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6606 SmallMapVector<unsigned, unsigned, 4> Invariant; 6607 6608 for (auto Inst : LoopInvariants) { 6609 unsigned Usage = 6610 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6611 unsigned ClassID = 6612 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6613 if (Invariant.find(ClassID) == Invariant.end()) 6614 Invariant[ClassID] = Usage; 6615 else 6616 Invariant[ClassID] += Usage; 6617 } 6618 6619 LLVM_DEBUG({ 6620 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6621 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6622 << " item\n"; 6623 for (const auto &pair : MaxUsages[i]) { 6624 dbgs() << "LV(REG): RegisterClass: " 6625 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6626 << " registers\n"; 6627 } 6628 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6629 << " item\n"; 6630 for (const auto &pair : Invariant) { 6631 dbgs() << "LV(REG): RegisterClass: " 6632 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6633 << " registers\n"; 6634 } 6635 }); 6636 6637 RU.LoopInvariantRegs = Invariant; 6638 RU.MaxLocalUsers = MaxUsages[i]; 6639 RUs[i] = RU; 6640 } 6641 6642 return RUs; 6643 } 6644 6645 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6646 // TODO: Cost model for emulated masked load/store is completely 6647 // broken. This hack guides the cost model to use an artificially 6648 // high enough value to practically disable vectorization with such 6649 // operations, except where previously deployed legality hack allowed 6650 // using very low cost values. This is to avoid regressions coming simply 6651 // from moving "masked load/store" check from legality to cost model. 6652 // Masked Load/Gather emulation was previously never allowed. 6653 // Limited number of Masked Store/Scatter emulation was allowed. 6654 assert(isPredicatedInst(I, ElementCount::getFixed(1)) && 6655 "Expecting a scalar emulated instruction"); 6656 return isa<LoadInst>(I) || 6657 (isa<StoreInst>(I) && 6658 NumPredStores > NumberOfStoresToPredicate); 6659 } 6660 6661 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6662 // If we aren't vectorizing the loop, or if we've already collected the 6663 // instructions to scalarize, there's nothing to do. Collection may already 6664 // have occurred if we have a user-selected VF and are now computing the 6665 // expected cost for interleaving. 6666 if (VF.isScalar() || VF.isZero() || 6667 InstsToScalarize.find(VF) != InstsToScalarize.end()) 6668 return; 6669 6670 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 6671 // not profitable to scalarize any instructions, the presence of VF in the 6672 // map will indicate that we've analyzed it already. 6673 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 6674 6675 // Find all the instructions that are scalar with predication in the loop and 6676 // determine if it would be better to not if-convert the blocks they are in. 6677 // If so, we also record the instructions to scalarize. 6678 for (BasicBlock *BB : TheLoop->blocks()) { 6679 if (!blockNeedsPredication(BB)) 6680 continue; 6681 for (Instruction &I : *BB) 6682 if (isScalarWithPredication(&I)) { 6683 ScalarCostsTy ScalarCosts; 6684 // Do not apply discount logic if hacked cost is needed 6685 // for emulated masked memrefs. 
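// computePredInstDiscount() accumulates VectorCost - ScalarCost over the
// single-use chain feeding the predicated instruction, so a non-negative
// result means scalarizing that chain is at least as cheap as keeping it
// vectorized.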
6686 if (!useEmulatedMaskMemRefHack(&I) && 6687 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 6688 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 6689 // Remember that BB will remain after vectorization. 6690 PredicatedBBsAfterVectorization.insert(BB); 6691 } 6692 } 6693 } 6694 6695 int LoopVectorizationCostModel::computePredInstDiscount( 6696 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 6697 assert(!isUniformAfterVectorization(PredInst, VF) && 6698 "Instruction marked uniform-after-vectorization will be predicated"); 6699 6700 // Initialize the discount to zero, meaning that the scalar version and the 6701 // vector version cost the same. 6702 InstructionCost Discount = 0; 6703 6704 // Holds instructions to analyze. The instructions we visit are mapped in 6705 // ScalarCosts. Those instructions are the ones that would be scalarized if 6706 // we find that the scalar version costs less. 6707 SmallVector<Instruction *, 8> Worklist; 6708 6709 // Returns true if the given instruction can be scalarized. 6710 auto canBeScalarized = [&](Instruction *I) -> bool { 6711 // We only attempt to scalarize instructions forming a single-use chain 6712 // from the original predicated block that would otherwise be vectorized. 6713 // Although not strictly necessary, we give up on instructions we know will 6714 // already be scalar to avoid traversing chains that are unlikely to be 6715 // beneficial. 6716 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 6717 isScalarAfterVectorization(I, VF)) 6718 return false; 6719 6720 // If the instruction is scalar with predication, it will be analyzed 6721 // separately. We ignore it within the context of PredInst. 6722 if (isScalarWithPredication(I)) 6723 return false; 6724 6725 // If any of the instruction's operands are uniform after vectorization, 6726 // the instruction cannot be scalarized. This prevents, for example, a 6727 // masked load from being scalarized. 6728 // 6729 // We assume we will only emit a value for lane zero of an instruction 6730 // marked uniform after vectorization, rather than VF identical values. 6731 // Thus, if we scalarize an instruction that uses a uniform, we would 6732 // create uses of values corresponding to the lanes we aren't emitting code 6733 // for. This behavior can be changed by allowing getScalarValue to clone 6734 // the lane zero values for uniforms rather than asserting. 6735 for (Use &U : I->operands()) 6736 if (auto *J = dyn_cast<Instruction>(U.get())) 6737 if (isUniformAfterVectorization(J, VF)) 6738 return false; 6739 6740 // Otherwise, we can scalarize the instruction. 6741 return true; 6742 }; 6743 6744 // Compute the expected cost discount from scalarizing the entire expression 6745 // feeding the predicated instruction. We currently only consider expressions 6746 // that are single-use instruction chains. 6747 Worklist.push_back(PredInst); 6748 while (!Worklist.empty()) { 6749 Instruction *I = Worklist.pop_back_val(); 6750 6751 // If we've already analyzed the instruction, there's nothing to do. 6752 if (ScalarCosts.find(I) != ScalarCosts.end()) 6753 continue; 6754 6755 // Compute the cost of the vector instruction. Note that this cost already 6756 // includes the scalarization overhead of the predicated instruction. 6757 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6758 6759 // Compute the cost of the scalarized instruction. 
This cost is the cost of 6760 // the instruction as if it wasn't if-converted and instead remained in the 6761 // predicated block. We will scale this cost by block probability after 6762 // computing the scalarization overhead. 6763 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6764 InstructionCost ScalarCost = 6765 VF.getKnownMinValue() * 6766 getInstructionCost(I, ElementCount::getFixed(1)).first; 6767 6768 // Compute the scalarization overhead of needed insertelement instructions 6769 // and phi nodes. 6770 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6771 ScalarCost += TTI.getScalarizationOverhead( 6772 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6773 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6774 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6775 ScalarCost += 6776 VF.getKnownMinValue() * 6777 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6778 } 6779 6780 // Compute the scalarization overhead of needed extractelement 6781 // instructions. For each of the instruction's operands, if the operand can 6782 // be scalarized, add it to the worklist; otherwise, account for the 6783 // overhead. 6784 for (Use &U : I->operands()) 6785 if (auto *J = dyn_cast<Instruction>(U.get())) { 6786 assert(VectorType::isValidElementType(J->getType()) && 6787 "Instruction has non-scalar type"); 6788 if (canBeScalarized(J)) 6789 Worklist.push_back(J); 6790 else if (needsExtract(J, VF)) { 6791 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6792 ScalarCost += TTI.getScalarizationOverhead( 6793 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6794 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6795 } 6796 } 6797 6798 // Scale the total scalar cost by block probability. 6799 ScalarCost /= getReciprocalPredBlockProb(); 6800 6801 // Compute the discount. A non-negative discount means the vector version 6802 // of the instruction costs more, and scalarizing would be beneficial. 6803 Discount += VectorCost - ScalarCost; 6804 ScalarCosts[I] = ScalarCost; 6805 } 6806 6807 return *Discount.getValue(); 6808 } 6809 6810 LoopVectorizationCostModel::VectorizationCostTy 6811 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6812 VectorizationCostTy Cost; 6813 6814 // For each block. 6815 for (BasicBlock *BB : TheLoop->blocks()) { 6816 VectorizationCostTy BlockCost; 6817 6818 // For each instruction in the old loop. 6819 for (Instruction &I : BB->instructionsWithoutDebug()) { 6820 // Skip ignored values. 6821 if (ValuesToIgnore.count(&I) || 6822 (VF.isVector() && VecValuesToIgnore.count(&I))) 6823 continue; 6824 6825 VectorizationCostTy C = getInstructionCost(&I, VF); 6826 6827 // Check if we should override the cost. 6828 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6829 C.first = InstructionCost(ForceTargetInstructionCost); 6830 6831 BlockCost.first += C.first; 6832 BlockCost.second |= C.second; 6833 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6834 << " for VF " << VF << " For instruction: " << I 6835 << '\n'); 6836 } 6837 6838 // If we are vectorizing a predicated block, it will have been 6839 // if-converted. This means that the block's instructions (aside from 6840 // stores and instructions that may divide by zero) will now be 6841 // unconditionally executed. For the scalar case, we may not always execute 6842 // the predicated block, if it is an if-else block. Thus, scale the block's 6843 // cost by the probability of executing it. 
blockNeedsPredication from 6844 // Legal is used so as to not include all blocks in tail folded loops. 6845 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6846 BlockCost.first /= getReciprocalPredBlockProb(); 6847 6848 Cost.first += BlockCost.first; 6849 Cost.second |= BlockCost.second; 6850 } 6851 6852 return Cost; 6853 } 6854 6855 /// Gets Address Access SCEV after verifying that the access pattern 6856 /// is loop invariant except the induction variable dependence. 6857 /// 6858 /// This SCEV can be sent to the Target in order to estimate the address 6859 /// calculation cost. 6860 static const SCEV *getAddressAccessSCEV( 6861 Value *Ptr, 6862 LoopVectorizationLegality *Legal, 6863 PredicatedScalarEvolution &PSE, 6864 const Loop *TheLoop) { 6865 6866 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6867 if (!Gep) 6868 return nullptr; 6869 6870 // We are looking for a gep with all loop invariant indices except for one 6871 // which should be an induction variable. 6872 auto SE = PSE.getSE(); 6873 unsigned NumOperands = Gep->getNumOperands(); 6874 for (unsigned i = 1; i < NumOperands; ++i) { 6875 Value *Opd = Gep->getOperand(i); 6876 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6877 !Legal->isInductionVariable(Opd)) 6878 return nullptr; 6879 } 6880 6881 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6882 return PSE.getSCEV(Ptr); 6883 } 6884 6885 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6886 return Legal->hasStride(I->getOperand(0)) || 6887 Legal->hasStride(I->getOperand(1)); 6888 } 6889 6890 InstructionCost 6891 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6892 ElementCount VF) { 6893 assert(VF.isVector() && 6894 "Scalarization cost of instruction implies vectorization."); 6895 if (VF.isScalable()) 6896 return InstructionCost::getInvalid(); 6897 6898 Type *ValTy = getMemInstValueType(I); 6899 auto SE = PSE.getSE(); 6900 6901 unsigned AS = getLoadStoreAddressSpace(I); 6902 Value *Ptr = getLoadStorePointerOperand(I); 6903 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6904 6905 // Figure out whether the access is strided and get the stride value 6906 // if it's known in compile time 6907 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6908 6909 // Get the cost of the scalar memory instruction and address computation. 6910 InstructionCost Cost = 6911 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6912 6913 // Don't pass *I here, since it is scalar but will actually be part of a 6914 // vectorized loop where the user of it is a vectorized instruction. 6915 const Align Alignment = getLoadStoreAlignment(I); 6916 Cost += VF.getKnownMinValue() * 6917 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6918 AS, TTI::TCK_RecipThroughput); 6919 6920 // Get the overhead of the extractelement and insertelement instructions 6921 // we might create due to scalarization. 6922 Cost += getScalarizationOverhead(I, VF); 6923 6924 // If we have a predicated load/store, it will need extra i1 extracts and 6925 // conditional branches, but may not be executed for each vector lane. Scale 6926 // the cost by the probability of executing the predicated block. 
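  // For illustration: assuming the default reciprocal block probability of 2
  // (i.e. the predicated block is expected to run on about half of the
  // iterations), a scalarized predicated access of cost C is charged roughly
  // C / 2, plus the i1 lane extracts and branch cost added below.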
6927 if (isPredicatedInst(I, ElementCount::getFixed(1))) { 6928 Cost /= getReciprocalPredBlockProb(); 6929 6930 // Add the cost of an i1 extract and a branch 6931 auto *Vec_i1Ty = 6932 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6933 Cost += TTI.getScalarizationOverhead( 6934 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 6935 /*Insert=*/false, /*Extract=*/true); 6936 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); 6937 6938 if (useEmulatedMaskMemRefHack(I)) 6939 // Artificially setting to a high enough value to practically disable 6940 // vectorization with such operations. 6941 Cost = 3000000; 6942 } 6943 6944 return Cost; 6945 } 6946 6947 InstructionCost 6948 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6949 ElementCount VF) { 6950 Type *ValTy = getMemInstValueType(I); 6951 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6952 Value *Ptr = getLoadStorePointerOperand(I); 6953 unsigned AS = getLoadStoreAddressSpace(I); 6954 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6955 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6956 6957 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6958 "Stride should be 1 or -1 for consecutive memory access"); 6959 const Align Alignment = getLoadStoreAlignment(I); 6960 InstructionCost Cost = 0; 6961 if (Legal->isMaskRequired(I)) 6962 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6963 CostKind); 6964 else 6965 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6966 CostKind, I); 6967 6968 bool Reverse = ConsecutiveStride < 0; 6969 if (Reverse) 6970 Cost += 6971 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 6972 return Cost; 6973 } 6974 6975 InstructionCost 6976 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6977 ElementCount VF) { 6978 assert(Legal->isUniformMemOp(*I)); 6979 6980 Type *ValTy = getMemInstValueType(I); 6981 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6982 const Align Alignment = getLoadStoreAlignment(I); 6983 unsigned AS = getLoadStoreAddressSpace(I); 6984 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6985 if (isa<LoadInst>(I)) { 6986 return TTI.getAddressComputationCost(ValTy) + 6987 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6988 CostKind) + 6989 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6990 } 6991 StoreInst *SI = cast<StoreInst>(I); 6992 6993 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6994 return TTI.getAddressComputationCost(ValTy) + 6995 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6996 CostKind) + 6997 (isLoopInvariantStoreValue 6998 ? 
0 6999 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 7000 VF.getKnownMinValue() - 1)); 7001 } 7002 7003 InstructionCost 7004 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 7005 ElementCount VF) { 7006 Type *ValTy = getMemInstValueType(I); 7007 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7008 const Align Alignment = getLoadStoreAlignment(I); 7009 const Value *Ptr = getLoadStorePointerOperand(I); 7010 7011 return TTI.getAddressComputationCost(VectorTy) + 7012 TTI.getGatherScatterOpCost( 7013 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 7014 TargetTransformInfo::TCK_RecipThroughput, I); 7015 } 7016 7017 InstructionCost 7018 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 7019 ElementCount VF) { 7020 // TODO: Once we have support for interleaving with scalable vectors 7021 // we can calculate the cost properly here. 7022 if (VF.isScalable()) 7023 return InstructionCost::getInvalid(); 7024 7025 Type *ValTy = getMemInstValueType(I); 7026 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 7027 unsigned AS = getLoadStoreAddressSpace(I); 7028 7029 auto Group = getInterleavedAccessGroup(I); 7030 assert(Group && "Fail to get an interleaved access group."); 7031 7032 unsigned InterleaveFactor = Group->getFactor(); 7033 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 7034 7035 // Holds the indices of existing members in an interleaved load group. 7036 // An interleaved store group doesn't need this as it doesn't allow gaps. 7037 SmallVector<unsigned, 4> Indices; 7038 if (isa<LoadInst>(I)) { 7039 for (unsigned i = 0; i < InterleaveFactor; i++) 7040 if (Group->getMember(i)) 7041 Indices.push_back(i); 7042 } 7043 7044 // Calculate the cost of the whole interleaved group. 7045 bool UseMaskForGaps = 7046 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 7047 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 7048 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 7049 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); 7050 7051 if (Group->isReverse()) { 7052 // TODO: Add support for reversed masked interleaved access. 7053 assert(!Legal->isMaskRequired(I) && 7054 "Reverse masked interleaved access not supported."); 7055 Cost += 7056 Group->getNumMembers() * 7057 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); 7058 } 7059 return Cost; 7060 } 7061 7062 InstructionCost LoopVectorizationCostModel::getReductionPatternCost( 7063 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { 7064 // Early exit for no inloop reductions 7065 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 7066 return InstructionCost::getInvalid(); 7067 auto *VectorTy = cast<VectorType>(Ty); 7068 7069 // We are looking for a pattern of, and finding the minimal acceptable cost: 7070 // reduce(mul(ext(A), ext(B))) or 7071 // reduce(mul(A, B)) or 7072 // reduce(ext(A)) or 7073 // reduce(A). 7074 // The basic idea is that we walk down the tree to do that, finding the root 7075 // reduction instruction in InLoopReductionImmediateChains. From there we find 7076 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 7077 // of the components. If the reduction cost is lower then we return it for the 7078 // reduction instruction and 0 for the other instructions in the pattern. If 7079 // it is not we return an invalid cost specifying the orignal cost method 7080 // should be used. 
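  // Illustrative scalar-loop IR for the reduce(mul(ext(A), ext(B))) case this
  // walk is trying to match (value names are made up):
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %sum.phi, %mul   ; in-loop reduction root
  // Starting at %a.ext we step to %mul and then to %sum, and compare the cost
  // of a single extended multiply-add reduction against the summed costs of
  // the individual ext, mul and reduction instructions.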
7081 Instruction *RetI = I; 7082 if ((RetI->getOpcode() == Instruction::SExt || 7083 RetI->getOpcode() == Instruction::ZExt)) { 7084 if (!RetI->hasOneUser()) 7085 return InstructionCost::getInvalid(); 7086 RetI = RetI->user_back(); 7087 } 7088 if (RetI->getOpcode() == Instruction::Mul && 7089 RetI->user_back()->getOpcode() == Instruction::Add) { 7090 if (!RetI->hasOneUser()) 7091 return InstructionCost::getInvalid(); 7092 RetI = RetI->user_back(); 7093 } 7094 7095 // Test if the found instruction is a reduction, and if not return an invalid 7096 // cost specifying the parent to use the original cost modelling. 7097 if (!InLoopReductionImmediateChains.count(RetI)) 7098 return InstructionCost::getInvalid(); 7099 7100 // Find the reduction this chain is a part of and calculate the basic cost of 7101 // the reduction on its own. 7102 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 7103 Instruction *ReductionPhi = LastChain; 7104 while (!isa<PHINode>(ReductionPhi)) 7105 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 7106 7107 RecurrenceDescriptor RdxDesc = 7108 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 7109 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 7110 RdxDesc.getOpcode(), VectorTy, false, CostKind); 7111 7112 // Get the operand that was not the reduction chain and match it to one of the 7113 // patterns, returning the better cost if it is found. 7114 Instruction *RedOp = RetI->getOperand(1) == LastChain 7115 ? dyn_cast<Instruction>(RetI->getOperand(0)) 7116 : dyn_cast<Instruction>(RetI->getOperand(1)); 7117 7118 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 7119 7120 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 7121 !TheLoop->isLoopInvariant(RedOp)) { 7122 bool IsUnsigned = isa<ZExtInst>(RedOp); 7123 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 7124 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7125 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7126 CostKind); 7127 7128 InstructionCost ExtCost = 7129 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 7130 TTI::CastContextHint::None, CostKind, RedOp); 7131 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 7132 return I == RetI ? *RedCost.getValue() : 0; 7133 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 7134 Instruction *Mul = RedOp; 7135 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 7136 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 7137 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 7138 Op0->getOpcode() == Op1->getOpcode() && 7139 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 7140 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 7141 bool IsUnsigned = isa<ZExtInst>(Op0); 7142 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 7143 // reduce(mul(ext, ext)) 7144 InstructionCost ExtCost = 7145 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 7146 TTI::CastContextHint::None, CostKind, Op0); 7147 InstructionCost MulCost = 7148 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7149 7150 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7151 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 7152 CostKind); 7153 7154 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 7155 return I == RetI ? 
*RedCost.getValue() : 0; 7156 } else { 7157 InstructionCost MulCost = 7158 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 7159 7160 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 7161 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 7162 CostKind); 7163 7164 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 7165 return I == RetI ? *RedCost.getValue() : 0; 7166 } 7167 } 7168 7169 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 7170 } 7171 7172 InstructionCost 7173 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 7174 ElementCount VF) { 7175 // Calculate scalar cost only. Vectorization cost should be ready at this 7176 // moment. 7177 if (VF.isScalar()) { 7178 Type *ValTy = getMemInstValueType(I); 7179 const Align Alignment = getLoadStoreAlignment(I); 7180 unsigned AS = getLoadStoreAddressSpace(I); 7181 7182 return TTI.getAddressComputationCost(ValTy) + 7183 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 7184 TTI::TCK_RecipThroughput, I); 7185 } 7186 return getWideningCost(I, VF); 7187 } 7188 7189 LoopVectorizationCostModel::VectorizationCostTy 7190 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 7191 ElementCount VF) { 7192 // If we know that this instruction will remain uniform, check the cost of 7193 // the scalar version. 7194 if (isUniformAfterVectorization(I, VF)) 7195 VF = ElementCount::getFixed(1); 7196 7197 if (VF.isVector() && isProfitableToScalarize(I, VF)) 7198 return VectorizationCostTy(InstsToScalarize[VF][I], false); 7199 7200 // Forced scalars do not have any scalarization overhead. 7201 auto ForcedScalar = ForcedScalars.find(VF); 7202 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 7203 auto InstSet = ForcedScalar->second; 7204 if (InstSet.count(I)) 7205 return VectorizationCostTy( 7206 (getInstructionCost(I, ElementCount::getFixed(1)).first * 7207 VF.getKnownMinValue()), 7208 false); 7209 } 7210 7211 Type *VectorTy; 7212 InstructionCost C = getInstructionCost(I, VF, VectorTy); 7213 7214 bool TypeNotScalarized = 7215 VF.isVector() && VectorTy->isVectorTy() && 7216 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 7217 return VectorizationCostTy(C, TypeNotScalarized); 7218 } 7219 7220 InstructionCost 7221 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 7222 ElementCount VF) const { 7223 7224 if (VF.isScalable()) 7225 return InstructionCost::getInvalid(); 7226 7227 if (VF.isScalar()) 7228 return 0; 7229 7230 InstructionCost Cost = 0; 7231 Type *RetTy = ToVectorTy(I->getType(), VF); 7232 if (!RetTy->isVoidTy() && 7233 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7234 Cost += TTI.getScalarizationOverhead( 7235 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7236 true, false); 7237 7238 // Some targets keep addresses scalar. 7239 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7240 return Cost; 7241 7242 // Some targets support efficient element stores. 7243 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7244 return Cost; 7245 7246 // Collect operands to consider. 7247 CallInst *CI = dyn_cast<CallInst>(I); 7248 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7249 7250 // Skip operands that do not require extraction/scalarization and do not incur 7251 // any overhead. 
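  // The remaining operands are charged for the extractelement instructions
  // needed to feed each scalarized use with an individual vector lane.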
7252 SmallVector<Type *> Tys; 7253 for (auto *V : filterExtractingOperands(Ops, VF)) 7254 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 7255 return Cost + TTI.getOperandsScalarizationOverhead( 7256 filterExtractingOperands(Ops, VF), Tys); 7257 } 7258 7259 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7260 if (VF.isScalar()) 7261 return; 7262 NumPredStores = 0; 7263 for (BasicBlock *BB : TheLoop->blocks()) { 7264 // For each instruction in the old loop. 7265 for (Instruction &I : *BB) { 7266 Value *Ptr = getLoadStorePointerOperand(&I); 7267 if (!Ptr) 7268 continue; 7269 7270 // TODO: We should generate better code and update the cost model for 7271 // predicated uniform stores. Today they are treated as any other 7272 // predicated store (see added test cases in 7273 // invariant-store-vectorization.ll). 7274 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7275 NumPredStores++; 7276 7277 if (Legal->isUniformMemOp(I)) { 7278 // TODO: Avoid replicating loads and stores instead of 7279 // relying on instcombine to remove them. 7280 // Load: Scalar load + broadcast 7281 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7282 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7283 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7284 continue; 7285 } 7286 7287 // We assume that widening is the best solution when possible. 7288 if (memoryInstructionCanBeWidened(&I, VF)) { 7289 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7290 int ConsecutiveStride = 7291 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7292 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7293 "Expected consecutive stride."); 7294 InstWidening Decision = 7295 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7296 setWideningDecision(&I, VF, Decision, Cost); 7297 continue; 7298 } 7299 7300 // Choose between Interleaving, Gather/Scatter or Scalarization. 7301 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7302 unsigned NumAccesses = 1; 7303 if (isAccessInterleaved(&I)) { 7304 auto Group = getInterleavedAccessGroup(&I); 7305 assert(Group && "Fail to get an interleaved access group."); 7306 7307 // Make one decision for the whole group. 7308 if (getWideningDecision(&I, VF) != CM_Unknown) 7309 continue; 7310 7311 NumAccesses = Group->getNumMembers(); 7312 if (interleavedAccessCanBeWidened(&I, VF)) 7313 InterleaveCost = getInterleaveGroupCost(&I, VF); 7314 } 7315 7316 InstructionCost GatherScatterCost = 7317 isLegalGatherOrScatter(&I) 7318 ? getGatherScatterCost(&I, VF) * NumAccesses 7319 : InstructionCost::getInvalid(); 7320 7321 InstructionCost ScalarizationCost = 7322 getMemInstScalarizationCost(&I, VF) * NumAccesses; 7323 7324 // Choose better solution for the current VF, 7325 // write down this decision and use it during vectorization. 7326 InstructionCost Cost; 7327 InstWidening Decision; 7328 if (InterleaveCost <= GatherScatterCost && 7329 InterleaveCost < ScalarizationCost) { 7330 Decision = CM_Interleave; 7331 Cost = InterleaveCost; 7332 } else if (GatherScatterCost < ScalarizationCost) { 7333 Decision = CM_GatherScatter; 7334 Cost = GatherScatterCost; 7335 } else { 7336 assert(!VF.isScalable() && 7337 "We cannot yet scalarise for scalable vectors"); 7338 Decision = CM_Scalarize; 7339 Cost = ScalarizationCost; 7340 } 7341 // If the instructions belongs to an interleave group, the whole group 7342 // receives the same decision. 
      // The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
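      // (Forced scalars are later costed as VF copies of the scalar
      // instruction, with no insert/extract overhead; see getInstructionCost.)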
      ForcedScalars[VF].insert(I);
  }
}

InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(
                  Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
                  false, true) +
              (TTI.getCFInstrCost(Instruction::Br, CostKind) *
               VF.getKnownMinValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
7486 return 0; 7487 // Note: We currently assume zero cost for an unconditional branch inside 7488 // a predicated block since it will become a fall-through, although we 7489 // may decide in the future to call TTI for all branches. 7490 } 7491 case Instruction::PHI: { 7492 auto *Phi = cast<PHINode>(I); 7493 7494 // First-order recurrences are replaced by vector shuffles inside the loop. 7495 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7496 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7497 return TTI.getShuffleCost( 7498 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7499 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7500 7501 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7502 // converted into select instructions. We require N - 1 selects per phi 7503 // node, where N is the number of incoming values. 7504 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7505 return (Phi->getNumIncomingValues() - 1) * 7506 TTI.getCmpSelInstrCost( 7507 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7508 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7509 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7510 7511 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7512 } 7513 case Instruction::UDiv: 7514 case Instruction::SDiv: 7515 case Instruction::URem: 7516 case Instruction::SRem: 7517 // If we have a predicated instruction, it may not be executed for each 7518 // vector lane. Get the scalarization cost and scale this amount by the 7519 // probability of executing the predicated block. If the instruction is not 7520 // predicated, we fall through to the next case. 7521 if (VF.isVector() && isScalarWithPredication(I)) { 7522 InstructionCost Cost = 0; 7523 7524 // These instructions have a non-void type, so account for the phi nodes 7525 // that we will create. This cost is likely to be zero. The phi node 7526 // cost, if any, should be scaled by the block probability because it 7527 // models a copy at the end of each predicated block. 7528 Cost += VF.getKnownMinValue() * 7529 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7530 7531 // The cost of the non-predicated instruction. 7532 Cost += VF.getKnownMinValue() * 7533 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7534 7535 // The cost of insertelement and extractelement instructions needed for 7536 // scalarization. 7537 Cost += getScalarizationOverhead(I, VF); 7538 7539 // Scale the cost by the probability of executing the predicated blocks. 7540 // This assumes the predicated block for each vector lane is equally 7541 // likely. 7542 return Cost / getReciprocalPredBlockProb(); 7543 } 7544 LLVM_FALLTHROUGH; 7545 case Instruction::Add: 7546 case Instruction::FAdd: 7547 case Instruction::Sub: 7548 case Instruction::FSub: 7549 case Instruction::Mul: 7550 case Instruction::FMul: 7551 case Instruction::FDiv: 7552 case Instruction::FRem: 7553 case Instruction::Shl: 7554 case Instruction::LShr: 7555 case Instruction::AShr: 7556 case Instruction::And: 7557 case Instruction::Or: 7558 case Instruction::Xor: { 7559 // Since we will replace the stride by 1 the multiplication should go away. 
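    // For example, once a symbolic stride has been versioned to the constant
    // 1, an index computation such as
    //   %offset = mul i64 %i, %stride
    // folds to just %i, so the multiply is modelled as free here.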
7560 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7561 return 0; 7562 7563 // Detect reduction patterns 7564 InstructionCost RedCost; 7565 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7566 .isValid()) 7567 return RedCost; 7568 7569 // Certain instructions can be cheaper to vectorize if they have a constant 7570 // second vector operand. One example of this are shifts on x86. 7571 Value *Op2 = I->getOperand(1); 7572 TargetTransformInfo::OperandValueProperties Op2VP; 7573 TargetTransformInfo::OperandValueKind Op2VK = 7574 TTI.getOperandInfo(Op2, Op2VP); 7575 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7576 Op2VK = TargetTransformInfo::OK_UniformValue; 7577 7578 SmallVector<const Value *, 4> Operands(I->operand_values()); 7579 return TTI.getArithmeticInstrCost( 7580 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7581 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7582 } 7583 case Instruction::FNeg: { 7584 return TTI.getArithmeticInstrCost( 7585 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, 7586 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, 7587 TargetTransformInfo::OP_None, I->getOperand(0), I); 7588 } 7589 case Instruction::Select: { 7590 SelectInst *SI = cast<SelectInst>(I); 7591 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7592 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7593 7594 const Value *Op0, *Op1; 7595 using namespace llvm::PatternMatch; 7596 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7597 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7598 // select x, y, false --> x & y 7599 // select x, true, y --> x | y 7600 TTI::OperandValueProperties Op1VP = TTI::OP_None; 7601 TTI::OperandValueProperties Op2VP = TTI::OP_None; 7602 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP); 7603 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP); 7604 assert(Op0->getType()->getScalarSizeInBits() == 1 && 7605 Op1->getType()->getScalarSizeInBits() == 1); 7606 7607 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7608 return TTI.getArithmeticInstrCost( 7609 match(I, m_LogicalOr()) ? 
Instruction::Or : Instruction::And, VectorTy, 7610 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I); 7611 } 7612 7613 Type *CondTy = SI->getCondition()->getType(); 7614 if (!ScalarCond) 7615 CondTy = VectorType::get(CondTy, VF); 7616 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7617 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7618 } 7619 case Instruction::ICmp: 7620 case Instruction::FCmp: { 7621 Type *ValTy = I->getOperand(0)->getType(); 7622 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7623 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7624 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7625 VectorTy = ToVectorTy(ValTy, VF); 7626 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7627 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7628 } 7629 case Instruction::Store: 7630 case Instruction::Load: { 7631 ElementCount Width = VF; 7632 if (Width.isVector()) { 7633 InstWidening Decision = getWideningDecision(I, Width); 7634 assert(Decision != CM_Unknown && 7635 "CM decision should be taken at this point"); 7636 if (Decision == CM_Scalarize) 7637 Width = ElementCount::getFixed(1); 7638 } 7639 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7640 return getMemoryInstructionCost(I, VF); 7641 } 7642 case Instruction::BitCast: 7643 if (I->getType()->isPointerTy()) 7644 return 0; 7645 LLVM_FALLTHROUGH; 7646 case Instruction::ZExt: 7647 case Instruction::SExt: 7648 case Instruction::FPToUI: 7649 case Instruction::FPToSI: 7650 case Instruction::FPExt: 7651 case Instruction::PtrToInt: 7652 case Instruction::IntToPtr: 7653 case Instruction::SIToFP: 7654 case Instruction::UIToFP: 7655 case Instruction::Trunc: 7656 case Instruction::FPTrunc: { 7657 // Computes the CastContextHint from a Load/Store instruction. 7658 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7659 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7660 "Expected a load or a store!"); 7661 7662 if (VF.isScalar() || !TheLoop->contains(I)) 7663 return TTI::CastContextHint::Normal; 7664 7665 switch (getWideningDecision(I, VF)) { 7666 case LoopVectorizationCostModel::CM_GatherScatter: 7667 return TTI::CastContextHint::GatherScatter; 7668 case LoopVectorizationCostModel::CM_Interleave: 7669 return TTI::CastContextHint::Interleave; 7670 case LoopVectorizationCostModel::CM_Scalarize: 7671 case LoopVectorizationCostModel::CM_Widen: 7672 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7673 : TTI::CastContextHint::Normal; 7674 case LoopVectorizationCostModel::CM_Widen_Reverse: 7675 return TTI::CastContextHint::Reversed; 7676 case LoopVectorizationCostModel::CM_Unknown: 7677 llvm_unreachable("Instr did not go through cost modelling?"); 7678 } 7679 7680 llvm_unreachable("Unhandled case!"); 7681 }; 7682 7683 unsigned Opcode = I->getOpcode(); 7684 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7685 // For Trunc, the context is the only user, which must be a StoreInst. 7686 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7687 if (I->hasOneUse()) 7688 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7689 CCH = ComputeCCH(Store); 7690 } 7691 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7692 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7693 Opcode == Instruction::FPExt) { 7694 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7695 CCH = ComputeCCH(Load); 7696 } 7697 7698 // We optimize the truncation of induction variables having constant 7699 // integer steps. The cost of these truncations is the same as the scalar 7700 // operation. 7701 if (isOptimizableIVTruncate(I, VF)) { 7702 auto *Trunc = cast<TruncInst>(I); 7703 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7704 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7705 } 7706 7707 // Detect reduction patterns 7708 InstructionCost RedCost; 7709 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7710 .isValid()) 7711 return RedCost; 7712 7713 Type *SrcScalarTy = I->getOperand(0)->getType(); 7714 Type *SrcVecTy = 7715 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7716 if (canTruncateToMinimalBitwidth(I, VF)) { 7717 // This cast is going to be shrunk. This may remove the cast or it might 7718 // turn it into slightly different cast. For example, if MinBW == 16, 7719 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7720 // 7721 // Calculate the modified src and dest types. 7722 Type *MinVecTy = VectorTy; 7723 if (Opcode == Instruction::Trunc) { 7724 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7725 VectorTy = 7726 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7727 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7728 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7729 VectorTy = 7730 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7731 } 7732 } 7733 7734 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7735 } 7736 case Instruction::Call: { 7737 bool NeedToScalarize; 7738 CallInst *CI = cast<CallInst>(I); 7739 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7740 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7741 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7742 return std::min(CallCost, IntrinsicCost); 7743 } 7744 return CallCost; 7745 } 7746 case Instruction::ExtractValue: 7747 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7748 default: 7749 // This opcode is unknown. Assume that it is the same as 'mul'. 7750 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7751 } // end of switch. 
7752 } 7753 7754 char LoopVectorize::ID = 0; 7755 7756 static const char lv_name[] = "Loop Vectorization"; 7757 7758 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7759 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7760 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7761 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7762 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7763 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7764 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7765 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7766 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7767 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7768 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7769 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7770 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7771 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7772 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7773 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7774 7775 namespace llvm { 7776 7777 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7778 7779 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7780 bool VectorizeOnlyWhenForced) { 7781 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7782 } 7783 7784 } // end namespace llvm 7785 7786 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7787 // Check if the pointer operand of a load or store instruction is 7788 // consecutive. 7789 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7790 return Legal->isConsecutivePtr(Ptr); 7791 return false; 7792 } 7793 7794 void LoopVectorizationCostModel::collectValuesToIgnore() { 7795 // Ignore ephemeral values. 7796 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7797 7798 // Ignore type-promoting instructions we identified during reduction 7799 // detection. 7800 for (auto &Reduction : Legal->getReductionVars()) { 7801 RecurrenceDescriptor &RedDes = Reduction.second; 7802 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7803 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7804 } 7805 // Ignore type-casting instructions we identified during induction 7806 // detection. 7807 for (auto &Induction : Legal->getInductionVars()) { 7808 InductionDescriptor &IndDes = Induction.second; 7809 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7810 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7811 } 7812 } 7813 7814 void LoopVectorizationCostModel::collectInLoopReductions() { 7815 for (auto &Reduction : Legal->getReductionVars()) { 7816 PHINode *Phi = Reduction.first; 7817 RecurrenceDescriptor &RdxDesc = Reduction.second; 7818 7819 // We don't collect reductions that are type promoted (yet). 7820 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7821 continue; 7822 7823 // If the target would prefer this reduction to happen "in-loop", then we 7824 // want to record it as such. 7825 unsigned Opcode = RdxDesc.getOpcode(); 7826 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7827 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7828 TargetTransformInfo::ReductionFlags())) 7829 continue; 7830 7831 // Check that we can correctly put the reductions into the loop, by 7832 // finding the chain of operations that leads from the phi to the loop 7833 // exit value. 
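    // For a plain integer sum, for example, the chain is just the add feeding
    // the reduction phi across the back-edge; if no well-formed chain is
    // found, the reduction is left out-of-loop.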
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop) {
      InLoopReductionChains[Phi] = ReductionOperations;
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}

// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of the loop will be predicated.
7916 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7917 !useMaskedInterleavedAccesses(*TTI)) { 7918 LLVM_DEBUG( 7919 dbgs() 7920 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7921 "which requires masked-interleaved support.\n"); 7922 if (CM.InterleaveInfo.invalidateGroups()) 7923 // Invalidating interleave groups also requires invalidating all decisions 7924 // based on them, which includes widening decisions and uniform and scalar 7925 // values. 7926 CM.invalidateCostModelingDecisions(); 7927 } 7928 7929 ElementCount MaxVF = MaybeMaxVF.getValue(); 7930 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7931 7932 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7933 if (!UserVF.isZero() && 7934 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7935 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7936 // VFs here, this should be reverted to only use legal UserVFs once the 7937 // loop below supports scalable VFs. 7938 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7939 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7940 << " VF " << VF << ".\n"); 7941 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7942 "VF needs to be a power of two"); 7943 // Collect the instructions (and their associated costs) that will be more 7944 // profitable to scalarize. 7945 CM.selectUserVectorizationFactor(VF); 7946 CM.collectInLoopReductions(); 7947 buildVPlansWithVPRecipes(VF, VF); 7948 LLVM_DEBUG(printPlans(dbgs())); 7949 return {{VF, 0}}; 7950 } 7951 7952 assert(!MaxVF.isScalable() && 7953 "Scalable vectors not yet supported beyond this point"); 7954 7955 for (ElementCount VF = ElementCount::getFixed(1); 7956 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7957 // Collect Uniform and Scalar instructions after vectorization with VF. 7958 CM.collectUniformsAndScalars(VF); 7959 7960 // Collect the instructions (and their associated costs) that will be more 7961 // profitable to scalarize. 7962 if (VF.isVector()) 7963 CM.collectInstsToScalarize(VF); 7964 } 7965 7966 CM.collectInLoopReductions(); 7967 7968 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7969 LLVM_DEBUG(printPlans(dbgs())); 7970 if (MaxVF.isScalar()) 7971 return VectorizationFactor::Disabled(); 7972 7973 // Select the optimal vectorization factor. 7974 auto SelectedVF = CM.selectVectorizationFactor(MaxVF); 7975 7976 // Check if it is profitable to vectorize with runtime checks. 
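  // Emitting too many runtime pointer checks can outweigh the benefit of
  // vectorizing; the limit is stricter when the user has not explicitly
  // allowed reordering of memory operations via hints.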
7977 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); 7978 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) { 7979 bool PragmaThresholdReached = 7980 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; 7981 bool ThresholdReached = 7982 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; 7983 if ((ThresholdReached && !Hints.allowReordering()) || 7984 PragmaThresholdReached) { 7985 ORE->emit([&]() { 7986 return OptimizationRemarkAnalysisAliasing( 7987 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(), 7988 OrigLoop->getHeader()) 7989 << "loop not vectorized: cannot prove it is safe to reorder " 7990 "memory operations"; 7991 }); 7992 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 7993 Hints.emitRemarkWithHints(); 7994 return VectorizationFactor::Disabled(); 7995 } 7996 } 7997 return SelectedVF; 7998 } 7999 8000 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 8001 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 8002 << '\n'); 8003 BestVF = VF; 8004 BestUF = UF; 8005 8006 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 8007 return !Plan->hasVF(VF); 8008 }); 8009 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 8010 } 8011 8012 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 8013 DominatorTree *DT) { 8014 // Perform the actual loop transformation. 8015 8016 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 8017 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 8018 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 8019 8020 VPTransformState State{ 8021 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; 8022 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 8023 State.TripCount = ILV.getOrCreateTripCount(nullptr); 8024 State.CanonicalIV = ILV.Induction; 8025 8026 ILV.printDebugTracesAtStart(); 8027 8028 //===------------------------------------------------===// 8029 // 8030 // Notice: any optimization or new instruction that go 8031 // into the code below should also be implemented in 8032 // the cost-model. 8033 // 8034 //===------------------------------------------------===// 8035 8036 // 2. Copy and widen instructions from the old loop into the new loop. 8037 VPlans.front()->execute(&State); 8038 8039 // 3. Fix the vectorized code: take care of header phi's, live-outs, 8040 // predication, updating analyses. 
8041 ILV.fixVectorizedLoop(State); 8042 8043 ILV.printDebugTracesAtEnd(); 8044 } 8045 8046 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 8047 void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 8048 for (const auto &Plan : VPlans) 8049 if (PrintVPlansInDotFormat) 8050 Plan->printDOT(O); 8051 else 8052 Plan->print(O); 8053 } 8054 #endif 8055 8056 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 8057 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 8058 8059 // We create new control-flow for the vectorized loop, so the original exit 8060 // conditions will be dead after vectorization if it's only used by the 8061 // terminator 8062 SmallVector<BasicBlock*> ExitingBlocks; 8063 OrigLoop->getExitingBlocks(ExitingBlocks); 8064 for (auto *BB : ExitingBlocks) { 8065 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 8066 if (!Cmp || !Cmp->hasOneUse()) 8067 continue; 8068 8069 // TODO: we should introduce a getUniqueExitingBlocks on Loop 8070 if (!DeadInstructions.insert(Cmp).second) 8071 continue; 8072 8073 // The operands of the icmp is often a dead trunc, used by IndUpdate. 8074 // TODO: can recurse through operands in general 8075 for (Value *Op : Cmp->operands()) { 8076 if (isa<TruncInst>(Op) && Op->hasOneUse()) 8077 DeadInstructions.insert(cast<Instruction>(Op)); 8078 } 8079 } 8080 8081 // We create new "steps" for induction variable updates to which the original 8082 // induction variables map. An original update instruction will be dead if 8083 // all its users except the induction variable are dead. 8084 auto *Latch = OrigLoop->getLoopLatch(); 8085 for (auto &Induction : Legal->getInductionVars()) { 8086 PHINode *Ind = Induction.first; 8087 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 8088 8089 // If the tail is to be folded by masking, the primary induction variable, 8090 // if exists, isn't dead: it will be used for masking. Don't kill it. 8091 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 8092 continue; 8093 8094 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 8095 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 8096 })) 8097 DeadInstructions.insert(IndUpdate); 8098 8099 // We record as "Dead" also the type-casting instructions we had identified 8100 // during induction analysis. We don't need any handling for them in the 8101 // vectorized loop because we have proven that, under a proper runtime 8102 // test guarding the vectorized loop, the value of the phi, and the casted 8103 // value of the phi, are the same. The last instruction in this casting chain 8104 // will get its scalar/vector/widened def from the scalar/vector/widened def 8105 // of the respective phi node. Any other casts in the induction def-use chain 8106 // have no other uses outside the phi update chain, and will be ignored. 8107 InductionDescriptor &IndDes = Induction.second; 8108 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 8109 DeadInstructions.insert(Casts.begin(), Casts.end()); 8110 } 8111 } 8112 8113 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 8114 8115 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 8116 8117 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 8118 Instruction::BinaryOps BinOp) { 8119 // When unrolling and the VF is 1, we only need to add a simple scalar. 
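  // Illustrative result: the unrolled copy selected by StartIdx ends up with
  // Val + StartIdx * Step for integers, or BinOp(Val, StartIdx * Step) in the
  // floating-point case, rather than a whole step vector.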
8120 Type *Ty = Val->getType(); 8121 assert(!Ty->isVectorTy() && "Val must be a scalar"); 8122 8123 if (Ty->isFloatingPointTy()) { 8124 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 8125 8126 // Floating-point operations inherit FMF via the builder's flags. 8127 Value *MulOp = Builder.CreateFMul(C, Step); 8128 return Builder.CreateBinOp(BinOp, Val, MulOp); 8129 } 8130 Constant *C = ConstantInt::get(Ty, StartIdx); 8131 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 8132 } 8133 8134 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 8135 SmallVector<Metadata *, 4> MDs; 8136 // Reserve first location for self reference to the LoopID metadata node. 8137 MDs.push_back(nullptr); 8138 bool IsUnrollMetadata = false; 8139 MDNode *LoopID = L->getLoopID(); 8140 if (LoopID) { 8141 // First find existing loop unrolling disable metadata. 8142 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 8143 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 8144 if (MD) { 8145 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 8146 IsUnrollMetadata = 8147 S && S->getString().startswith("llvm.loop.unroll.disable"); 8148 } 8149 MDs.push_back(LoopID->getOperand(i)); 8150 } 8151 } 8152 8153 if (!IsUnrollMetadata) { 8154 // Add runtime unroll disable metadata. 8155 LLVMContext &Context = L->getHeader()->getContext(); 8156 SmallVector<Metadata *, 1> DisableOperands; 8157 DisableOperands.push_back( 8158 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 8159 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 8160 MDs.push_back(DisableNode); 8161 MDNode *NewLoopID = MDNode::get(Context, MDs); 8162 // Set operand 0 to refer to the loop id itself. 8163 NewLoopID->replaceOperandWith(0, NewLoopID); 8164 L->setLoopID(NewLoopID); 8165 } 8166 } 8167 8168 //===--------------------------------------------------------------------===// 8169 // EpilogueVectorizerMainLoop 8170 //===--------------------------------------------------------------------===// 8171 8172 /// This function is partially responsible for generating the control flow 8173 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8174 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 8175 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8176 Loop *Lp = createVectorLoopSkeleton(""); 8177 8178 // Generate the code to check the minimum iteration count of the vector 8179 // epilogue (see below). 8180 EPI.EpilogueIterationCountCheck = 8181 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true); 8182 EPI.EpilogueIterationCountCheck->setName("iter.check"); 8183 8184 // Generate the code to check any assumptions that we've made for SCEV 8185 // expressions. 8186 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); 8187 8188 // Generate the code that checks at runtime if arrays overlap. We put the 8189 // checks into a separate block to make the more common case of few elements 8190 // faster. 8191 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 8192 8193 // Generate the iteration count check for the main loop, *after* the check 8194 // for the epilogue loop, so that the path-length is shorter for the case 8195 // that goes directly through the vector epilogue. The longer-path length for 8196 // the main loop is compensated for, by the gain from vectorizing the larger 8197 // trip count. Note: the branch will get updated later on when we vectorize 8198 // the epilogue. 
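  // The resulting relative order of the guards is therefore: "iter.check"
  // (epilogue VF * UF) before "vector.main.loop.iter.check" (main VF * UF),
  // with any SCEV and memory runtime checks emitted above sitting in between.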
8199 EPI.MainLoopIterationCountCheck = 8200 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false); 8201 8202 // Generate the induction variable. 8203 OldInduction = Legal->getPrimaryInduction(); 8204 Type *IdxTy = Legal->getWidestInductionType(); 8205 Value *StartIdx = ConstantInt::get(IdxTy, 0); 8206 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8207 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8208 EPI.VectorTripCount = CountRoundDown; 8209 Induction = 8210 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8211 getDebugLocFromInstOrOperands(OldInduction)); 8212 8213 // Skip induction resume value creation here because they will be created in 8214 // the second pass. If we created them here, they wouldn't be used anyway, 8215 // because the vplan in the second pass still contains the inductions from the 8216 // original loop. 8217 8218 return completeLoopSkeleton(Lp, OrigLoopID); 8219 } 8220 8221 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 8222 LLVM_DEBUG({ 8223 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 8224 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8225 << ", Main Loop UF:" << EPI.MainLoopUF 8226 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8227 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8228 }); 8229 } 8230 8231 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 8232 DEBUG_WITH_TYPE(VerboseDebug, { 8233 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n"; 8234 }); 8235 } 8236 8237 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck( 8238 Loop *L, BasicBlock *Bypass, bool ForEpilogue) { 8239 assert(L && "Expected valid Loop."); 8240 assert(Bypass && "Expected valid bypass basic block."); 8241 unsigned VFactor = 8242 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue(); 8243 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 8244 Value *Count = getOrCreateTripCount(L); 8245 // Reuse existing vector loop preheader for TC checks. 8246 // Note that new preheader block is generated for vector loop. 8247 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 8248 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 8249 8250 // Generate code to check if the loop's trip count is less than VF * UF of the 8251 // main vector loop. 8252 auto P = 8253 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8254 8255 Value *CheckMinIters = Builder.CreateICmp( 8256 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 8257 "min.iters.check"); 8258 8259 if (!ForEpilogue) 8260 TCCheckBlock->setName("vector.main.loop.iter.check"); 8261 8262 // Create new preheader for vector loop. 8263 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 8264 DT, LI, nullptr, "vector.ph"); 8265 8266 if (ForEpilogue) { 8267 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 8268 DT->getNode(Bypass)->getIDom()) && 8269 "TC check is expected to dominate Bypass"); 8270 8271 // Update dominator for Bypass & LoopExit. 8272 DT->changeImmediateDominator(Bypass, TCCheckBlock); 8273 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 8274 8275 LoopBypassBlocks.push_back(TCCheckBlock); 8276 8277 // Save the trip count so we don't have to regenerate it in the 8278 // vec.epilog.iter.check. This is safe to do because the trip count 8279 // generated here dominates the vector epilog iter check. 
8280 EPI.TripCount = Count; 8281 } 8282 8283 ReplaceInstWithInst( 8284 TCCheckBlock->getTerminator(), 8285 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8286 8287 return TCCheckBlock; 8288 } 8289 8290 //===--------------------------------------------------------------------===// 8291 // EpilogueVectorizerEpilogueLoop 8292 //===--------------------------------------------------------------------===// 8293 8294 /// This function is partially responsible for generating the control flow 8295 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8296 BasicBlock * 8297 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8298 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8299 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8300 8301 // Now, compare the remaining count and if there aren't enough iterations to 8302 // execute the vectorized epilogue skip to the scalar part. 8303 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8304 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8305 LoopVectorPreHeader = 8306 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8307 LI, nullptr, "vec.epilog.ph"); 8308 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8309 VecEpilogueIterationCountCheck); 8310 8311 // Adjust the control flow taking the state info from the main loop 8312 // vectorization into account. 8313 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8314 "expected this to be saved from the previous pass."); 8315 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8316 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8317 8318 DT->changeImmediateDominator(LoopVectorPreHeader, 8319 EPI.MainLoopIterationCountCheck); 8320 8321 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8322 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8323 8324 if (EPI.SCEVSafetyCheck) 8325 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8326 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8327 if (EPI.MemSafetyCheck) 8328 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8329 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8330 8331 DT->changeImmediateDominator( 8332 VecEpilogueIterationCountCheck, 8333 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8334 8335 DT->changeImmediateDominator(LoopScalarPreHeader, 8336 EPI.EpilogueIterationCountCheck); 8337 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8338 8339 // Keep track of bypass blocks, as they feed start values to the induction 8340 // phis in the scalar loop preheader. 8341 if (EPI.SCEVSafetyCheck) 8342 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8343 if (EPI.MemSafetyCheck) 8344 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8345 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8346 8347 // Generate a resume induction for the vector epilogue and put it in the 8348 // vector epilogue preheader 8349 Type *IdxTy = Legal->getWidestInductionType(); 8350 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8351 LoopVectorPreHeader->getFirstNonPHI()); 8352 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8353 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8354 EPI.MainLoopIterationCountCheck); 8355 8356 // Generate the induction variable. 
8357 OldInduction = Legal->getPrimaryInduction();
8358 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8359 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8360 Value *StartIdx = EPResumeVal;
8361 Induction =
8362 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8363 getDebugLocFromInstOrOperands(OldInduction));
8364
8365 // Generate induction resume values. These variables save the new starting
8366 // indexes for the scalar loop. They are used to test if there are any tail
8367 // iterations left once the vector loop has completed.
8368 // Note that when the vectorized epilogue is skipped due to the iteration count
8369 // check, the resume value for the induction variable comes from
8370 // the trip count of the main vector loop, hence passing the AdditionalBypass
8371 // argument.
8372 createInductionResumeValues(Lp, CountRoundDown,
8373 {VecEpilogueIterationCountCheck,
8374 EPI.VectorTripCount} /* AdditionalBypass */);
8375
8376 AddRuntimeUnrollDisableMetaData(Lp);
8377 return completeLoopSkeleton(Lp, OrigLoopID);
8378 }
8379
8380 BasicBlock *
8381 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8382 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8383
8384 assert(EPI.TripCount &&
8385 "Expected trip count to have been saved in the first pass.");
8386 assert(
8387 (!isa<Instruction>(EPI.TripCount) ||
8388 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8389 "saved trip count does not dominate insertion point.");
8390 Value *TC = EPI.TripCount;
8391 IRBuilder<> Builder(Insert->getTerminator());
8392 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8393
8394 // Generate code to check if the loop's trip count is less than VF * UF of the
8395 // vector epilogue loop.
8396 auto P =
8397 Cost->requiresScalarEpilogue() ?
ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 8398 8399 Value *CheckMinIters = Builder.CreateICmp( 8400 P, Count, 8401 ConstantInt::get(Count->getType(), 8402 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF), 8403 "min.epilog.iters.check"); 8404 8405 ReplaceInstWithInst( 8406 Insert->getTerminator(), 8407 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8408 8409 LoopBypassBlocks.push_back(Insert); 8410 return Insert; 8411 } 8412 8413 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { 8414 LLVM_DEBUG({ 8415 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" 8416 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue() 8417 << ", Main Loop UF:" << EPI.MainLoopUF 8418 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue() 8419 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 8420 }); 8421 } 8422 8423 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { 8424 DEBUG_WITH_TYPE(VerboseDebug, { 8425 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n"; 8426 }); 8427 } 8428 8429 bool LoopVectorizationPlanner::getDecisionAndClampRange( 8430 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { 8431 assert(!Range.isEmpty() && "Trying to test an empty VF range."); 8432 bool PredicateAtRangeStart = Predicate(Range.Start); 8433 8434 for (ElementCount TmpVF = Range.Start * 2; 8435 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2) 8436 if (Predicate(TmpVF) != PredicateAtRangeStart) { 8437 Range.End = TmpVF; 8438 break; 8439 } 8440 8441 return PredicateAtRangeStart; 8442 } 8443 8444 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 8445 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 8446 /// of VF's starting at a given VF and extending it as much as possible. Each 8447 /// vectorization decision can potentially shorten this sub-range during 8448 /// buildVPlan(). 8449 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, 8450 ElementCount MaxVF) { 8451 auto MaxVFPlusOne = MaxVF.getWithIncrement(1); 8452 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { 8453 VFRange SubRange = {VF, MaxVFPlusOne}; 8454 VPlans.push_back(buildVPlan(SubRange)); 8455 VF = SubRange.End; 8456 } 8457 } 8458 8459 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 8460 VPlanPtr &Plan) { 8461 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 8462 8463 // Look for cached value. 8464 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8465 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8466 if (ECEntryIt != EdgeMaskCache.end()) 8467 return ECEntryIt->second; 8468 8469 VPValue *SrcMask = createBlockInMask(Src, Plan); 8470 8471 // The terminator has to be a branch inst! 8472 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8473 assert(BI && "Unexpected terminator found"); 8474 8475 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8476 return EdgeMaskCache[Edge] = SrcMask; 8477 8478 // If source is an exiting block, we know the exit edge is dynamically dead 8479 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8480 // adding uses of an otherwise potentially dead instruction. 
8481 if (OrigLoop->isLoopExiting(Src)) 8482 return EdgeMaskCache[Edge] = SrcMask; 8483 8484 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8485 assert(EdgeMask && "No Edge Mask found for condition"); 8486 8487 if (BI->getSuccessor(0) != Dst) 8488 EdgeMask = Builder.createNot(EdgeMask); 8489 8490 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. 8491 // The condition is 'SrcMask && EdgeMask', which is equivalent to 8492 // 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8493 // The select version does not introduce new UB if SrcMask is false and 8494 // EdgeMask is poison. Using 'and' here introduces undefined behavior. 8495 VPValue *False = Plan->getOrAddVPValue( 8496 ConstantInt::getFalse(BI->getCondition()->getType())); 8497 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); 8498 } 8499 8500 return EdgeMaskCache[Edge] = EdgeMask; 8501 } 8502 8503 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8504 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8505 8506 // Look for cached value. 8507 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8508 if (BCEntryIt != BlockMaskCache.end()) 8509 return BCEntryIt->second; 8510 8511 // All-one mask is modelled as no-mask following the convention for masked 8512 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8513 VPValue *BlockMask = nullptr; 8514 8515 if (OrigLoop->getHeader() == BB) { 8516 if (!CM.blockNeedsPredication(BB)) 8517 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8518 8519 // Create the block in mask as the first non-phi instruction in the block. 8520 VPBuilder::InsertPointGuard Guard(Builder); 8521 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8522 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8523 8524 // Introduce the early-exit compare IV <= BTC to form header block mask. 8525 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8526 // Start by constructing the desired canonical IV. 8527 VPValue *IV = nullptr; 8528 if (Legal->getPrimaryInduction()) 8529 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8530 else { 8531 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8532 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8533 IV = IVRecipe->getVPSingleValue(); 8534 } 8535 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8536 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8537 8538 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8539 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8540 // as a second argument, we only pass the IV here and extract the 8541 // tripcount from the transform state where codegen of the VP instructions 8542 // happen. 8543 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8544 } else { 8545 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8546 } 8547 return BlockMaskCache[BB] = BlockMask; 8548 } 8549 8550 // This is the block mask. We OR all incoming edges. 8551 for (auto *Predecessor : predecessors(BB)) { 8552 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8553 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8554 return BlockMaskCache[BB] = EdgeMask; 8555 8556 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
8557 BlockMask = EdgeMask; 8558 continue; 8559 } 8560 8561 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8562 } 8563 8564 return BlockMaskCache[BB] = BlockMask; 8565 } 8566 8567 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, 8568 ArrayRef<VPValue *> Operands, 8569 VFRange &Range, 8570 VPlanPtr &Plan) { 8571 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8572 "Must be called with either a load or store"); 8573 8574 auto willWiden = [&](ElementCount VF) -> bool { 8575 if (VF.isScalar()) 8576 return false; 8577 LoopVectorizationCostModel::InstWidening Decision = 8578 CM.getWideningDecision(I, VF); 8579 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8580 "CM decision should be taken at this point."); 8581 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8582 return true; 8583 if (CM.isScalarAfterVectorization(I, VF) || 8584 CM.isProfitableToScalarize(I, VF)) 8585 return false; 8586 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8587 }; 8588 8589 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8590 return nullptr; 8591 8592 VPValue *Mask = nullptr; 8593 if (Legal->isMaskRequired(I)) 8594 Mask = createBlockInMask(I->getParent(), Plan); 8595 8596 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8597 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask); 8598 8599 StoreInst *Store = cast<StoreInst>(I); 8600 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], 8601 Mask); 8602 } 8603 8604 VPWidenIntOrFpInductionRecipe * 8605 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, 8606 ArrayRef<VPValue *> Operands) const { 8607 // Check if this is an integer or fp induction. If so, build the recipe that 8608 // produces its scalar and vector values. 8609 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8610 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8611 II.getKind() == InductionDescriptor::IK_FpInduction) { 8612 assert(II.getStartValue() == 8613 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8614 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); 8615 return new VPWidenIntOrFpInductionRecipe( 8616 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); 8617 } 8618 8619 return nullptr; 8620 } 8621 8622 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( 8623 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, 8624 VPlan &Plan) const { 8625 // Optimize the special case where the source is a constant integer 8626 // induction variable. Notice that we can only optimize the 'trunc' case 8627 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8628 // (c) other casts depend on pointer size. 8629 8630 // Determine whether \p K is a truncation based on an induction variable that 8631 // can be optimized. 
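// (For illustration: a pattern like
//    %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
//    %t  = trunc i64 %iv to i32
//  can be widened directly as an i32 induction, so no vector truncate of the
//  wide i64 induction is needed; whether that holds for a given VF is what the
//  cost-model query below answers.)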
8632 auto isOptimizableIVTruncate = 8633 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8634 return [=](ElementCount VF) -> bool { 8635 return CM.isOptimizableIVTruncate(K, VF); 8636 }; 8637 }; 8638 8639 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8640 isOptimizableIVTruncate(I), Range)) { 8641 8642 InductionDescriptor II = 8643 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8644 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8645 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8646 Start, nullptr, I); 8647 } 8648 return nullptr; 8649 } 8650 8651 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, 8652 ArrayRef<VPValue *> Operands, 8653 VPlanPtr &Plan) { 8654 // If all incoming values are equal, the incoming VPValue can be used directly 8655 // instead of creating a new VPBlendRecipe. 8656 VPValue *FirstIncoming = Operands[0]; 8657 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) { 8658 return FirstIncoming == Inc; 8659 })) { 8660 return Operands[0]; 8661 } 8662 8663 // We know that all PHIs in non-header blocks are converted into selects, so 8664 // we don't have to worry about the insertion order and we can just use the 8665 // builder. At this point we generate the predication tree. There may be 8666 // duplications since this is a simple recursive scan, but future 8667 // optimizations will clean it up. 8668 SmallVector<VPValue *, 2> OperandsWithMask; 8669 unsigned NumIncoming = Phi->getNumIncomingValues(); 8670 8671 for (unsigned In = 0; In < NumIncoming; In++) { 8672 VPValue *EdgeMask = 8673 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8674 assert((EdgeMask || NumIncoming == 1) && 8675 "Multiple predecessors with one having a full mask"); 8676 OperandsWithMask.push_back(Operands[In]); 8677 if (EdgeMask) 8678 OperandsWithMask.push_back(EdgeMask); 8679 } 8680 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask)); 8681 } 8682 8683 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, 8684 ArrayRef<VPValue *> Operands, 8685 VFRange &Range) const { 8686 8687 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8688 [this, CI](ElementCount VF) { 8689 return CM.isScalarWithPredication(CI, VF); 8690 }, 8691 Range); 8692 8693 if (IsPredicated) 8694 return nullptr; 8695 8696 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8697 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8698 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8699 ID == Intrinsic::pseudoprobe || 8700 ID == Intrinsic::experimental_noalias_scope_decl)) 8701 return nullptr; 8702 8703 auto willWiden = [&](ElementCount VF) -> bool { 8704 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8705 // The following case may be scalarized depending on the VF. 8706 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8707 // version of the instruction. 8708 // Is it beneficial to perform intrinsic call compared to lib call? 8709 bool NeedToScalarize = false; 8710 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8711 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8712 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8713 assert((IntrinsicCost.isValid() || CallCost.isValid()) && 8714 "Either the intrinsic cost or vector call cost must be valid"); 8715 return UseVectorIntrinsic || !NeedToScalarize; 8716 }; 8717 8718 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8719 return nullptr; 8720 8721 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands()); 8722 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); 8723 } 8724 8725 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8726 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8727 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8728 // Instruction should be widened, unless it is scalar after vectorization, 8729 // scalarization is profitable or it is predicated. 8730 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8731 return CM.isScalarAfterVectorization(I, VF) || 8732 CM.isProfitableToScalarize(I, VF) || 8733 CM.isScalarWithPredication(I, VF); 8734 }; 8735 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8736 Range); 8737 } 8738 8739 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, 8740 ArrayRef<VPValue *> Operands) const { 8741 auto IsVectorizableOpcode = [](unsigned Opcode) { 8742 switch (Opcode) { 8743 case Instruction::Add: 8744 case Instruction::And: 8745 case Instruction::AShr: 8746 case Instruction::BitCast: 8747 case Instruction::FAdd: 8748 case Instruction::FCmp: 8749 case Instruction::FDiv: 8750 case Instruction::FMul: 8751 case Instruction::FNeg: 8752 case Instruction::FPExt: 8753 case Instruction::FPToSI: 8754 case Instruction::FPToUI: 8755 case Instruction::FPTrunc: 8756 case Instruction::FRem: 8757 case Instruction::FSub: 8758 case Instruction::ICmp: 8759 case Instruction::IntToPtr: 8760 case Instruction::LShr: 8761 case Instruction::Mul: 8762 case Instruction::Or: 8763 case Instruction::PtrToInt: 8764 case Instruction::SDiv: 8765 case Instruction::Select: 8766 case Instruction::SExt: 8767 case Instruction::Shl: 8768 case Instruction::SIToFP: 8769 case Instruction::SRem: 8770 case Instruction::Sub: 8771 case Instruction::Trunc: 8772 case Instruction::UDiv: 8773 case Instruction::UIToFP: 8774 case Instruction::URem: 8775 case Instruction::Xor: 8776 case Instruction::ZExt: 8777 return true; 8778 } 8779 return false; 8780 }; 8781 8782 if (!IsVectorizableOpcode(I->getOpcode())) 8783 return nullptr; 8784 8785 // Success: widen this instruction. 
8786 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end())); 8787 } 8788 8789 void VPRecipeBuilder::fixHeaderPhis() { 8790 BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); 8791 for (VPWidenPHIRecipe *R : PhisToFix) { 8792 auto *PN = cast<PHINode>(R->getUnderlyingValue()); 8793 VPRecipeBase *IncR = 8794 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch))); 8795 R->addOperand(IncR->getVPSingleValue()); 8796 } 8797 } 8798 8799 VPBasicBlock *VPRecipeBuilder::handleReplication( 8800 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8801 VPlanPtr &Plan) { 8802 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8803 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8804 Range); 8805 8806 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8807 [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); }, Range); 8808 8809 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8810 IsUniform, IsPredicated); 8811 setRecipe(I, Recipe); 8812 Plan->addVPValue(I, Recipe); 8813 8814 // Find if I uses a predicated instruction. If so, it will use its scalar 8815 // value. Avoid hoisting the insert-element which packs the scalar value into 8816 // a vector value, as that happens iff all users use the vector value. 8817 for (VPValue *Op : Recipe->operands()) { 8818 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef()); 8819 if (!PredR) 8820 continue; 8821 auto *RepR = 8822 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); 8823 assert(RepR->isPredicated() && 8824 "expected Replicate recipe to be predicated"); 8825 RepR->setAlsoPack(false); 8826 } 8827 8828 // Finalize the recipe for Instr, first if it is not predicated. 8829 if (!IsPredicated) { 8830 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8831 VPBB->appendRecipe(Recipe); 8832 return VPBB; 8833 } 8834 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8835 assert(VPBB->getSuccessors().empty() && 8836 "VPBB has successors when handling predicated replication."); 8837 // Record predicated instructions for above packing optimizations. 8838 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8839 VPBlockUtils::insertBlockAfter(Region, VPBB); 8840 auto *RegSucc = new VPBasicBlock(); 8841 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8842 return RegSucc; 8843 } 8844 8845 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8846 VPRecipeBase *PredRecipe, 8847 VPlanPtr &Plan) { 8848 // Instructions marked for predication are replicated and placed under an 8849 // if-then construct to prevent side-effects. 8850 8851 // Generate recipes to compute the block mask for this region. 8852 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8853 8854 // Build the triangular if-then region. 8855 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8856 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8857 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8858 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8859 auto *PHIRecipe = Instr->getType()->isVoidTy() 8860 ? 
nullptr 8861 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8862 if (PHIRecipe) { 8863 Plan->removeVPValueFor(Instr); 8864 Plan->addVPValue(Instr, PHIRecipe); 8865 } 8866 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8867 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8868 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8869 8870 // Note: first set Entry as region entry and then connect successors starting 8871 // from it in order, to propagate the "parent" of each VPBasicBlock. 8872 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8873 VPBlockUtils::connectBlocks(Pred, Exit); 8874 8875 return Region; 8876 } 8877 8878 VPRecipeOrVPValueTy 8879 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8880 ArrayRef<VPValue *> Operands, 8881 VFRange &Range, VPlanPtr &Plan) { 8882 // First, check for specific widening recipes that deal with calls, memory 8883 // operations, inductions and Phi nodes. 8884 if (auto *CI = dyn_cast<CallInst>(Instr)) 8885 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); 8886 8887 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) 8888 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); 8889 8890 VPRecipeBase *Recipe; 8891 if (auto Phi = dyn_cast<PHINode>(Instr)) { 8892 if (Phi->getParent() != OrigLoop->getHeader()) 8893 return tryToBlend(Phi, Operands, Plan); 8894 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) 8895 return toVPRecipeResult(Recipe); 8896 8897 if (Legal->isReductionVariable(Phi)) { 8898 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; 8899 assert(RdxDesc.getRecurrenceStartValue() == 8900 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); 8901 VPValue *StartV = Operands[0]; 8902 8903 auto *PhiRecipe = new VPWidenPHIRecipe(Phi, RdxDesc, *StartV); 8904 PhisToFix.push_back(PhiRecipe); 8905 // Record the incoming value from the backedge, so we can add the incoming 8906 // value from the backedge after all recipes have been created. 8907 recordRecipeOf(cast<Instruction>( 8908 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); 8909 return toVPRecipeResult(PhiRecipe); 8910 } 8911 8912 return toVPRecipeResult(new VPWidenPHIRecipe(Phi)); 8913 } 8914 8915 if (isa<TruncInst>(Instr) && 8916 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands, 8917 Range, *Plan))) 8918 return toVPRecipeResult(Recipe); 8919 8920 if (!shouldWiden(Instr, Range)) 8921 return nullptr; 8922 8923 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr)) 8924 return toVPRecipeResult(new VPWidenGEPRecipe( 8925 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop)); 8926 8927 if (auto *SI = dyn_cast<SelectInst>(Instr)) { 8928 bool InvariantCond = 8929 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop); 8930 return toVPRecipeResult(new VPWidenSelectRecipe( 8931 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); 8932 } 8933 8934 return toVPRecipeResult(tryToWiden(Instr, Operands)); 8935 } 8936 8937 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, 8938 ElementCount MaxVF) { 8939 assert(OrigLoop->isInnermost() && "Inner loop expected."); 8940 8941 // Collect instructions from the original loop that will become trivially dead 8942 // in the vectorized loop. We don't need to vectorize these instructions. 
For
8943 // example, original induction update instructions can become dead because we
8944 // separately emit induction "steps" when generating code for the new loop.
8945 // Similarly, we create a new latch condition when setting up the structure
8946 // of the new loop, so the old one can become dead.
8947 SmallPtrSet<Instruction *, 4> DeadInstructions;
8948 collectTriviallyDeadInstructions(DeadInstructions);
8949
8950 // Add assume instructions we need to drop to DeadInstructions, to prevent
8951 // them from being added to the VPlan.
8952 // TODO: We only need to drop assumes in blocks that get flattened. If the
8953 // control flow is preserved, we should keep them.
8954 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8955 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8956
8957 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8958 // Dead instructions do not need sinking. Remove them from SinkAfter.
8959 for (Instruction *I : DeadInstructions)
8960 SinkAfter.erase(I);
8961
8962 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8963 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8964 VFRange SubRange = {VF, MaxVFPlusOne};
8965 VPlans.push_back(
8966 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8967 VF = SubRange.End;
8968 }
8969 }
8970
8971 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8972 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8973 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8974
8975 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8976
8977 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8978
8979 // ---------------------------------------------------------------------------
8980 // Pre-construction: record ingredients whose recipes we'll need to further
8981 // process after constructing the initial VPlan.
8982 // ---------------------------------------------------------------------------
8983
8984 // Mark instructions we'll need to sink later and their targets as
8985 // ingredients whose recipe we'll need to record.
8986 for (auto &Entry : SinkAfter) {
8987 RecipeBuilder.recordRecipeOf(Entry.first);
8988 RecipeBuilder.recordRecipeOf(Entry.second);
8989 }
8990 for (auto &Reduction : CM.getInLoopReductionChains()) {
8991 PHINode *Phi = Reduction.first;
8992 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8993 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8994
8995 RecipeBuilder.recordRecipeOf(Phi);
8996 for (auto &R : ReductionOperations) {
8997 RecipeBuilder.recordRecipeOf(R);
8998 // For min/max reductions, where we have a pair of icmp/select, we also
8999 // need to record the ICmp recipe, so it can be removed later.
9000 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9001 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9002 }
9003 }
9004
9005 // For each interleave group which is relevant for this (possibly trimmed)
9006 // Range, add it to the set of groups to be later applied to the VPlan and add
9007 // placeholders for its members' Recipes which we'll be replacing with a
9008 // single VPInterleaveRecipe.
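// (Illustrative example: in a loop reading a[2*i] and a[2*i+1], the two loads
// form one interleave group with factor 2; each member still gets its own
// widen recipe recorded here as a placeholder, to be replaced by a single
// VPInterleaveRecipe once the initial VPlan has been built.)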
9009 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9010 auto applyIG = [IG, this](ElementCount VF) -> bool {
9011 return (VF.isVector() && // Query is illegal for VF == 1
9012 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9013 LoopVectorizationCostModel::CM_Interleave);
9014 };
9015 if (!getDecisionAndClampRange(applyIG, Range))
9016 continue;
9017 InterleaveGroups.insert(IG);
9018 for (unsigned i = 0; i < IG->getFactor(); i++)
9019 if (Instruction *Member = IG->getMember(i))
9020 RecipeBuilder.recordRecipeOf(Member);
9021 }
9022
9023 // ---------------------------------------------------------------------------
9024 // Build initial VPlan: Scan the body of the loop in a topological order to
9025 // visit each basic block after having visited its predecessor basic blocks.
9026 // ---------------------------------------------------------------------------
9027
9028 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
9029 auto Plan = std::make_unique<VPlan>();
9030 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
9031 Plan->setEntry(VPBB);
9032
9033 // Scan the body of the loop in a topological order to visit each basic block
9034 // after having visited its predecessor basic blocks.
9035 LoopBlocksDFS DFS(OrigLoop);
9036 DFS.perform(LI);
9037
9038 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9039 // Relevant instructions from basic block BB will be grouped into VPRecipe
9040 // ingredients and fill a new VPBasicBlock.
9041 unsigned VPBBsForBB = 0;
9042 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9043 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9044 VPBB = FirstVPBBForBB;
9045 Builder.setInsertPoint(VPBB);
9046
9047 // Introduce each ingredient into VPlan.
9048 // TODO: Model and preserve debug intrinsics in VPlan.
9049 for (Instruction &I : BB->instructionsWithoutDebug()) {
9050 Instruction *Instr = &I;
9051
9052 // First filter out irrelevant instructions, to ensure no recipes are
9053 // built for them.
9054 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9055 continue;
9056
9057 SmallVector<VPValue *, 4> Operands;
9058 auto *Phi = dyn_cast<PHINode>(Instr);
9059 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9060 Operands.push_back(Plan->getOrAddVPValue(
9061 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9062 } else {
9063 auto OpRange = Plan->mapToVPValues(Instr->operands());
9064 Operands = {OpRange.begin(), OpRange.end()};
9065 }
9066 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9067 Instr, Operands, Range, Plan)) {
9068 // If Instr can be simplified to an existing VPValue, use it.
9069 if (RecipeOrValue.is<VPValue *>()) {
9070 Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
9071 continue;
9072 }
9073 // Otherwise, add the new recipe.
9074 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9075 for (auto *Def : Recipe->definedValues()) {
9076 auto *UV = Def->getUnderlyingValue();
9077 Plan->addVPValue(UV, Def);
9078 }
9079
9080 RecipeBuilder.setRecipe(Instr, Recipe);
9081 VPBB->appendRecipe(Recipe);
9082 continue;
9083 }
9084
9085 // Otherwise, if all widening options failed, the instruction is to be
9086 // replicated. This may create a successor for VPBB.
9087 VPBasicBlock *NextVPBB =
9088 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9089 if (NextVPBB != VPBB) {
9090 VPBB = NextVPBB;
9091 VPBB->setName(BB->hasName() ? BB->getName() + "."
+ Twine(VPBBsForBB++) 9092 : ""); 9093 } 9094 } 9095 } 9096 9097 RecipeBuilder.fixHeaderPhis(); 9098 9099 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 9100 // may also be empty, such as the last one VPBB, reflecting original 9101 // basic-blocks with no recipes. 9102 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 9103 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 9104 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 9105 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 9106 delete PreEntry; 9107 9108 // --------------------------------------------------------------------------- 9109 // Transform initial VPlan: Apply previously taken decisions, in order, to 9110 // bring the VPlan to its final state. 9111 // --------------------------------------------------------------------------- 9112 9113 // Apply Sink-After legal constraints. 9114 for (auto &Entry : SinkAfter) { 9115 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 9116 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 9117 9118 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { 9119 auto *Region = 9120 dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent()); 9121 if (Region && Region->isReplicator()) 9122 return Region; 9123 return nullptr; 9124 }; 9125 9126 // If the target is in a replication region, make sure to move Sink to the 9127 // block after it, not into the replication region itself. 9128 if (auto *TargetRegion = GetReplicateRegion(Target)) { 9129 assert(TargetRegion->getNumSuccessors() == 1 && "Expected SESE region!"); 9130 assert(!GetReplicateRegion(Sink) && 9131 "cannot sink a region into another region yet"); 9132 VPBasicBlock *NextBlock = 9133 cast<VPBasicBlock>(TargetRegion->getSuccessors().front()); 9134 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 9135 continue; 9136 } 9137 9138 auto *SinkRegion = GetReplicateRegion(Sink); 9139 // Unless the sink source is in a replicate region, sink the recipe 9140 // directly. 9141 if (!SinkRegion) { 9142 Sink->moveAfter(Target); 9143 continue; 9144 } 9145 9146 // If the sink source is in a replicate region, we need to move the whole 9147 // replicate region, which should only contain a single recipe in the main 9148 // block. 9149 assert(Sink->getParent()->size() == 1 && 9150 "parent must be a replicator with a single recipe"); 9151 auto *SplitBlock = 9152 Target->getParent()->splitAt(std::next(Target->getIterator())); 9153 9154 auto *Pred = SinkRegion->getSinglePredecessor(); 9155 auto *Succ = SinkRegion->getSingleSuccessor(); 9156 VPBlockUtils::disconnectBlocks(Pred, SinkRegion); 9157 VPBlockUtils::disconnectBlocks(SinkRegion, Succ); 9158 VPBlockUtils::connectBlocks(Pred, Succ); 9159 9160 auto *SplitPred = SplitBlock->getSinglePredecessor(); 9161 9162 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); 9163 VPBlockUtils::connectBlocks(SplitPred, SinkRegion); 9164 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); 9165 if (VPBB == SplitPred) 9166 VPBB = SplitBlock; 9167 } 9168 9169 // Interleave memory: for each Interleave Group we marked earlier as relevant 9170 // for this VPlan, replace the Recipes widening its memory instructions with a 9171 // single VPInterleaveRecipe at its insertion point. 
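// (Continuing the a[2*i]/a[2*i+1] sketch from above: the per-load placeholder
// recipes are erased below, and any users of those loads are redirected to the
// corresponding results of the new VPInterleaveRecipe.)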
9172 for (auto IG : InterleaveGroups) { 9173 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 9174 RecipeBuilder.getRecipe(IG->getInsertPos())); 9175 SmallVector<VPValue *, 4> StoredValues; 9176 for (unsigned i = 0; i < IG->getFactor(); ++i) 9177 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 9178 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 9179 9180 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 9181 Recipe->getMask()); 9182 VPIG->insertBefore(Recipe); 9183 unsigned J = 0; 9184 for (unsigned i = 0; i < IG->getFactor(); ++i) 9185 if (Instruction *Member = IG->getMember(i)) { 9186 if (!Member->getType()->isVoidTy()) { 9187 VPValue *OriginalV = Plan->getVPValue(Member); 9188 Plan->removeVPValueFor(Member); 9189 Plan->addVPValue(Member, VPIG->getVPValue(J)); 9190 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 9191 J++; 9192 } 9193 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 9194 } 9195 } 9196 9197 // Adjust the recipes for any inloop reductions. 9198 if (Range.Start.isVector()) 9199 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 9200 9201 // Finally, if tail is folded by masking, introduce selects between the phi 9202 // and the live-out instruction of each reduction, at the end of the latch. 9203 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 9204 Builder.setInsertPoint(VPBB); 9205 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 9206 for (auto &Reduction : Legal->getReductionVars()) { 9207 if (CM.isInLoopReduction(Reduction.first)) 9208 continue; 9209 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 9210 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 9211 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 9212 } 9213 } 9214 9215 std::string PlanName; 9216 raw_string_ostream RSO(PlanName); 9217 ElementCount VF = Range.Start; 9218 Plan->addVF(VF); 9219 RSO << "Initial VPlan for VF={" << VF; 9220 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 9221 Plan->addVF(VF); 9222 RSO << "," << VF; 9223 } 9224 RSO << "},UF>=1"; 9225 RSO.flush(); 9226 Plan->setName(PlanName); 9227 9228 return Plan; 9229 } 9230 9231 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 9232 // Outer loop handling: They may require CFG and instruction level 9233 // transformations before even evaluating whether vectorization is profitable. 9234 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 9235 // the vectorization pipeline. 9236 assert(!OrigLoop->isInnermost()); 9237 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 9238 9239 // Create new empty VPlan 9240 auto Plan = std::make_unique<VPlan>(); 9241 9242 // Build hierarchical CFG 9243 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 9244 HCFGBuilder.buildHierarchicalCFG(); 9245 9246 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End); 9247 VF *= 2) 9248 Plan->addVF(VF); 9249 9250 if (EnableVPlanPredication) { 9251 VPlanPredicator VPP(*Plan); 9252 VPP.predicate(); 9253 9254 // Avoid running transformation to recipes until masked code generation in 9255 // VPlan-native path is in place. 9256 return Plan; 9257 } 9258 9259 SmallPtrSet<Instruction *, 1> DeadInstructions; 9260 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, 9261 Legal->getInductionVars(), 9262 DeadInstructions, *PSE.getSE()); 9263 return Plan; 9264 } 9265 9266 // Adjust the recipes for any inloop reductions. 
The chain of instructions
9267 // leading from the loop exit instr to the phi need to be converted to
9268 // reductions, with one operand being vector and the other being the scalar
9269 // reduction chain.
9270 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
9271 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
9272 for (auto &Reduction : CM.getInLoopReductionChains()) {
9273 PHINode *Phi = Reduction.first;
9274 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9275 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9276
9277 // ReductionOperations are ordered top-down from the phi's use to the
9278 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9279 // which of the two operands will remain scalar and which will be reduced.
9280 // For minmax the chain will be the select instructions.
9281 Instruction *Chain = Phi;
9282 for (Instruction *R : ReductionOperations) {
9283 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9284 RecurKind Kind = RdxDesc.getRecurrenceKind();
9285
9286 VPValue *ChainOp = Plan->getVPValue(Chain);
9287 unsigned FirstOpId;
9288 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9289 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9290 "Expected to replace a VPWidenSelectSC");
9291 FirstOpId = 1;
9292 } else {
9293 assert(isa<VPWidenRecipe>(WidenRecipe) &&
9294 "Expected to replace a VPWidenSC");
9295 FirstOpId = 0;
9296 }
9297 unsigned VecOpId =
9298 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9299 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9300
9301 auto *CondOp = CM.foldTailByMasking()
9302 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9303 : nullptr;
9304 VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9305 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9306 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9307 Plan->removeVPValueFor(R);
9308 Plan->addVPValue(R, RedRecipe);
9309 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9310 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9311 WidenRecipe->eraseFromParent();
9312
9313 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9314 VPRecipeBase *CompareRecipe =
9315 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9316 assert(isa<VPWidenRecipe>(CompareRecipe) &&
9317 "Expected to replace a VPWidenSC");
9318 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9319 "Expected no remaining users");
9320 CompareRecipe->eraseFromParent();
9321 }
9322 Chain = R;
9323 }
9324 }
9325 }
9326
9327 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9328 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9329 VPSlotTracker &SlotTracker) const {
9330 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9331 IG->getInsertPos()->printAsOperand(O, false);
9332 O << ", ";
9333 getAddr()->printAsOperand(O, SlotTracker);
9334 VPValue *Mask = getMask();
9335 if (Mask) {
9336 O << ", ";
9337 Mask->printAsOperand(O, SlotTracker);
9338 }
9339 for (unsigned i = 0; i < IG->getFactor(); ++i)
9340 if (Instruction *I = IG->getMember(i))
9341 O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i;
9342 }
9343 #endif
9344
9345 void VPWidenCallRecipe::execute(VPTransformState &State) {
9346 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9347 *this, State);
9348 }
9349
9350 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9351
State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 9352 this, *this, InvariantCond, State); 9353 } 9354 9355 void VPWidenRecipe::execute(VPTransformState &State) { 9356 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 9357 } 9358 9359 void VPWidenGEPRecipe::execute(VPTransformState &State) { 9360 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 9361 *this, State.UF, State.VF, IsPtrLoopInvariant, 9362 IsIndexLoopInvariant, State); 9363 } 9364 9365 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 9366 assert(!State.Instance && "Int or FP induction being replicated."); 9367 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 9368 getTruncInst(), getVPValue(0), 9369 getCastValue(), State); 9370 } 9371 9372 void VPWidenPHIRecipe::execute(VPTransformState &State) { 9373 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc, 9374 this, State); 9375 } 9376 9377 void VPBlendRecipe::execute(VPTransformState &State) { 9378 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9379 // We know that all PHIs in non-header blocks are converted into 9380 // selects, so we don't have to worry about the insertion order and we 9381 // can just use the builder. 9382 // At this point we generate the predication tree. There may be 9383 // duplications since this is a simple recursive scan, but future 9384 // optimizations will clean it up. 9385 9386 unsigned NumIncoming = getNumIncomingValues(); 9387 9388 // Generate a sequence of selects of the form: 9389 // SELECT(Mask3, In3, 9390 // SELECT(Mask2, In2, 9391 // SELECT(Mask1, In1, 9392 // In0))) 9393 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9394 // are essentially undef are taken from In0. 9395 InnerLoopVectorizer::VectorParts Entry(State.UF); 9396 for (unsigned In = 0; In < NumIncoming; ++In) { 9397 for (unsigned Part = 0; Part < State.UF; ++Part) { 9398 // We might have single edge PHIs (blocks) - use an identity 9399 // 'select' for the first PHI operand. 9400 Value *In0 = State.get(getIncomingValue(In), Part); 9401 if (In == 0) 9402 Entry[Part] = In0; // Initialize with the first incoming value. 9403 else { 9404 // Select between the current value and the previous incoming edge 9405 // based on the incoming mask. 
9406 Value *Cond = State.get(getMask(In), Part);
9407 Entry[Part] =
9408 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9409 }
9410 }
9411 }
9412 for (unsigned Part = 0; Part < State.UF; ++Part)
9413 State.set(this, Entry[Part], Part);
9414 }
9415
9416 void VPInterleaveRecipe::execute(VPTransformState &State) {
9417 assert(!State.Instance && "Interleave group being replicated.");
9418 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9419 getStoredValues(), getMask());
9420 }
9421
9422 void VPReductionRecipe::execute(VPTransformState &State) {
9423 assert(!State.Instance && "Reduction being replicated.");
9424 Value *PrevInChain = State.get(getChainOp(), 0);
9425 for (unsigned Part = 0; Part < State.UF; ++Part) {
9426 RecurKind Kind = RdxDesc->getRecurrenceKind();
9427 bool IsOrdered = useOrderedReductions(*RdxDesc);
9428 Value *NewVecOp = State.get(getVecOp(), Part);
9429 if (VPValue *Cond = getCondOp()) {
9430 Value *NewCond = State.get(Cond, Part);
9431 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9432 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
9433 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9434 Constant *IdenVec =
9435 ConstantVector::getSplat(VecTy->getElementCount(), Iden);
9436 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9437 NewVecOp = Select;
9438 }
9439 Value *NewRed;
9440 Value *NextInChain;
9441 if (IsOrdered) {
9442 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9443 PrevInChain);
9444 PrevInChain = NewRed;
9445 } else {
9446 PrevInChain = State.get(getChainOp(), Part);
9447 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9448 }
9449 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9450 NextInChain =
9451 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9452 NewRed, PrevInChain);
9453 } else if (IsOrdered)
9454 NextInChain = NewRed;
9455 else {
9456 NextInChain = State.Builder.CreateBinOp(
9457 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9458 PrevInChain);
9459 }
9460 State.set(this, NextInChain, Part);
9461 }
9462 }
9463
9464 void VPReplicateRecipe::execute(VPTransformState &State) {
9465 if (State.Instance) { // Generate a single instance.
9466 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9467 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9468 *State.Instance, IsPredicated, State);
9469 // Insert scalar instance packing it into a vector.
9470 if (AlsoPack && State.VF.isVector()) {
9471 // If we're constructing lane 0, initialize to start from poison.
9472 if (State.Instance->Lane.isFirstLane()) {
9473 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9474 Value *Poison = PoisonValue::get(
9475 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9476 State.set(this, Poison, State.Instance->Part);
9477 }
9478 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9479 }
9480 return;
9481 }
9482
9483 // Generate scalar instances for all VF lanes of all UF parts, unless the
9484 // instruction is uniform, in which case generate only the first lane for each
9485 // of the UF parts.
9486 unsigned EndLane = IsUniform ?
1 : State.VF.getKnownMinValue(); 9487 assert((!State.VF.isScalable() || IsUniform) && 9488 "Can't scalarize a scalable vector"); 9489 for (unsigned Part = 0; Part < State.UF; ++Part) 9490 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9491 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this, 9492 VPIteration(Part, Lane), IsPredicated, 9493 State); 9494 } 9495 9496 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 9497 assert(State.Instance && "Branch on Mask works only on single instance."); 9498 9499 unsigned Part = State.Instance->Part; 9500 unsigned Lane = State.Instance->Lane.getKnownLane(); 9501 9502 Value *ConditionBit = nullptr; 9503 VPValue *BlockInMask = getMask(); 9504 if (BlockInMask) { 9505 ConditionBit = State.get(BlockInMask, Part); 9506 if (ConditionBit->getType()->isVectorTy()) 9507 ConditionBit = State.Builder.CreateExtractElement( 9508 ConditionBit, State.Builder.getInt32(Lane)); 9509 } else // Block in mask is all-one. 9510 ConditionBit = State.Builder.getTrue(); 9511 9512 // Replace the temporary unreachable terminator with a new conditional branch, 9513 // whose two destinations will be set later when they are created. 9514 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 9515 assert(isa<UnreachableInst>(CurrentTerminator) && 9516 "Expected to replace unreachable terminator with conditional branch."); 9517 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 9518 CondBr->setSuccessor(0, nullptr); 9519 ReplaceInstWithInst(CurrentTerminator, CondBr); 9520 } 9521 9522 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 9523 assert(State.Instance && "Predicated instruction PHI works per instance."); 9524 Instruction *ScalarPredInst = 9525 cast<Instruction>(State.get(getOperand(0), *State.Instance)); 9526 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 9527 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 9528 assert(PredicatingBB && "Predicated block has no single predecessor."); 9529 assert(isa<VPReplicateRecipe>(getOperand(0)) && 9530 "operand must be VPReplicateRecipe"); 9531 9532 // By current pack/unpack logic we need to generate only a single phi node: if 9533 // a vector value for the predicated instruction exists at this point it means 9534 // the instruction has vector users only, and a phi for the vector value is 9535 // needed. In this case the recipe of the predicated instruction is marked to 9536 // also do that packing, thereby "hoisting" the insert-element sequence. 9537 // Otherwise, a phi node for the scalar value is needed. 9538 unsigned Part = State.Instance->Part; 9539 if (State.hasVectorValue(getOperand(0), Part)) { 9540 Value *VectorValue = State.get(getOperand(0), Part); 9541 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9542 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9543 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9544 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9545 if (State.hasVectorValue(this, Part)) 9546 State.reset(this, VPhi, Part); 9547 else 9548 State.set(this, VPhi, Part); 9549 // NOTE: Currently we need to update the value of the operand, so the next 9550 // predicated iteration inserts its generated value in the correct vector. 
9551 State.reset(getOperand(0), VPhi, Part); 9552 } else { 9553 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType(); 9554 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9555 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), 9556 PredicatingBB); 9557 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9558 if (State.hasScalarValue(this, *State.Instance)) 9559 State.reset(this, Phi, *State.Instance); 9560 else 9561 State.set(this, Phi, *State.Instance); 9562 // NOTE: Currently we need to update the value of the operand, so the next 9563 // predicated iteration inserts its generated value in the correct vector. 9564 State.reset(getOperand(0), Phi, *State.Instance); 9565 } 9566 } 9567 9568 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9569 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9570 State.ILV->vectorizeMemoryInstruction( 9571 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(), 9572 StoredValue, getMask()); 9573 } 9574 9575 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9576 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9577 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9578 // for predication. 9579 static ScalarEpilogueLowering getScalarEpilogueLowering( 9580 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9581 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9582 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9583 LoopVectorizationLegality &LVL) { 9584 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9585 // don't look at hints or options, and don't request a scalar epilogue. 9586 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9587 // LoopAccessInfo (due to code dependency and not being able to reliably get 9588 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9589 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9590 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9591 // back to the old way and vectorize with versioning when forced. See D81345.) 9592 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9593 PGSOQueryType::IRPass) && 9594 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9595 return CM_ScalarEpilogueNotAllowedOptSize; 9596 9597 // 2) If set, obey the directives 9598 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9599 switch (PreferPredicateOverEpilogue) { 9600 case PreferPredicateTy::ScalarEpilogue: 9601 return CM_ScalarEpilogueAllowed; 9602 case PreferPredicateTy::PredicateElseScalarEpilogue: 9603 return CM_ScalarEpilogueNotNeededUsePredicate; 9604 case PreferPredicateTy::PredicateOrDontVectorize: 9605 return CM_ScalarEpilogueNotAllowedUsePredicate; 9606 }; 9607 } 9608 9609 // 3) If set, obey the hints 9610 switch (Hints.getPredicate()) { 9611 case LoopVectorizeHints::FK_Enabled: 9612 return CM_ScalarEpilogueNotNeededUsePredicate; 9613 case LoopVectorizeHints::FK_Disabled: 9614 return CM_ScalarEpilogueAllowed; 9615 }; 9616 9617 // 4) if the TTI hook indicates this is profitable, request predication. 
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));

  // Set the insert point after the last scalarized instruction. This
  // ensures the insertelement sequence will directly follow the scalar
  // definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP = std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
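// For example (an illustrative sketch, not a statement of current support):
// under -enable-vplan-native-path, an explicitly annotated outer loop such as
//   #pragma clang loop vectorize(enable)
//   for (int i = 0; i < N; ++i)        // outer loop taken down this path
//     for (int j = 0; j < M; ++j)
//       A[i][j] = B[i][j] + C[i][j];
// is planned and widened by the function below rather than by the usual
// inner-loop path.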
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
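  // Illustrative pattern this flags (a sketch, assuming a float store fed by
  // double-precision arithmetic somewhere in the loop):
  //   %e = fpext float %a to double
  //   %m = fmul double %e, %b
  //   %t = fptrunc double %m to float
  //   store float %t, float* %p
  // The fpext forces <N x double> operations, halving the effective number of
  // lanes per vector register compared to pure-float code.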
9769 SmallPtrSet<const Instruction *, 4> Visited; 9770 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9771 while (!Worklist.empty()) { 9772 auto *I = Worklist.pop_back_val(); 9773 if (!L->contains(I)) 9774 continue; 9775 if (!Visited.insert(I).second) 9776 continue; 9777 9778 // Emit a remark if the floating point store required a floating 9779 // point conversion. 9780 // TODO: More work could be done to identify the root cause such as a 9781 // constant or a function return type and point the user to it. 9782 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9783 ORE->emit([&]() { 9784 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9785 I->getDebugLoc(), L->getHeader()) 9786 << "floating point conversion changes vector width. " 9787 << "Mixed floating point precision requires an up/down " 9788 << "cast that will negatively impact performance."; 9789 }); 9790 9791 for (Use &Op : I->operands()) 9792 if (auto *OpI = dyn_cast<Instruction>(Op)) 9793 Worklist.push_back(OpI); 9794 } 9795 } 9796 9797 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9798 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9799 !EnableLoopInterleaving), 9800 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9801 !EnableLoopVectorization) {} 9802 9803 bool LoopVectorizePass::processLoop(Loop *L) { 9804 assert((EnableVPlanNativePath || L->isInnermost()) && 9805 "VPlan-native path is not enabled. Only process inner loops."); 9806 9807 #ifndef NDEBUG 9808 const std::string DebugLocStr = getDebugLocString(L); 9809 #endif /* NDEBUG */ 9810 9811 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9812 << L->getHeader()->getParent()->getName() << "\" from " 9813 << DebugLocStr << "\n"); 9814 9815 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9816 9817 LLVM_DEBUG( 9818 dbgs() << "LV: Loop hints:" 9819 << " force=" 9820 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9821 ? "disabled" 9822 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9823 ? "enabled" 9824 : "?")) 9825 << " width=" << Hints.getWidth() 9826 << " interleave=" << Hints.getInterleave() << "\n"); 9827 9828 // Function containing loop 9829 Function *F = L->getHeader()->getParent(); 9830 9831 // Looking at the diagnostic output is the only way to determine if a loop 9832 // was vectorized (other than looking at the IR or machine code), so it 9833 // is important to generate an optimization remark for each loop. Most of 9834 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9835 // generated as OptimizationRemark and OptimizationRemarkMissed are 9836 // less verbose reporting vectorized loops and unvectorized loops that may 9837 // benefit from vectorization, respectively. 9838 9839 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9840 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9841 return false; 9842 } 9843 9844 PredicatedScalarEvolution PSE(*SE, *L); 9845 9846 // Check if it is legal to vectorize the loop. 9847 LoopVectorizationRequirements Requirements; 9848 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9849 &Requirements, &Hints, DB, AC, BFI, PSI); 9850 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9851 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9852 Hints.emitRemarkWithHints(); 9853 return false; 9854 } 9855 9856 // Check the function attributes and profiles to find out if this function 9857 // should be optimized for size. 
9858 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9859 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9860 9861 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9862 // here. They may require CFG and instruction level transformations before 9863 // even evaluating whether vectorization is profitable. Since we cannot modify 9864 // the incoming IR, we need to build VPlan upfront in the vectorization 9865 // pipeline. 9866 if (!L->isInnermost()) 9867 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9868 ORE, BFI, PSI, Hints, Requirements); 9869 9870 assert(L->isInnermost() && "Inner loop expected."); 9871 9872 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9873 // count by optimizing for size, to minimize overheads. 9874 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9875 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9876 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9877 << "This loop is worth vectorizing only if no scalar " 9878 << "iteration overheads are incurred."); 9879 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9880 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9881 else { 9882 LLVM_DEBUG(dbgs() << "\n"); 9883 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9884 } 9885 } 9886 9887 // Check the function attributes to see if implicit floats are allowed. 9888 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9889 // an integer loop and the vector instructions selected are purely integer 9890 // vector instructions? 9891 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9892 reportVectorizationFailure( 9893 "Can't vectorize when the NoImplicitFloat attribute is used", 9894 "loop not vectorized due to NoImplicitFloat attribute", 9895 "NoImplicitFloat", ORE, L); 9896 Hints.emitRemarkWithHints(); 9897 return false; 9898 } 9899 9900 // Check if the target supports potentially unsafe FP vectorization. 9901 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9902 // for the target we're vectorizing for, to make sure none of the 9903 // additional fp-math flags can help. 9904 if (Hints.isPotentiallyUnsafe() && 9905 TTI->isFPVectorizationPotentiallyUnsafe()) { 9906 reportVectorizationFailure( 9907 "Potentially unsafe FP op prevents vectorization", 9908 "loop not vectorized due to unsafe FP support.", 9909 "UnsafeFP", ORE, L); 9910 Hints.emitRemarkWithHints(); 9911 return false; 9912 } 9913 9914 if (!Requirements.canVectorizeFPMath(Hints)) { 9915 ORE->emit([&]() { 9916 auto *ExactFPMathInst = Requirements.getExactFPInst(); 9917 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 9918 ExactFPMathInst->getDebugLoc(), 9919 ExactFPMathInst->getParent()) 9920 << "loop not vectorized: cannot prove it is safe to reorder " 9921 "floating-point operations"; 9922 }); 9923 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 9924 "reorder floating-point operations\n"); 9925 Hints.emitRemarkWithHints(); 9926 return false; 9927 } 9928 9929 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9930 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9931 9932 // If an override option has been passed in for interleaved accesses, use it. 9933 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9934 UseInterleaved = EnableInterleavedMemAccesses; 9935 9936 // Analyze interleaved memory accesses. 
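  // An illustrative case (sketch only): accesses like
  //   for (i = 0; i < n; i++) {
  //     sum += A[2 * i];      // even elements
  //     sum += A[2 * i + 1];  // odd elements
  //   }
  // form an interleave group with factor 2 that can be lowered to one wide
  // load plus shufflevectors instead of two strided/gathered loads.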
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
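    // (Illustrative: these remarks typically reach the user via clang's
    // -Rpass-missed=loop-vectorize / -Rpass-analysis=loop-vectorize options,
    // surfacing messages such as "the cost-model indicates that vectorization
    // is not beneficial" at the loop's start location.)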
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
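        // Illustrative shape of the final CFG after both passes (the concrete
        // VFs are whatever the cost model chose, e.g. main VF=8, epilogue
        // VF=4):
        //   iteration-count checks -> main vector loop (VF=8)
        //                          -> epilogue vector loop (VF=4)
        //                          -> scalar remainder loop
        // with runtime guards deciding how far down this chain a given trip
        // count gets.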
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
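  // (Here "simplified form" is the usual loop-simplify shape, roughly: a
  // single preheader, a single backedge/latch, and dedicated exit blocks,
  // e.g. preheader -> header ... latch -> dedicated exit, which the legality
  // checks and code generation below rely on.)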
10192 for (auto &L : *LI) 10193 Changed |= CFGChanged |= 10194 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10195 10196 // Build up a worklist of inner-loops to vectorize. This is necessary as 10197 // the act of vectorizing or partially unrolling a loop creates new loops 10198 // and can invalidate iterators across the loops. 10199 SmallVector<Loop *, 8> Worklist; 10200 10201 for (Loop *L : *LI) 10202 collectSupportedLoops(*L, LI, ORE, Worklist); 10203 10204 LoopsAnalyzed += Worklist.size(); 10205 10206 // Now walk the identified inner loops. 10207 while (!Worklist.empty()) { 10208 Loop *L = Worklist.pop_back_val(); 10209 10210 // For the inner loops we actually process, form LCSSA to simplify the 10211 // transform. 10212 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10213 10214 Changed |= CFGChanged |= processLoop(L); 10215 } 10216 10217 // Process each loop nest in the function. 10218 return LoopVectorizeResult(Changed, CFGChanged); 10219 } 10220 10221 PreservedAnalyses LoopVectorizePass::run(Function &F, 10222 FunctionAnalysisManager &AM) { 10223 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10224 auto &LI = AM.getResult<LoopAnalysis>(F); 10225 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10226 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10227 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 10228 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10229 auto &AA = AM.getResult<AAManager>(F); 10230 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10231 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10232 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10233 MemorySSA *MSSA = EnableMSSALoopDependency 10234 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 10235 : nullptr; 10236 10237 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 10238 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 10239 [&](Loop &L) -> const LoopAccessInfo & { 10240 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 10241 TLI, TTI, nullptr, MSSA}; 10242 return LAM.getResult<LoopAccessAnalysis>(L, AR); 10243 }; 10244 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10245 ProfileSummaryInfo *PSI = 10246 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10247 LoopVectorizeResult Result = 10248 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 10249 if (!Result.MadeAnyChange) 10250 return PreservedAnalyses::all(); 10251 PreservedAnalyses PA; 10252 10253 // We currently do not preserve loopinfo/dominator analyses with outer loop 10254 // vectorization. Until this is addressed, mark these analyses as preserved 10255 // only for non-VPlan-native path. 10256 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10257 if (!EnableVPlanNativePath) { 10258 PA.preserve<LoopAnalysis>(); 10259 PA.preserve<DominatorTreeAnalysis>(); 10260 } 10261 PA.preserve<BasicAA>(); 10262 PA.preserve<GlobalsAA>(); 10263 if (!Result.MadeCFGChange) 10264 PA.preserveSet<CFGAnalyses>(); 10265 return PA; 10266 } 10267