//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
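//
// As a simple illustration of the 'wide' iteration transformation described
// above (hypothetical source code, not actual output of this pass), a loop
// such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is conceptually rewritten at a vectorization factor of 4 so that each wide
// iteration loads, adds and stores four consecutive elements at a time, with
// the remaining n % 4 iterations handled by a scalar epilogue loop.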
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
"llvm/Transforms/Vectorize/LoopVectorizationLegality.h" 144 #include <algorithm> 145 #include <cassert> 146 #include <cstdint> 147 #include <cstdlib> 148 #include <functional> 149 #include <iterator> 150 #include <limits> 151 #include <memory> 152 #include <string> 153 #include <tuple> 154 #include <utility> 155 156 using namespace llvm; 157 158 #define LV_NAME "loop-vectorize" 159 #define DEBUG_TYPE LV_NAME 160 161 #ifndef NDEBUG 162 const char VerboseDebug[] = DEBUG_TYPE "-verbose"; 163 #endif 164 165 /// @{ 166 /// Metadata attribute names 167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all"; 168 const char LLVMLoopVectorizeFollowupVectorized[] = 169 "llvm.loop.vectorize.followup_vectorized"; 170 const char LLVMLoopVectorizeFollowupEpilogue[] = 171 "llvm.loop.vectorize.followup_epilogue"; 172 /// @} 173 174 STATISTIC(LoopsVectorized, "Number of loops vectorized"); 175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); 177 178 static cl::opt<bool> EnableEpilogueVectorization( 179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden, 180 cl::desc("Enable vectorization of epilogue loops.")); 181 182 static cl::opt<unsigned> EpilogueVectorizationForceVF( 183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, 184 cl::desc("When epilogue vectorization is enabled, and a value greater than " 185 "1 is specified, forces the given VF for all applicable epilogue " 186 "loops.")); 187 188 static cl::opt<unsigned> EpilogueVectorizationMinVF( 189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden, 190 cl::desc("Only loops with vectorization factor equal to or larger than " 191 "the specified value are considered for epilogue vectorization.")); 192 193 /// Loops with a known constant trip count below this number are vectorized only 194 /// if no scalar iteration overheads are incurred. 195 static cl::opt<unsigned> TinyTripCountVectorThreshold( 196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden, 197 cl::desc("Loops with a constant trip count that is smaller than this " 198 "value are vectorized only if no scalar iteration overheads " 199 "are incurred.")); 200 201 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, 202 // that predication is preferred, and this lists all options. I.e., the 203 // vectorizer will try to fold the tail-loop (epilogue) into the vector body 204 // and predicate the instructions accordingly. 
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
Mostly " 274 "useful for getting consistent testing.")); 275 276 static cl::opt<bool> ForceTargetSupportsScalableVectors( 277 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, 278 cl::desc( 279 "Pretend that scalable vectors are supported, even if the target does " 280 "not support them. This flag should only be used for testing.")); 281 282 static cl::opt<unsigned> SmallLoopCost( 283 "small-loop-cost", cl::init(20), cl::Hidden, 284 cl::desc( 285 "The cost of a loop that is considered 'small' by the interleaver.")); 286 287 static cl::opt<bool> LoopVectorizeWithBlockFrequency( 288 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, 289 cl::desc("Enable the use of the block frequency analysis to access PGO " 290 "heuristics minimizing code growth in cold regions and being more " 291 "aggressive in hot regions.")); 292 293 // Runtime interleave loops for load/store throughput. 294 static cl::opt<bool> EnableLoadStoreRuntimeInterleave( 295 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 296 cl::desc( 297 "Enable runtime interleaving until load/store ports are saturated")); 298 299 /// Interleave small loops with scalar reductions. 300 static cl::opt<bool> InterleaveSmallLoopScalarReduction( 301 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden, 302 cl::desc("Enable interleaving for loops with small iteration counts that " 303 "contain scalar reductions to expose ILP.")); 304 305 /// The number of stores in a loop that are allowed to need predication. 306 static cl::opt<unsigned> NumberOfStoresToPredicate( 307 "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 308 cl::desc("Max number of stores to be predicated behind an if.")); 309 310 static cl::opt<bool> EnableIndVarRegisterHeur( 311 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 312 cl::desc("Count the induction variable only once when interleaving")); 313 314 static cl::opt<bool> EnableCondStoresVectorization( 315 "enable-cond-stores-vec", cl::init(true), cl::Hidden, 316 cl::desc("Enable if predication of stores during vectorization.")); 317 318 static cl::opt<unsigned> MaxNestedScalarReductionIC( 319 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 320 cl::desc("The maximum interleave count to use when interleaving a scalar " 321 "reduction in a nested loop.")); 322 323 static cl::opt<bool> 324 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), 325 cl::Hidden, 326 cl::desc("Prefer in-loop vector reductions, " 327 "overriding the targets preference.")); 328 329 static cl::opt<bool> PreferPredicatedReductionSelect( 330 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden, 331 cl::desc( 332 "Prefer predicating a reduction operation over an after loop select.")); 333 334 cl::opt<bool> EnableVPlanNativePath( 335 "enable-vplan-native-path", cl::init(false), cl::Hidden, 336 cl::desc("Enable VPlan-native vectorization path with " 337 "support for outer loop vectorization.")); 338 339 // FIXME: Remove this switch once we have divergence analysis. Currently we 340 // assume divergent non-backedge branches when this switch is true. 341 cl::opt<bool> EnableVPlanPredication( 342 "enable-vplan-predication", cl::init(false), cl::Hidden, 343 cl::desc("Enable VPlan-native vectorization path predicator with " 344 "support for outer loop vectorization.")); 345 346 // This flag enables the stress testing of the VPlan H-CFG construction in the 347 // VPlan-native vectorization path. 
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
                           Value *StartV, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p Operands instead of
  /// \p Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start,
                             TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetLibraryInfo *TLI,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 OptimizationRemarkEmitter *ORE,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationLegality *LVL,
                                 llvm::LoopVectorizationCostModel *CM,
                                 BlockFrequencyInfo *BFI,
                                 ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
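///
/// As a rough illustration only (not the exact criterion implemented in this
/// class), a vectorization factor VF is attractive when the estimated cost of
/// one wide iteration, divided by VF, is smaller than the estimated cost of
/// one scalar iteration, i.e. roughly VectorCost(VF) / VF < ScalarCost.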
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
1295 SmallVector<RegisterUsage, 8> 1296 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1297 1298 /// Collect values we want to ignore in the cost model. 1299 void collectValuesToIgnore(); 1300 1301 /// Split reductions into those that happen in the loop, and those that happen 1302 /// outside. In loop reductions are collected into InLoopReductionChains. 1303 void collectInLoopReductions(); 1304 1305 /// \returns The smallest bitwidth each instruction can be represented with. 1306 /// The vector equivalents of these instructions should be truncated to this 1307 /// type. 1308 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1309 return MinBWs; 1310 } 1311 1312 /// \returns True if it is more profitable to scalarize instruction \p I for 1313 /// vectorization factor \p VF. 1314 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1315 assert(VF.isVector() && 1316 "Profitable to scalarize relevant only for VF > 1."); 1317 1318 // Cost model is not run in the VPlan-native path - return conservative 1319 // result until this changes. 1320 if (EnableVPlanNativePath) 1321 return false; 1322 1323 auto Scalars = InstsToScalarize.find(VF); 1324 assert(Scalars != InstsToScalarize.end() && 1325 "VF not yet analyzed for scalarization profitability"); 1326 return Scalars->second.find(I) != Scalars->second.end(); 1327 } 1328 1329 /// Returns true if \p I is known to be uniform after vectorization. 1330 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1331 if (VF.isScalar()) 1332 return true; 1333 1334 // Cost model is not run in the VPlan-native path - return conservative 1335 // result until this changes. 1336 if (EnableVPlanNativePath) 1337 return false; 1338 1339 auto UniformsPerVF = Uniforms.find(VF); 1340 assert(UniformsPerVF != Uniforms.end() && 1341 "VF not yet analyzed for uniformity"); 1342 return UniformsPerVF->second.count(I); 1343 } 1344 1345 /// Returns true if \p I is known to be scalar after vectorization. 1346 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1347 if (VF.isScalar()) 1348 return true; 1349 1350 // Cost model is not run in the VPlan-native path - return conservative 1351 // result until this changes. 1352 if (EnableVPlanNativePath) 1353 return false; 1354 1355 auto ScalarsPerVF = Scalars.find(VF); 1356 assert(ScalarsPerVF != Scalars.end() && 1357 "Scalar values are not calculated for VF"); 1358 return ScalarsPerVF->second.count(I); 1359 } 1360 1361 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1362 /// for vectorization factor \p VF. 1363 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1364 return VF.isVector() && MinBWs.find(I) != MinBWs.end() && 1365 !isProfitableToScalarize(I, VF) && 1366 !isScalarAfterVectorization(I, VF); 1367 } 1368 1369 /// Decision that was taken during cost calculation for memory instruction. 1370 enum InstWidening { 1371 CM_Unknown, 1372 CM_Widen, // For consecutive accesses with stride +1. 1373 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1374 CM_Interleave, 1375 CM_GatherScatter, 1376 CM_Scalarize 1377 }; 1378 1379 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1380 /// instruction \p I and vector width \p VF. 
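  ///
  /// As a rough illustration of the decisions recorded here (a sketch for a
  /// load of i32 at VF = 4; the actual IR is only produced later, during
  /// vectorization):
  ///   CM_Widen:         one wide load of <4 x i32>
  ///   CM_Widen_Reverse: a wide load followed by a reverse shuffle
  ///   CM_GatherScatter: a call to the llvm.masked.gather intrinsic
  ///   CM_Interleave:    a single wide load shared by the interleave group
  ///   CM_Scalarize:     four scalar loads, one per lane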
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
1455 return Legal->isInductionPhi(Op); 1456 } 1457 1458 /// Collects the instructions to scalarize for each predicated instruction in 1459 /// the loop. 1460 void collectInstsToScalarize(ElementCount VF); 1461 1462 /// Collect Uniform and Scalar values for the given \p VF. 1463 /// The sets depend on CM decision for Load/Store instructions 1464 /// that may be vectorized as interleave, gather-scatter or scalarized. 1465 void collectUniformsAndScalars(ElementCount VF) { 1466 // Do the analysis once. 1467 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) 1468 return; 1469 setCostBasedWideningDecision(VF); 1470 collectLoopUniforms(VF); 1471 collectLoopScalars(VF); 1472 } 1473 1474 /// Returns true if the target machine supports masked store operation 1475 /// for the given \p DataType and kind of access to \p Ptr. 1476 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) { 1477 return Legal->isConsecutivePtr(Ptr) && 1478 TTI.isLegalMaskedStore(DataType, Alignment); 1479 } 1480 1481 /// Returns true if the target machine supports masked load operation 1482 /// for the given \p DataType and kind of access to \p Ptr. 1483 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) { 1484 return Legal->isConsecutivePtr(Ptr) && 1485 TTI.isLegalMaskedLoad(DataType, Alignment); 1486 } 1487 1488 /// Returns true if the target machine supports masked scatter operation 1489 /// for the given \p DataType. 1490 bool isLegalMaskedScatter(Type *DataType, Align Alignment) { 1491 return TTI.isLegalMaskedScatter(DataType, Alignment); 1492 } 1493 1494 /// Returns true if the target machine supports masked gather operation 1495 /// for the given \p DataType. 1496 bool isLegalMaskedGather(Type *DataType, Align Alignment) { 1497 return TTI.isLegalMaskedGather(DataType, Alignment); 1498 } 1499 1500 /// Returns true if the target machine can represent \p V as a masked gather 1501 /// or scatter operation. 1502 bool isLegalGatherOrScatter(Value *V) { 1503 bool LI = isa<LoadInst>(V); 1504 bool SI = isa<StoreInst>(V); 1505 if (!LI && !SI) 1506 return false; 1507 auto *Ty = getMemInstValueType(V); 1508 Align Align = getLoadStoreAlignment(V); 1509 return (LI && isLegalMaskedGather(Ty, Align)) || 1510 (SI && isLegalMaskedScatter(Ty, Align)); 1511 } 1512 1513 /// Returns true if \p I is an instruction that will be scalarized with 1514 /// predication. Such instructions include conditional stores and 1515 /// instructions that may divide by zero. 1516 /// If a non-zero VF has been calculated, we check if I will be scalarized 1517 /// predication for that VF. 1518 bool isScalarWithPredication(Instruction *I, 1519 ElementCount VF = ElementCount::getFixed(1)); 1520 1521 // Returns true if \p I is an instruction that will be predicated either 1522 // through scalar predication or masked load/store or masked gather/scatter. 1523 // Superset of instructions that return true for isScalarWithPredication. 1524 bool isPredicatedInst(Instruction *I) { 1525 if (!blockNeedsPredication(I->getParent())) 1526 return false; 1527 // Loads and stores that need some form of masked operation are predicated 1528 // instructions. 1529 if (isa<LoadInst>(I) || isa<StoreInst>(I)) 1530 return Legal->isMaskRequired(I); 1531 return isScalarWithPredication(I); 1532 } 1533 1534 /// Returns true if \p I is a memory instruction with consecutive memory 1535 /// access that can be widened. 
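  ///
  /// For example (illustrative): a load of a[i] whose address advances with
  /// stride +1 (or -1) across iterations is consecutive and can be widened
  /// into one wide load per unroll part, whereas a[3 * i] or a[b[i]] is not
  /// consecutive and has to be handled as a gather/scatter or be scalarized.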
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue() const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available, or it is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
                                          Type *VectorTy,
                                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not divisible by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup of the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
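  ///
  /// For example (illustrative): if a predicated store is going to be
  /// scalarized anyway and the address computation feeding it has no other
  /// users, scalarizing that single-use chain as well avoids materializing
  /// vector values only to extract them again inside the predicated block,
  /// so the chain receives a positive discount.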
1752 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1753 ElementCount VF); 1754 1755 /// Collect the instructions that are uniform after vectorization. An 1756 /// instruction is uniform if we represent it with a single scalar value in 1757 /// the vectorized loop corresponding to each vector iteration. Examples of 1758 /// uniform instructions include pointer operands of consecutive or 1759 /// interleaved memory accesses. Note that although uniformity implies an 1760 /// instruction will be scalar, the reverse is not true. In general, a 1761 /// scalarized instruction will be represented by VF scalar values in the 1762 /// vectorized loop, each corresponding to an iteration of the original 1763 /// scalar loop. 1764 void collectLoopUniforms(ElementCount VF); 1765 1766 /// Collect the instructions that are scalar after vectorization. An 1767 /// instruction is scalar if it is known to be uniform or will be scalarized 1768 /// during vectorization. Non-uniform scalarized instructions will be 1769 /// represented by VF values in the vectorized loop, each corresponding to an 1770 /// iteration of the original scalar loop. 1771 void collectLoopScalars(ElementCount VF); 1772 1773 /// Keeps cost model vectorization decision and cost for instructions. 1774 /// Right now it is used for memory instructions only. 1775 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, 1776 std::pair<InstWidening, InstructionCost>>; 1777 1778 DecisionList WideningDecisions; 1779 1780 /// Returns true if \p V is expected to be vectorized and it needs to be 1781 /// extracted. 1782 bool needsExtract(Value *V, ElementCount VF) const { 1783 Instruction *I = dyn_cast<Instruction>(V); 1784 if (VF.isScalar() || !I || !TheLoop->contains(I) || 1785 TheLoop->isLoopInvariant(I)) 1786 return false; 1787 1788 // Assume we can vectorize V (and hence we need extraction) if the 1789 // scalars are not computed yet. This can happen, because it is called 1790 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1791 // the scalars are collected. That should be a safe assumption in most 1792 // cases, because we check if the operands have vectorizable types 1793 // beforehand in LoopVectorizationLegality. 1794 return Scalars.find(VF) == Scalars.end() || 1795 !isScalarAfterVectorization(I, VF); 1796 }; 1797 1798 /// Returns a range containing only operands needing to be extracted. 1799 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1800 ElementCount VF) { 1801 return SmallVector<Value *, 4>(make_filter_range( 1802 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1803 } 1804 1805 /// Determines if we have the infrastructure to vectorize loop \p L and its 1806 /// epilogue, assuming the main loop is vectorized by \p VF. 1807 bool isCandidateForEpilogueVectorization(const Loop &L, 1808 const ElementCount VF) const; 1809 1810 /// Returns true if epilogue vectorization is considered profitable, and 1811 /// false otherwise. 1812 /// \p VF is the vectorization factor chosen for the original loop. 1813 bool isEpilogueVectorizationProfitable(const ElementCount VF) const; 1814 1815 public: 1816 /// The loop that we evaluate. 1817 Loop *TheLoop; 1818 1819 /// Predicated scalar evolution analysis. 1820 PredicatedScalarEvolution &PSE; 1821 1822 /// Loop Info analysis. 1823 LoopInfo *LI; 1824 1825 /// Vectorization legality. 1826 LoopVectorizationLegality *Legal; 1827 1828 /// Vector target information. 
1829 const TargetTransformInfo &TTI; 1830 1831 /// Target Library Info. 1832 const TargetLibraryInfo *TLI; 1833 1834 /// Demanded bits analysis. 1835 DemandedBits *DB; 1836 1837 /// Assumption cache. 1838 AssumptionCache *AC; 1839 1840 /// Interface to emit optimization remarks. 1841 OptimizationRemarkEmitter *ORE; 1842 1843 const Function *TheFunction; 1844 1845 /// Loop Vectorize Hint. 1846 const LoopVectorizeHints *Hints; 1847 1848 /// The interleave access information contains groups of interleaved accesses 1849 /// with the same stride and close to each other. 1850 InterleavedAccessInfo &InterleaveInfo; 1851 1852 /// Values to ignore in the cost model. 1853 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1854 1855 /// Values to ignore in the cost model when VF > 1. 1856 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1857 1858 /// Profitable vector factors. 1859 SmallVector<VectorizationFactor, 8> ProfitableVFs; 1860 }; 1861 1862 } // end namespace llvm 1863 1864 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1865 // vectorization. The loop needs to be annotated with #pragma omp simd 1866 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1867 // vector length information is not provided, vectorization is not considered 1868 // explicit. Interleave hints are not allowed either. These limitations will be 1869 // relaxed in the future. 1870 // Please, note that we are currently forced to abuse the pragma 'clang 1871 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1872 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1873 // provides *explicit vectorization hints* (LV can bypass legal checks and 1874 // assume that vectorization is legal). However, both hints are implemented 1875 // using the same metadata (llvm.loop.vectorize, processed by 1876 // LoopVectorizeHints). This will be fixed in the future when the native IR 1877 // representation for pragma 'omp simd' is introduced. 1878 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1879 OptimizationRemarkEmitter *ORE) { 1880 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 1881 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1882 1883 // Only outer loops with an explicit vectorization hint are supported. 1884 // Unannotated outer loops are ignored. 1885 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1886 return false; 1887 1888 Function *Fn = OuterLp->getHeader()->getParent(); 1889 if (!Hints.allowVectorization(Fn, OuterLp, 1890 true /*VectorizeOnlyWhenForced*/)) { 1891 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1892 return false; 1893 } 1894 1895 if (Hints.getInterleave() > 1) { 1896 // TODO: Interleave support is future work. 1897 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1898 "outer loops.\n"); 1899 Hints.emitRemarkWithHints(); 1900 return false; 1901 } 1902 1903 return true; 1904 } 1905 1906 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1907 OptimizationRemarkEmitter *ORE, 1908 SmallVectorImpl<Loop *> &V) { 1909 // Collect inner loops and outer loops without irreducible control flow. For 1910 // now, only collect outer loops that have explicit vectorization hints. If we 1911 // are stress testing the VPlan H-CFG construction, we collect the outermost 1912 // loop of every loop nest. 
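  // For instance (illustrative only), with the VPlan-native path enabled the
  // outer loop below carries an explicit hint with a vector length and would
  // be collected, while its inner loop would not be collected separately:
  //
  //   #pragma clang loop vectorize(enable) vectorize_width(4)
  //   for (i = 0; i < N; ++i)     // outer loop: collected
  //     for (j = 0; j < M; ++j)   // inner loop: skipped here
  //       A[i][j] += B[j];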
1913 if (L.isInnermost() || VPlanBuildStressTest || 1914 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1915 LoopBlocksRPO RPOT(&L); 1916 RPOT.perform(LI); 1917 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1918 V.push_back(&L); 1919 // TODO: Collect inner loops inside marked outer loops in case 1920 // vectorization fails for the outer loop. Do not invoke 1921 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1922 // already known to be reducible. We can use an inherited attribute for 1923 // that. 1924 return; 1925 } 1926 } 1927 for (Loop *InnerL : L) 1928 collectSupportedLoops(*InnerL, LI, ORE, V); 1929 } 1930 1931 namespace { 1932 1933 /// The LoopVectorize Pass. 1934 struct LoopVectorize : public FunctionPass { 1935 /// Pass identification, replacement for typeid 1936 static char ID; 1937 1938 LoopVectorizePass Impl; 1939 1940 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1941 bool VectorizeOnlyWhenForced = false) 1942 : FunctionPass(ID), 1943 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) { 1944 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1945 } 1946 1947 bool runOnFunction(Function &F) override { 1948 if (skipFunction(F)) 1949 return false; 1950 1951 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1952 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1953 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1954 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1955 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1956 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1957 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1958 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1959 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1960 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1961 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1962 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1963 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1964 1965 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1966 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1967 1968 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1969 GetLAA, *ORE, PSI).MadeAnyChange; 1970 } 1971 1972 void getAnalysisUsage(AnalysisUsage &AU) const override { 1973 AU.addRequired<AssumptionCacheTracker>(); 1974 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1975 AU.addRequired<DominatorTreeWrapperPass>(); 1976 AU.addRequired<LoopInfoWrapperPass>(); 1977 AU.addRequired<ScalarEvolutionWrapperPass>(); 1978 AU.addRequired<TargetTransformInfoWrapperPass>(); 1979 AU.addRequired<AAResultsWrapperPass>(); 1980 AU.addRequired<LoopAccessLegacyAnalysis>(); 1981 AU.addRequired<DemandedBitsWrapperPass>(); 1982 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1983 AU.addRequired<InjectTLIMappingsLegacy>(); 1984 1985 // We currently do not preserve loopinfo/dominator analyses with outer loop 1986 // vectorization. Until this is addressed, mark these analyses as preserved 1987 // only for non-VPlan-native path. 1988 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1989 if (!EnableVPlanNativePath) { 1990 AU.addPreserved<LoopInfoWrapperPass>(); 1991 AU.addPreserved<DominatorTreeWrapperPass>(); 1992 } 1993 1994 AU.addPreserved<BasicAAWrapperPass>(); 1995 AU.addPreserved<GlobalsAAWrapperPass>(); 1996 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1997 } 1998 }; 1999 2000 } // end anonymous namespace 2001 2002 //===----------------------------------------------------------------------===// 2003 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2004 // LoopVectorizationCostModel and LoopVectorizationPlanner. 2005 //===----------------------------------------------------------------------===// 2006 2007 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 2008 // We need to place the broadcast of invariant variables outside the loop, 2009 // but only if it's proven safe to do so. Else, broadcast will be inside 2010 // vector loop body. 2011 Instruction *Instr = dyn_cast<Instruction>(V); 2012 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 2013 (!Instr || 2014 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 2015 // Place the code for broadcasting invariant variables in the new preheader. 2016 IRBuilder<>::InsertPointGuard Guard(Builder); 2017 if (SafeToHoist) 2018 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2019 2020 // Broadcast the scalar into all locations in the vector. 2021 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 2022 2023 return Shuf; 2024 } 2025 2026 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 2027 const InductionDescriptor &II, Value *Step, Value *Start, 2028 Instruction *EntryVal) { 2029 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2030 "Expected either an induction phi-node or a truncate of it!"); 2031 2032 // Construct the initial value of the vector IV in the vector loop preheader 2033 auto CurrIP = Builder.saveIP(); 2034 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 2035 if (isa<TruncInst>(EntryVal)) { 2036 assert(Start->getType()->isIntegerTy() && 2037 "Truncation requires an integer type"); 2038 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 2039 Step = Builder.CreateTrunc(Step, TruncType); 2040 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 2041 } 2042 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 2043 Value *SteppedStart = 2044 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 2045 2046 // We create vector phi nodes for both integer and floating-point induction 2047 // variables. Here, we determine the kind of arithmetic we will perform. 2048 Instruction::BinaryOps AddOp; 2049 Instruction::BinaryOps MulOp; 2050 if (Step->getType()->isIntegerTy()) { 2051 AddOp = Instruction::Add; 2052 MulOp = Instruction::Mul; 2053 } else { 2054 AddOp = II.getInductionOpcode(); 2055 MulOp = Instruction::FMul; 2056 } 2057 2058 // Multiply the vectorization factor by the step using integer or 2059 // floating-point arithmetic as appropriate. 2060 Value *ConstVF = 2061 getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); 2062 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 2063 2064 // Create a vector splat to use in the induction update. 2065 // 2066 // FIXME: If the step is non-constant, we create the vector splat with 2067 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 2068 // handle a constant vector splat. 
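  // Overall shape of the vector IV built here (an illustrative sketch for
  // VF = 4, UF = 1 and an integer step %step; the names are approximate):
  //
  //   vector.ph:
  //     %stepped.start = %splat.start + <0, 1, 2, 3> * %step
  //   vector.body:
  //     %vec.ind      = phi [ %stepped.start, %vector.ph ],
  //                         [ %vec.ind.next, %vector.body ]
  //     %vec.ind.next = %vec.ind + splat(4 * %step)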
2069 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2070 Value *SplatVF = isa<Constant>(Mul) 2071 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 2072 : Builder.CreateVectorSplat(VF, Mul); 2073 Builder.restoreIP(CurrIP); 2074 2075 // We may need to add the step a number of times, depending on the unroll 2076 // factor. The last of those goes into the PHI. 2077 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 2078 &*LoopVectorBody->getFirstInsertionPt()); 2079 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 2080 Instruction *LastInduction = VecInd; 2081 for (unsigned Part = 0; Part < UF; ++Part) { 2082 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 2083 2084 if (isa<TruncInst>(EntryVal)) 2085 addMetadata(LastInduction, EntryVal); 2086 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 2087 2088 LastInduction = cast<Instruction>(addFastMathFlag( 2089 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 2090 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 2091 } 2092 2093 // Move the last step to the end of the latch block. This ensures consistent 2094 // placement of all induction updates. 2095 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 2096 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 2097 auto *ICmp = cast<Instruction>(Br->getCondition()); 2098 LastInduction->moveBefore(ICmp); 2099 LastInduction->setName("vec.ind.next"); 2100 2101 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 2102 VecInd->addIncoming(LastInduction, LoopVectorLatch); 2103 } 2104 2105 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 2106 return Cost->isScalarAfterVectorization(I, VF) || 2107 Cost->isProfitableToScalarize(I, VF); 2108 } 2109 2110 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 2111 if (shouldScalarizeInstruction(IV)) 2112 return true; 2113 auto isScalarInst = [&](User *U) -> bool { 2114 auto *I = cast<Instruction>(U); 2115 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 2116 }; 2117 return llvm::any_of(IV->users(), isScalarInst); 2118 } 2119 2120 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 2121 const InductionDescriptor &ID, const Instruction *EntryVal, 2122 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 2123 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 2124 "Expected either an induction phi-node or a truncate of it!"); 2125 2126 // This induction variable is not the phi from the original loop but the 2127 // newly-created IV based on the proof that casted Phi is equal to the 2128 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 2129 // re-uses the same InductionDescriptor that original IV uses but we don't 2130 // have to do any recording in this case - that is done when original IV is 2131 // processed. 2132 if (isa<TruncInst>(EntryVal)) 2133 return; 2134 2135 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 2136 if (Casts.empty()) 2137 return; 2138 // Only the first Cast instruction in the Casts vector is of interest. 2139 // The rest of the Casts (if exist) have no uses outside the 2140 // induction update chain itself. 
2141 Instruction *CastInst = *Casts.begin(); 2142 if (Lane < UINT_MAX) 2143 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 2144 else 2145 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 2146 } 2147 2148 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, 2149 TruncInst *Trunc) { 2150 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 2151 "Primary induction variable must have an integer type"); 2152 2153 auto II = Legal->getInductionVars().find(IV); 2154 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 2155 2156 auto ID = II->second; 2157 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 2158 2159 // The value from the original loop to which we are mapping the new induction 2160 // variable. 2161 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 2162 2163 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2164 2165 // Generate code for the induction step. Note that induction steps are 2166 // required to be loop-invariant 2167 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 2168 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 2169 "Induction step should be loop invariant"); 2170 if (PSE.getSE()->isSCEVable(IV->getType())) { 2171 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 2172 return Exp.expandCodeFor(Step, Step->getType(), 2173 LoopVectorPreHeader->getTerminator()); 2174 } 2175 return cast<SCEVUnknown>(Step)->getValue(); 2176 }; 2177 2178 // The scalar value to broadcast. This is derived from the canonical 2179 // induction variable. If a truncation type is given, truncate the canonical 2180 // induction variable and step. Otherwise, derive these values from the 2181 // induction descriptor. 2182 auto CreateScalarIV = [&](Value *&Step) -> Value * { 2183 Value *ScalarIV = Induction; 2184 if (IV != OldInduction) { 2185 ScalarIV = IV->getType()->isIntegerTy() 2186 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 2187 : Builder.CreateCast(Instruction::SIToFP, Induction, 2188 IV->getType()); 2189 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 2190 ScalarIV->setName("offset.idx"); 2191 } 2192 if (Trunc) { 2193 auto *TruncType = cast<IntegerType>(Trunc->getType()); 2194 assert(Step->getType()->isIntegerTy() && 2195 "Truncation requires an integer step"); 2196 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 2197 Step = Builder.CreateTrunc(Step, TruncType); 2198 } 2199 return ScalarIV; 2200 }; 2201 2202 // Create the vector values from the scalar IV, in the absence of creating a 2203 // vector IV. 2204 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 2205 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 2206 for (unsigned Part = 0; Part < UF; ++Part) { 2207 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2208 Value *EntryPart = 2209 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, 2210 ID.getInductionOpcode()); 2211 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 2212 if (Trunc) 2213 addMetadata(EntryPart, Trunc); 2214 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 2215 } 2216 }; 2217 2218 // Now do the actual transformations, and start with creating the step value. 
2219 Value *Step = CreateStepValue(ID.getStep()); 2220 if (VF.isZero() || VF.isScalar()) { 2221 Value *ScalarIV = CreateScalarIV(Step); 2222 CreateSplatIV(ScalarIV, Step); 2223 return; 2224 } 2225 2226 // Determine if we want a scalar version of the induction variable. This is 2227 // true if the induction variable itself is not widened, or if it has at 2228 // least one user in the loop that is not widened. 2229 auto NeedsScalarIV = needsScalarInduction(EntryVal); 2230 if (!NeedsScalarIV) { 2231 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2232 return; 2233 } 2234 2235 // Try to create a new independent vector induction variable. If we can't 2236 // create the phi node, we will splat the scalar induction variable in each 2237 // loop iteration. 2238 if (!shouldScalarizeInstruction(EntryVal)) { 2239 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal); 2240 Value *ScalarIV = CreateScalarIV(Step); 2241 // Create scalar steps that can be used by instructions we will later 2242 // scalarize. Note that the addition of the scalar steps will not increase 2243 // the number of instructions in the loop in the common case prior to 2244 // InstCombine. We will be trading one vector extract for each scalar step. 2245 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2246 return; 2247 } 2248 2249 // All IV users are scalar instructions, so only emit a scalar IV, not a 2250 // vectorised IV. Except when we tail-fold, then the splat IV feeds the 2251 // predicate used by the masked loads/stores. 2252 Value *ScalarIV = CreateScalarIV(Step); 2253 if (!Cost->isScalarEpilogueAllowed()) 2254 CreateSplatIV(ScalarIV, Step); 2255 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 2256 } 2257 2258 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 2259 Instruction::BinaryOps BinOp) { 2260 // Create and check the types. 2261 auto *ValVTy = cast<FixedVectorType>(Val->getType()); 2262 int VLen = ValVTy->getNumElements(); 2263 2264 Type *STy = Val->getType()->getScalarType(); 2265 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 2266 "Induction Step must be an integer or FP"); 2267 assert(Step->getType() == STy && "Step has wrong type"); 2268 2269 SmallVector<Constant *, 8> Indices; 2270 2271 if (STy->isIntegerTy()) { 2272 // Create a vector of consecutive numbers from zero to VF. 2273 for (int i = 0; i < VLen; ++i) 2274 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 2275 2276 // Add the consecutive indices to the vector value. 2277 Constant *Cv = ConstantVector::get(Indices); 2278 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 2279 Step = Builder.CreateVectorSplat(VLen, Step); 2280 assert(Step->getType() == Val->getType() && "Invalid step vec"); 2281 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 2282 // which can be found from the original scalar operations. 2283 Step = Builder.CreateMul(Cv, Step); 2284 return Builder.CreateAdd(Val, Step, "induction"); 2285 } 2286 2287 // Floating point induction. 2288 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 2289 "Binary Opcode should be specified for FP induction"); 2290 // Create a vector of consecutive numbers from zero to VF. 2291 for (int i = 0; i < VLen; ++i) 2292 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 2293 2294 // Add the consecutive indices to the vector value. 
2295 Constant *Cv = ConstantVector::get(Indices); 2296 2297 Step = Builder.CreateVectorSplat(VLen, Step); 2298 2299 // Floating point operations had to be 'fast' to enable the induction. 2300 FastMathFlags Flags; 2301 Flags.setFast(); 2302 2303 Value *MulOp = Builder.CreateFMul(Cv, Step); 2304 if (isa<Instruction>(MulOp)) 2305 // Have to check, MulOp may be a constant 2306 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 2307 2308 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 2309 if (isa<Instruction>(BOp)) 2310 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2311 return BOp; 2312 } 2313 2314 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 2315 Instruction *EntryVal, 2316 const InductionDescriptor &ID) { 2317 // We shouldn't have to build scalar steps if we aren't vectorizing. 2318 assert(VF.isVector() && "VF should be greater than one"); 2319 // Get the value type and ensure it and the step have the same integer type. 2320 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 2321 assert(ScalarIVTy == Step->getType() && 2322 "Val and Step should have the same type"); 2323 2324 // We build scalar steps for both integer and floating-point induction 2325 // variables. Here, we determine the kind of arithmetic we will perform. 2326 Instruction::BinaryOps AddOp; 2327 Instruction::BinaryOps MulOp; 2328 if (ScalarIVTy->isIntegerTy()) { 2329 AddOp = Instruction::Add; 2330 MulOp = Instruction::Mul; 2331 } else { 2332 AddOp = ID.getInductionOpcode(); 2333 MulOp = Instruction::FMul; 2334 } 2335 2336 // Determine the number of scalars we need to generate for each unroll 2337 // iteration. If EntryVal is uniform, we only need to generate the first 2338 // lane. Otherwise, we generate all VF values. 2339 unsigned Lanes = 2340 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) 2341 ? 1 2342 : VF.getKnownMinValue(); 2343 assert((!VF.isScalable() || Lanes == 1) && 2344 "Should never scalarize a scalable vector"); 2345 // Compute the scalar steps and save the results in VectorLoopValueMap. 2346 for (unsigned Part = 0; Part < UF; ++Part) { 2347 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 2348 auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), 2349 ScalarIVTy->getScalarSizeInBits()); 2350 Value *StartIdx = 2351 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); 2352 if (ScalarIVTy->isFloatingPointTy()) 2353 StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); 2354 StartIdx = addFastMathFlag(Builder.CreateBinOp( 2355 AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); 2356 // The step returned by `createStepForVF` is a runtime-evaluated value 2357 // when VF is scalable. Otherwise, it should be folded into a Constant. 
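      // For example (illustrative, fixed VF = 4): Part 0 yields a StartIdx
      // of 0 and Part 1 a StartIdx of 4; for scalable vectors the value is
      // roughly (4 * vscale * Part), materialized at runtime.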
2358 assert((VF.isScalable() || isa<Constant>(StartIdx)) && 2359 "Expected StartIdx to be folded to a constant when VF is not " 2360 "scalable"); 2361 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 2362 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 2363 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 2364 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 2365 } 2366 } 2367 } 2368 2369 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2370 assert(V != Induction && "The new induction variable should not be used."); 2371 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2372 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2373 2374 // If we have a stride that is replaced by one, do it here. Defer this for 2375 // the VPlan-native path until we start running Legal checks in that path. 2376 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2377 V = ConstantInt::get(V->getType(), 1); 2378 2379 // If we have a vector mapped to this value, return it. 2380 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2381 return VectorLoopValueMap.getVectorValue(V, Part); 2382 2383 // If the value has not been vectorized, check if it has been scalarized 2384 // instead. If it has been scalarized, and we actually need the value in 2385 // vector form, we will construct the vector values on demand. 2386 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2387 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2388 2389 // If we've scalarized a value, that value should be an instruction. 2390 auto *I = cast<Instruction>(V); 2391 2392 // If we aren't vectorizing, we can just copy the scalar map values over to 2393 // the vector map. 2394 if (VF.isScalar()) { 2395 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2396 return ScalarValue; 2397 } 2398 2399 // Get the last scalar instruction we generated for V and Part. If the value 2400 // is known to be uniform after vectorization, this corresponds to lane zero 2401 // of the Part unroll iteration. Otherwise, the last instruction is the one 2402 // we created for the last vector lane of the Part unroll iteration. 2403 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) 2404 ? 0 2405 : VF.getKnownMinValue() - 1; 2406 assert((!VF.isScalable() || LastLane == 0) && 2407 "Scalable vectorization can't lead to any scalarized values."); 2408 auto *LastInst = cast<Instruction>( 2409 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2410 2411 // Set the insert point after the last scalarized instruction. This ensures 2412 // the insertelement sequence will directly follow the scalar definitions. 2413 auto OldIP = Builder.saveIP(); 2414 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2415 Builder.SetInsertPoint(&*NewIP); 2416 2417 // However, if we are vectorizing, we need to construct the vector values. 2418 // If the value is known to be uniform after vectorization, we can just 2419 // broadcast the scalar value corresponding to lane zero for each unroll 2420 // iteration. Otherwise, we construct the vector values using insertelement 2421 // instructions. Since the resulting vectors are stored in 2422 // VectorLoopValueMap, we will only generate the insertelements once. 
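    // Sketch of the packing emitted for a non-uniform value at VF = 4 (an
    // illustration only; i32 is just an example element type):
    //
    //   %v0 = insertelement <4 x i32> poison, i32 %s0, i32 0
    //   %v1 = insertelement <4 x i32> %v0,    i32 %s1, i32 1
    //   %v2 = insertelement <4 x i32> %v1,    i32 %s2, i32 2
    //   %v3 = insertelement <4 x i32> %v2,    i32 %s3, i32 3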
2423 Value *VectorValue = nullptr; 2424 if (Cost->isUniformAfterVectorization(I, VF)) { 2425 VectorValue = getBroadcastInstrs(ScalarValue); 2426 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2427 } else { 2428 // Initialize packing with insertelements to start from poison. 2429 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2430 Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF)); 2431 VectorLoopValueMap.setVectorValue(V, Part, Poison); 2432 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) 2433 packScalarIntoVectorValue(V, {Part, Lane}); 2434 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2435 } 2436 Builder.restoreIP(OldIP); 2437 return VectorValue; 2438 } 2439 2440 // If this scalar is unknown, assume that it is a constant or that it is 2441 // loop invariant. Broadcast V and save the value for future uses. 2442 Value *B = getBroadcastInstrs(V); 2443 VectorLoopValueMap.setVectorValue(V, Part, B); 2444 return B; 2445 } 2446 2447 Value * 2448 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2449 const VPIteration &Instance) { 2450 // If the value is not an instruction contained in the loop, it should 2451 // already be scalar. 2452 if (OrigLoop->isLoopInvariant(V)) 2453 return V; 2454 2455 assert(Instance.Lane > 0 2456 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2457 : true && "Uniform values only have lane zero"); 2458 2459 // If the value from the original loop has not been vectorized, it is 2460 // represented by UF x VF scalar values in the new loop. Return the requested 2461 // scalar value. 2462 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2463 return VectorLoopValueMap.getScalarValue(V, Instance); 2464 2465 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2466 // for the given unroll part. If this entry is not a vector type (i.e., the 2467 // vectorization factor is one), there is no need to generate an 2468 // extractelement instruction. 2469 auto *U = getOrCreateVectorValue(V, Instance.Part); 2470 if (!U->getType()->isVectorTy()) { 2471 assert(VF.isScalar() && "Value not scalarized has non-vector type"); 2472 return U; 2473 } 2474 2475 // Otherwise, the value from the original loop has been vectorized and is 2476 // represented by UF vector values. Extract and return the requested scalar 2477 // value from the appropriate vector lane. 
2478 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2479 } 2480 2481 void InnerLoopVectorizer::packScalarIntoVectorValue( 2482 Value *V, const VPIteration &Instance) { 2483 assert(V != Induction && "The new induction variable should not be used."); 2484 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2485 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2486 2487 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2488 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2489 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2490 Builder.getInt32(Instance.Lane)); 2491 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2492 } 2493 2494 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2495 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2496 assert(!VF.isScalable() && "Cannot reverse scalable vectors"); 2497 SmallVector<int, 8> ShuffleMask; 2498 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 2499 ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); 2500 2501 return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse"); 2502 } 2503 2504 // Return whether we allow using masked interleave-groups (for dealing with 2505 // strided loads/stores that reside in predicated blocks, or for dealing 2506 // with gaps). 2507 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2508 // If an override option has been passed in for interleaved accesses, use it. 2509 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2510 return EnableMaskedInterleavedMemAccesses; 2511 2512 return TTI.enableMaskedInterleavedAccessVectorization(); 2513 } 2514 2515 // Try to vectorize the interleave group that \p Instr belongs to. 2516 // 2517 // E.g. Translate following interleaved load group (factor = 3): 2518 // for (i = 0; i < N; i+=3) { 2519 // R = Pic[i]; // Member of index 0 2520 // G = Pic[i+1]; // Member of index 1 2521 // B = Pic[i+2]; // Member of index 2 2522 // ... // do something to R, G, B 2523 // } 2524 // To: 2525 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2526 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2527 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2528 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2529 // 2530 // Or translate following interleaved store group (factor = 3): 2531 // for (i = 0; i < N; i+=3) { 2532 // ... do something to R, G, B 2533 // Pic[i] = R; // Member of index 0 2534 // Pic[i+1] = G; // Member of index 1 2535 // Pic[i+2] = B; // Member of index 2 2536 // } 2537 // To: 2538 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2539 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2540 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2541 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2542 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2543 void InnerLoopVectorizer::vectorizeInterleaveGroup( 2544 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2545 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2546 VPValue *BlockInMask) { 2547 Instruction *Instr = Group->getInsertPos(); 2548 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2549 2550 // Prepare for the vector type of the interleaved load/store. 
2551 Type *ScalarTy = getMemInstValueType(Instr); 2552 unsigned InterleaveFactor = Group->getFactor(); 2553 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2554 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2555 2556 // Prepare for the new pointers. 2557 SmallVector<Value *, 2> AddrParts; 2558 unsigned Index = Group->getIndex(Instr); 2559 2560 // TODO: extend the masked interleaved-group support to reversed access. 2561 assert((!BlockInMask || !Group->isReverse()) && 2562 "Reversed masked interleave-group not supported."); 2563 2564 // If the group is reverse, adjust the index to refer to the last vector lane 2565 // instead of the first. We adjust the index from the first vector lane, 2566 // rather than directly getting the pointer for lane VF - 1, because the 2567 // pointer operand of the interleaved access is supposed to be uniform. For 2568 // uniform instructions, we're only required to generate a value for the 2569 // first vector lane in each unroll iteration. 2570 assert(!VF.isScalable() && 2571 "scalable vector reverse operation is not implemented"); 2572 if (Group->isReverse()) 2573 Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); 2574 2575 for (unsigned Part = 0; Part < UF; Part++) { 2576 Value *AddrPart = State.get(Addr, {Part, 0}); 2577 setDebugLocFromInst(Builder, AddrPart); 2578 2579 // Notice current instruction could be any index. Need to adjust the address 2580 // to the member of index 0. 2581 // 2582 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2583 // b = A[i]; // Member of index 0 2584 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2585 // 2586 // E.g. A[i+1] = a; // Member of index 1 2587 // A[i] = b; // Member of index 0 2588 // A[i+2] = c; // Member of index 2 (Current instruction) 2589 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2590 2591 bool InBounds = false; 2592 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2593 InBounds = gep->isInBounds(); 2594 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2595 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2596 2597 // Cast to the vector pointer type. 2598 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2599 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2600 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2601 } 2602 2603 setDebugLocFromInst(Builder, Instr); 2604 Value *PoisonVec = PoisonValue::get(VecTy); 2605 2606 Value *MaskForGaps = nullptr; 2607 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2608 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2609 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2610 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2611 } 2612 2613 // Vectorize the interleaved load group. 2614 if (isa<LoadInst>(Instr)) { 2615 // For each unroll part, create a wide load for the group. 
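    // For example (illustrative, VF = 4 and interleave factor 2): a block
    // mask <m0, m1, m2, m3> is replicated below to
    // <m0, m0, m1, m1, m2, m2, m3, m3> so that both members of an original
    // iteration see that iteration's mask bit, and is then AND'ed with the
    // gap mask, if present, before issuing the masked wide load.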
2616 SmallVector<Value *, 2> NewLoads; 2617 for (unsigned Part = 0; Part < UF; Part++) { 2618 Instruction *NewLoad; 2619 if (BlockInMask || MaskForGaps) { 2620 assert(useMaskedInterleavedAccesses(*TTI) && 2621 "masked interleaved groups are not allowed."); 2622 Value *GroupMask = MaskForGaps; 2623 if (BlockInMask) { 2624 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2625 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2626 Value *ShuffledMask = Builder.CreateShuffleVector( 2627 BlockInMaskPart, 2628 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2629 "interleaved.mask"); 2630 GroupMask = MaskForGaps 2631 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2632 MaskForGaps) 2633 : ShuffledMask; 2634 } 2635 NewLoad = 2636 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2637 GroupMask, PoisonVec, "wide.masked.vec"); 2638 } 2639 else 2640 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2641 Group->getAlign(), "wide.vec"); 2642 Group->addMetadata(NewLoad); 2643 NewLoads.push_back(NewLoad); 2644 } 2645 2646 // For each member in the group, shuffle out the appropriate data from the 2647 // wide loads. 2648 unsigned J = 0; 2649 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2650 Instruction *Member = Group->getMember(I); 2651 2652 // Skip the gaps in the group. 2653 if (!Member) 2654 continue; 2655 2656 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2657 auto StrideMask = 2658 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2659 for (unsigned Part = 0; Part < UF; Part++) { 2660 Value *StridedVec = Builder.CreateShuffleVector( 2661 NewLoads[Part], StrideMask, "strided.vec"); 2662 2663 // If this member has different type, cast the result type. 2664 if (Member->getType() != ScalarTy) { 2665 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2666 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2667 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2668 } 2669 2670 if (Group->isReverse()) 2671 StridedVec = reverseVector(StridedVec); 2672 2673 State.set(VPDefs[J], Member, StridedVec, Part); 2674 } 2675 ++J; 2676 } 2677 return; 2678 } 2679 2680 // The sub vector type for current instruction. 2681 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2682 auto *SubVT = VectorType::get(ScalarTy, VF); 2683 2684 // Vectorize the interleaved store group. 2685 for (unsigned Part = 0; Part < UF; Part++) { 2686 // Collect the stored vector from each member. 2687 SmallVector<Value *, 4> StoredVecs; 2688 for (unsigned i = 0; i < InterleaveFactor; i++) { 2689 // Interleaved store group doesn't allow a gap, so each index has a member 2690 assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); 2691 2692 Value *StoredVec = State.get(StoredValues[i], Part); 2693 2694 if (Group->isReverse()) 2695 StoredVec = reverseVector(StoredVec); 2696 2697 // If this member has different type, cast it to a unified type. 2698 2699 if (StoredVec->getType() != SubVT) 2700 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2701 2702 StoredVecs.push_back(StoredVec); 2703 } 2704 2705 // Concatenate all vectors into a wide vector. 2706 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2707 2708 // Interleave the elements in the wide vector. 
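    // E.g. (illustrative, VF = 4, factor = 3): the interleave mask built below
    // is <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, producing exactly the
    // %interleaved.vec layout shown in the store example at the top of this
    // function.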
2709 assert(!VF.isScalable() && "scalable vectors not yet supported."); 2710 Value *IVec = Builder.CreateShuffleVector( 2711 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), 2712 "interleaved.vec"); 2713 2714 Instruction *NewStoreInstr; 2715 if (BlockInMask) { 2716 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2717 Value *ShuffledMask = Builder.CreateShuffleVector( 2718 BlockInMaskPart, 2719 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2720 "interleaved.mask"); 2721 NewStoreInstr = Builder.CreateMaskedStore( 2722 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2723 } 2724 else 2725 NewStoreInstr = 2726 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2727 2728 Group->addMetadata(NewStoreInstr); 2729 } 2730 } 2731 2732 void InnerLoopVectorizer::vectorizeMemoryInstruction( 2733 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, 2734 VPValue *StoredValue, VPValue *BlockInMask) { 2735 // Attempt to issue a wide load. 2736 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2737 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2738 2739 assert((LI || SI) && "Invalid Load/Store instruction"); 2740 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2741 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2742 2743 LoopVectorizationCostModel::InstWidening Decision = 2744 Cost->getWideningDecision(Instr, VF); 2745 assert((Decision == LoopVectorizationCostModel::CM_Widen || 2746 Decision == LoopVectorizationCostModel::CM_Widen_Reverse || 2747 Decision == LoopVectorizationCostModel::CM_GatherScatter) && 2748 "CM decision is not to widen the memory instruction"); 2749 2750 Type *ScalarDataTy = getMemInstValueType(Instr); 2751 2752 auto *DataTy = VectorType::get(ScalarDataTy, VF); 2753 const Align Alignment = getLoadStoreAlignment(Instr); 2754 2755 // Determine if the pointer operand of the access is either consecutive or 2756 // reverse consecutive. 2757 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2758 bool ConsecutiveStride = 2759 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2760 bool CreateGatherScatter = 2761 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2762 2763 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2764 // gather/scatter. Otherwise Decision should have been to Scalarize. 2765 assert((ConsecutiveStride || CreateGatherScatter) && 2766 "The instruction should be scalarized"); 2767 (void)ConsecutiveStride; 2768 2769 VectorParts BlockInMaskParts(UF); 2770 bool isMaskRequired = BlockInMask; 2771 if (isMaskRequired) 2772 for (unsigned Part = 0; Part < UF; ++Part) 2773 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2774 2775 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2776 // Calculate the pointer for the specific unroll-part. 2777 GetElementPtrInst *PartPtr = nullptr; 2778 2779 bool InBounds = false; 2780 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2781 InBounds = gep->isInBounds(); 2782 2783 if (Reverse) { 2784 assert(!VF.isScalable() && 2785 "Reversing vectors is not yet supported for scalable vectors."); 2786 2787 // If the address is consecutive but reversed, then the 2788 // wide store needs to start at the last vector element. 
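      // Illustrative (assuming VF = 4): the two GEPs below compute
      // Ptr - Part * 4 - 3, so the wide access for this part covers the four
      // elements ending at Ptr - Part * 4; the loaded or stored vector (and
      // mask, if any) is reversed separately so lane order matches the
      // original scalar order.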
2789 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2790 ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); 2791 PartPtr->setIsInBounds(InBounds); 2792 PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP( 2793 ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); 2794 PartPtr->setIsInBounds(InBounds); 2795 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2796 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2797 } else { 2798 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); 2799 PartPtr = cast<GetElementPtrInst>( 2800 Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); 2801 PartPtr->setIsInBounds(InBounds); 2802 } 2803 2804 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2805 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2806 }; 2807 2808 // Handle Stores: 2809 if (SI) { 2810 setDebugLocFromInst(Builder, SI); 2811 2812 for (unsigned Part = 0; Part < UF; ++Part) { 2813 Instruction *NewSI = nullptr; 2814 Value *StoredVal = State.get(StoredValue, Part); 2815 if (CreateGatherScatter) { 2816 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2817 Value *VectorGep = State.get(Addr, Part); 2818 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2819 MaskPart); 2820 } else { 2821 if (Reverse) { 2822 // If we store to reverse consecutive memory locations, then we need 2823 // to reverse the order of elements in the stored value. 2824 StoredVal = reverseVector(StoredVal); 2825 // We don't want to update the value in the map as it might be used in 2826 // another expression. So don't call resetVectorValue(StoredVal). 2827 } 2828 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2829 if (isMaskRequired) 2830 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2831 BlockInMaskParts[Part]); 2832 else 2833 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2834 } 2835 addMetadata(NewSI, SI); 2836 } 2837 return; 2838 } 2839 2840 // Handle loads. 2841 assert(LI && "Must have a load instruction"); 2842 setDebugLocFromInst(Builder, LI); 2843 for (unsigned Part = 0; Part < UF; ++Part) { 2844 Value *NewLI; 2845 if (CreateGatherScatter) { 2846 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2847 Value *VectorGep = State.get(Addr, Part); 2848 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2849 nullptr, "wide.masked.gather"); 2850 addMetadata(NewLI, LI); 2851 } else { 2852 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2853 if (isMaskRequired) 2854 NewLI = Builder.CreateMaskedLoad( 2855 VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), 2856 "wide.masked.load"); 2857 else 2858 NewLI = 2859 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2860 2861 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2862 addMetadata(NewLI, LI); 2863 if (Reverse) 2864 NewLI = reverseVector(NewLI); 2865 } 2866 2867 State.set(Def, Instr, NewLI, Part); 2868 } 2869 } 2870 2871 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User, 2872 const VPIteration &Instance, 2873 bool IfPredicateInstr, 2874 VPTransformState &State) { 2875 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2876 2877 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2878 // the first lane and part. 
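  // Illustrative consequence (assuming VF = 4 and UF = 2): without this guard,
  // scalarization would emit eight copies of the same scope declaration per
  // vector iteration; keeping only the copy for lane 0 of part 0 is enough.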
2879 if (isa<NoAliasScopeDeclInst>(Instr)) 2880 if (Instance.Lane != 0 || Instance.Part != 0) 2881 return; 2882 2883 setDebugLocFromInst(Builder, Instr); 2884 2885 // Does this instruction return a value ? 2886 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2887 2888 Instruction *Cloned = Instr->clone(); 2889 if (!IsVoidRetTy) 2890 Cloned->setName(Instr->getName() + ".cloned"); 2891 2892 // Replace the operands of the cloned instructions with their scalar 2893 // equivalents in the new loop. 2894 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) { 2895 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); 2896 auto InputInstance = Instance; 2897 if (!Operand || !OrigLoop->contains(Operand) || 2898 (Cost->isUniformAfterVectorization(Operand, State.VF))) 2899 InputInstance.Lane = 0; 2900 auto *NewOp = State.get(User.getOperand(op), InputInstance); 2901 Cloned->setOperand(op, NewOp); 2902 } 2903 addNewMetadata(Cloned, Instr); 2904 2905 // Place the cloned scalar in the new loop. 2906 Builder.Insert(Cloned); 2907 2908 // TODO: Set result for VPValue of VPReciplicateRecipe. This requires 2909 // representing scalar values in VPTransformState. Add the cloned scalar to 2910 // the scalar map entry. 2911 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2912 2913 // If we just cloned a new assumption, add it the assumption cache. 2914 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2915 if (II->getIntrinsicID() == Intrinsic::assume) 2916 AC->registerAssumption(II); 2917 2918 // End if-block. 2919 if (IfPredicateInstr) 2920 PredicatedInstructions.push_back(Cloned); 2921 } 2922 2923 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2924 Value *End, Value *Step, 2925 Instruction *DL) { 2926 BasicBlock *Header = L->getHeader(); 2927 BasicBlock *Latch = L->getLoopLatch(); 2928 // As we're just creating this loop, it's possible no latch exists 2929 // yet. If so, use the header as this will be a single block loop. 2930 if (!Latch) 2931 Latch = Header; 2932 2933 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2934 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2935 setDebugLocFromInst(Builder, OldInst); 2936 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2937 2938 Builder.SetInsertPoint(Latch->getTerminator()); 2939 setDebugLocFromInst(Builder, OldInst); 2940 2941 // Create i+1 and fill the PHINode. 2942 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2943 Induction->addIncoming(Start, L->getLoopPreheader()); 2944 Induction->addIncoming(Next, Latch); 2945 // Create the compare. 2946 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2947 Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header); 2948 2949 // Now we have two terminators. Remove the old one from the block. 2950 Latch->getTerminator()->eraseFromParent(); 2951 2952 return Induction; 2953 } 2954 2955 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2956 if (TripCount) 2957 return TripCount; 2958 2959 assert(L && "Create Trip Count for null loop."); 2960 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2961 // Find the loop boundaries. 
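  // E.g. (illustrative): for a canonical loop 'for (i = 0; i < n; ++i)' the
  // backedge-taken count is n - 1 and the trip count expanded below is n.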
2962 ScalarEvolution *SE = PSE.getSE(); 2963 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2964 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && 2965 "Invalid loop count"); 2966 2967 Type *IdxTy = Legal->getWidestInductionType(); 2968 assert(IdxTy && "No type for induction"); 2969 2970 // The exit count might have the type of i64 while the phi is i32. This can 2971 // happen if we have an induction variable that is sign extended before the 2972 // compare. The only way that we get a backedge taken count is that the 2973 // induction variable was signed and as such will not overflow. In such a case 2974 // truncation is legal. 2975 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) > 2976 IdxTy->getPrimitiveSizeInBits()) 2977 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2978 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2979 2980 // Get the total trip count from the count by adding 1. 2981 const SCEV *ExitCount = SE->getAddExpr( 2982 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2983 2984 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2985 2986 // Expand the trip count and place the new instructions in the preheader. 2987 // Notice that the pre-header does not change, only the loop body. 2988 SCEVExpander Exp(*SE, DL, "induction"); 2989 2990 // Count holds the overall loop count (N). 2991 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2992 L->getLoopPreheader()->getTerminator()); 2993 2994 if (TripCount->getType()->isPointerTy()) 2995 TripCount = 2996 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2997 L->getLoopPreheader()->getTerminator()); 2998 2999 return TripCount; 3000 } 3001 3002 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 3003 if (VectorTripCount) 3004 return VectorTripCount; 3005 3006 Value *TC = getOrCreateTripCount(L); 3007 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 3008 3009 Type *Ty = TC->getType(); 3010 // This is where we can make the step a runtime constant. 3011 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); 3012 3013 // If the tail is to be folded by masking, round the number of iterations N 3014 // up to a multiple of Step instead of rounding down. This is done by first 3015 // adding Step-1 and then rounding down. Note that it's ok if this addition 3016 // overflows: the vector induction variable will eventually wrap to zero given 3017 // that it starts at zero and its Step is a power of two; the loop will then 3018 // exit, with the last early-exit vector comparison also producing all-true. 3019 if (Cost->foldTailByMasking()) { 3020 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 3021 "VF*UF must be a power of 2 when folding tail by masking"); 3022 assert(!VF.isScalable() && 3023 "Tail folding not yet supported for scalable vectors"); 3024 TC = Builder.CreateAdd( 3025 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); 3026 } 3027 3028 // Now we need to generate the expression for the part of the loop that the 3029 // vectorized body will execute. This is equal to N - (N % Step) if scalar 3030 // iterations are not required for correctness, or N - Step, otherwise. Step 3031 // is equal to the vectorization factor (number of SIMD elements) times the 3032 // unroll factor (number of SIMD instructions). 
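  // Worked example (illustrative, assuming VF = 4 and UF = 2, so Step = 8):
  //   TC = 20 -> R = 4, n.vec = 16, and the remaining 4 iterations run in the
  //   scalar loop.
  //   TC = 16 with a required scalar epilogue -> R is bumped from 0 to 8
  //   below, giving n.vec = 8 so the last 8 iterations stay scalar.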
3033 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 3034 3035 // There are two cases where we need to ensure (at least) the last iteration 3036 // runs in the scalar remainder loop. Thus, if the step evenly divides 3037 // the trip count, we set the remainder to be equal to the step. If the step 3038 // does not evenly divide the trip count, no adjustment is necessary since 3039 // there will already be scalar iterations. Note that the minimum iterations 3040 // check ensures that N >= Step. The cases are: 3041 // 1) If there is a non-reversed interleaved group that may speculatively 3042 // access memory out-of-bounds. 3043 // 2) If any instruction may follow a conditionally taken exit. That is, if 3044 // the loop contains multiple exiting blocks, or a single exiting block 3045 // which is not the latch. 3046 if (VF.isVector() && Cost->requiresScalarEpilogue()) { 3047 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 3048 R = Builder.CreateSelect(IsZero, Step, R); 3049 } 3050 3051 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 3052 3053 return VectorTripCount; 3054 } 3055 3056 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 3057 const DataLayout &DL) { 3058 // Verify that V is a vector type with same number of elements as DstVTy. 3059 auto *DstFVTy = cast<FixedVectorType>(DstVTy); 3060 unsigned VF = DstFVTy->getNumElements(); 3061 auto *SrcVecTy = cast<FixedVectorType>(V->getType()); 3062 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 3063 Type *SrcElemTy = SrcVecTy->getElementType(); 3064 Type *DstElemTy = DstFVTy->getElementType(); 3065 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 3066 "Vector elements must have same size"); 3067 3068 // Do a direct cast if element types are castable. 3069 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 3070 return Builder.CreateBitOrPointerCast(V, DstFVTy); 3071 } 3072 // V cannot be directly casted to desired vector type. 3073 // May happen when V is a floating point vector but DstVTy is a vector of 3074 // pointers or vice-versa. Handle this using a two-step bitcast using an 3075 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 3076 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 3077 "Only one type should be a pointer type"); 3078 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 3079 "Only one type should be a floating point type"); 3080 Type *IntTy = 3081 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 3082 auto *VecIntTy = FixedVectorType::get(IntTy, VF); 3083 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 3084 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 3085 } 3086 3087 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 3088 BasicBlock *Bypass) { 3089 Value *Count = getOrCreateTripCount(L); 3090 // Reuse existing vector loop preheader for TC checks. 3091 // Note that new preheader block is generated for vector loop. 3092 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 3093 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 3094 3095 // Generate code to check if the loop's trip count is less than VF * UF, or 3096 // equal to it in case a scalar epilogue is required; this implies that the 3097 // vector trip count is zero. 
This check also covers the case where adding one 3098 // to the backedge-taken count overflowed leading to an incorrect trip count 3099 // of zero. In this case we will also jump to the scalar loop. 3100 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 3101 : ICmpInst::ICMP_ULT; 3102 3103 // If tail is to be folded, vector loop takes care of all iterations. 3104 Value *CheckMinIters = Builder.getFalse(); 3105 if (!Cost->foldTailByMasking()) { 3106 Value *Step = 3107 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); 3108 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); 3109 } 3110 // Create new preheader for vector loop. 3111 LoopVectorPreHeader = 3112 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 3113 "vector.ph"); 3114 3115 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 3116 DT->getNode(Bypass)->getIDom()) && 3117 "TC check is expected to dominate Bypass"); 3118 3119 // Update dominator for Bypass & LoopExit. 3120 DT->changeImmediateDominator(Bypass, TCCheckBlock); 3121 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 3122 3123 ReplaceInstWithInst( 3124 TCCheckBlock->getTerminator(), 3125 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 3126 LoopBypassBlocks.push_back(TCCheckBlock); 3127 } 3128 3129 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 3130 // Reuse existing vector loop preheader for SCEV checks. 3131 // Note that new preheader block is generated for vector loop. 3132 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 3133 3134 // Generate the code to check that the SCEV assumptions that we made. 3135 // We want the new basic block to start at the first instruction in a 3136 // sequence of instructions that form a check. 3137 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 3138 "scev.check"); 3139 Value *SCEVCheck = Exp.expandCodeForPredicate( 3140 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 3141 3142 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 3143 if (C->isZero()) 3144 return; 3145 3146 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 3147 (OptForSizeBasedOnProfile && 3148 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 3149 "Cannot SCEV check stride or overflow when optimizing for size"); 3150 3151 SCEVCheckBlock->setName("vector.scevcheck"); 3152 // Create new preheader for vector loop. 3153 LoopVectorPreHeader = 3154 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 3155 nullptr, "vector.ph"); 3156 3157 // Update dominator only if this is first RT check. 3158 if (LoopBypassBlocks.empty()) { 3159 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 3160 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 3161 } 3162 3163 ReplaceInstWithInst( 3164 SCEVCheckBlock->getTerminator(), 3165 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 3166 LoopBypassBlocks.push_back(SCEVCheckBlock); 3167 AddedSafetyChecks = true; 3168 } 3169 3170 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 3171 // VPlan-native path does not do any analysis for runtime checks currently. 3172 if (EnableVPlanNativePath) 3173 return; 3174 3175 // Reuse existing vector loop preheader for runtime memory checks. 3176 // Note that new preheader block is generated for vector loop. 3177 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 3178 3179 // Generate the code that checks in runtime if arrays overlap. 
We put the 3180 // checks into a separate block to make the more common case of few elements 3181 // faster. 3182 auto *LAI = Legal->getLAI(); 3183 const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); 3184 if (!RtPtrChecking.Need) 3185 return; 3186 3187 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3188 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3189 "Cannot emit memory checks when optimizing for size, unless forced " 3190 "to vectorize."); 3191 ORE->emit([&]() { 3192 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3193 L->getStartLoc(), L->getHeader()) 3194 << "Code-size may be reduced by not forcing " 3195 "vectorization, or by source-code modifications " 3196 "eliminating the need for runtime checks " 3197 "(e.g., adding 'restrict')."; 3198 }); 3199 } 3200 3201 MemCheckBlock->setName("vector.memcheck"); 3202 // Create new preheader for vector loop. 3203 LoopVectorPreHeader = 3204 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 3205 "vector.ph"); 3206 3207 auto *CondBranch = cast<BranchInst>( 3208 Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); 3209 ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); 3210 LoopBypassBlocks.push_back(MemCheckBlock); 3211 AddedSafetyChecks = true; 3212 3213 // Update dominator only if this is first RT check. 3214 if (LoopBypassBlocks.empty()) { 3215 DT->changeImmediateDominator(Bypass, MemCheckBlock); 3216 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 3217 } 3218 3219 Instruction *FirstCheckInst; 3220 Instruction *MemRuntimeCheck; 3221 SCEVExpander Exp(*PSE.getSE(), MemCheckBlock->getModule()->getDataLayout(), 3222 "induction"); 3223 std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks( 3224 MemCheckBlock->getTerminator(), OrigLoop, RtPtrChecking.getChecks(), Exp); 3225 assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " 3226 "claimed checks are required"); 3227 CondBranch->setCondition(MemRuntimeCheck); 3228 3229 // We currently don't use LoopVersioning for the actual loop cloning but we 3230 // still use it to add the noalias metadata. 3231 LVer = std::make_unique<LoopVersioning>( 3232 *Legal->getLAI(), 3233 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, 3234 DT, PSE.getSE()); 3235 LVer->prepareNoAliasMetadata(); 3236 } 3237 3238 Value *InnerLoopVectorizer::emitTransformedIndex( 3239 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 3240 const InductionDescriptor &ID) const { 3241 3242 SCEVExpander Exp(*SE, DL, "induction"); 3243 auto Step = ID.getStep(); 3244 auto StartValue = ID.getStartValue(); 3245 assert(Index->getType() == Step->getType() && 3246 "Index type does not match StepValue type"); 3247 3248 // Note: the IR at this point is broken. We cannot use SE to create any new 3249 // SCEV and then expand it, hoping that SCEV's simplification will give us 3250 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 3251 // lead to various SCEV crashes. So all we can do is to use builder and rely 3252 // on InstCombine for future simplifications. Here we handle some trivial 3253 // cases only. 
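  // The helpers below only fold the identity cases (X + 0 -> X, X * 1 -> X) so
  // that trivially dead instructions are not emitted; anything else is left
  // for InstCombine, since SCEV cannot be consulted on the invalid IR here.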
3254 auto CreateAdd = [&B](Value *X, Value *Y) { 3255 assert(X->getType() == Y->getType() && "Types don't match!"); 3256 if (auto *CX = dyn_cast<ConstantInt>(X)) 3257 if (CX->isZero()) 3258 return Y; 3259 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3260 if (CY->isZero()) 3261 return X; 3262 return B.CreateAdd(X, Y); 3263 }; 3264 3265 auto CreateMul = [&B](Value *X, Value *Y) { 3266 assert(X->getType() == Y->getType() && "Types don't match!"); 3267 if (auto *CX = dyn_cast<ConstantInt>(X)) 3268 if (CX->isOne()) 3269 return Y; 3270 if (auto *CY = dyn_cast<ConstantInt>(Y)) 3271 if (CY->isOne()) 3272 return X; 3273 return B.CreateMul(X, Y); 3274 }; 3275 3276 // Get a suitable insert point for SCEV expansion. For blocks in the vector 3277 // loop, choose the end of the vector loop header (=LoopVectorBody), because 3278 // the DomTree is not kept up-to-date for additional blocks generated in the 3279 // vector loop. By using the header as insertion point, we guarantee that the 3280 // expanded instructions dominate all their uses. 3281 auto GetInsertPoint = [this, &B]() { 3282 BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); 3283 if (InsertBB != LoopVectorBody && 3284 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) 3285 return LoopVectorBody->getTerminator(); 3286 return &*B.GetInsertPoint(); 3287 }; 3288 switch (ID.getKind()) { 3289 case InductionDescriptor::IK_IntInduction: { 3290 assert(Index->getType() == StartValue->getType() && 3291 "Index type does not match StartValue type"); 3292 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 3293 return B.CreateSub(StartValue, Index); 3294 auto *Offset = CreateMul( 3295 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); 3296 return CreateAdd(StartValue, Offset); 3297 } 3298 case InductionDescriptor::IK_PtrInduction: { 3299 assert(isa<SCEVConstant>(Step) && 3300 "Expected constant step for pointer induction"); 3301 return B.CreateGEP( 3302 StartValue->getType()->getPointerElementType(), StartValue, 3303 CreateMul(Index, 3304 Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()))); 3305 } 3306 case InductionDescriptor::IK_FpInduction: { 3307 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 3308 auto InductionBinOp = ID.getInductionBinOp(); 3309 assert(InductionBinOp && 3310 (InductionBinOp->getOpcode() == Instruction::FAdd || 3311 InductionBinOp->getOpcode() == Instruction::FSub) && 3312 "Original bin op should be defined for FP induction"); 3313 3314 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 3315 3316 // Floating point operations had to be 'fast' to enable the induction. 3317 FastMathFlags Flags; 3318 Flags.setFast(); 3319 3320 Value *MulExp = B.CreateFMul(StepValue, Index); 3321 if (isa<Instruction>(MulExp)) 3322 // We have to check, the MulExp may be a constant. 
3323 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 3324 3325 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 3326 "induction"); 3327 if (isa<Instruction>(BOp)) 3328 cast<Instruction>(BOp)->setFastMathFlags(Flags); 3329 3330 return BOp; 3331 } 3332 case InductionDescriptor::IK_NoInduction: 3333 return nullptr; 3334 } 3335 llvm_unreachable("invalid enum"); 3336 } 3337 3338 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3339 LoopScalarBody = OrigLoop->getHeader(); 3340 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3341 LoopExitBlock = OrigLoop->getUniqueExitBlock(); 3342 assert(LoopExitBlock && "Must have an exit block"); 3343 assert(LoopVectorPreHeader && "Invalid loop structure"); 3344 3345 LoopMiddleBlock = 3346 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3347 LI, nullptr, Twine(Prefix) + "middle.block"); 3348 LoopScalarPreHeader = 3349 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3350 nullptr, Twine(Prefix) + "scalar.ph"); 3351 3352 // Set up branch from middle block to the exit and scalar preheader blocks. 3353 // completeLoopSkeleton will update the condition to use an iteration check, 3354 // if required to decide whether to execute the remainder. 3355 BranchInst *BrInst = 3356 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue()); 3357 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3358 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3359 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3360 3361 // We intentionally don't let SplitBlock to update LoopInfo since 3362 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 3363 // LoopVectorBody is explicitly added to the correct place few lines later. 3364 LoopVectorBody = 3365 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3366 nullptr, nullptr, Twine(Prefix) + "vector.body"); 3367 3368 // Update dominator for loop exit. 3369 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3370 3371 // Create and register the new vector loop. 3372 Loop *Lp = LI->AllocateLoop(); 3373 Loop *ParentLoop = OrigLoop->getParentLoop(); 3374 3375 // Insert the new loop into the loop nest and register the new basic blocks 3376 // before calling any utilities such as SCEV that require valid LoopInfo. 3377 if (ParentLoop) { 3378 ParentLoop->addChildLoop(Lp); 3379 } else { 3380 LI->addTopLevelLoop(Lp); 3381 } 3382 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3383 return Lp; 3384 } 3385 3386 void InnerLoopVectorizer::createInductionResumeValues( 3387 Loop *L, Value *VectorTripCount, 3388 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3389 assert(VectorTripCount && L && "Expected valid arguments"); 3390 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3391 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3392 "Inconsistent information about additional bypass."); 3393 // We are going to resume the execution of the scalar loop. 3394 // Go over all of the induction variables that we found and fix the 3395 // PHIs that are left in the scalar version of the loop. 3396 // The starting values of PHI nodes depend on the counter of the last 3397 // iteration in the vectorized loop. 3398 // If we come from a bypass edge then we need to start from the original 3399 // start value. 
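  // Illustrative shape of a resume phi created below (block names as produced
  // earlier in this file; the start value is the induction's original start):
  //   %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
  //                            [ %<start>, %vector.scevcheck ], ...
  // i.e. the vector-loop end value when arriving from the middle block and the
  // original start value when arriving from a bypass block.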
3400 for (auto &InductionEntry : Legal->getInductionVars()) { 3401 PHINode *OrigPhi = InductionEntry.first; 3402 InductionDescriptor II = InductionEntry.second; 3403 3404 // Create phi nodes to merge from the backedge-taken check block. 3405 PHINode *BCResumeVal = 3406 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3407 LoopScalarPreHeader->getTerminator()); 3408 // Copy original phi DL over to the new one. 3409 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3410 Value *&EndValue = IVEndValues[OrigPhi]; 3411 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3412 if (OrigPhi == OldInduction) { 3413 // We know what the end value is. 3414 EndValue = VectorTripCount; 3415 } else { 3416 IRBuilder<> B(L->getLoopPreheader()->getTerminator()); 3417 Type *StepType = II.getStep()->getType(); 3418 Instruction::CastOps CastOp = 3419 CastInst::getCastOpcode(VectorTripCount, true, StepType, true); 3420 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); 3421 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3422 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3423 EndValue->setName("ind.end"); 3424 3425 // Compute the end value for the additional bypass (if applicable). 3426 if (AdditionalBypass.first) { 3427 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); 3428 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, 3429 StepType, true); 3430 CRD = 3431 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); 3432 EndValueFromAdditionalBypass = 3433 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3434 EndValueFromAdditionalBypass->setName("ind.end"); 3435 } 3436 } 3437 // The new PHI merges the original incoming value, in case of a bypass, 3438 // or the value at the end of the vectorized loop. 3439 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3440 3441 // Fix the scalar body counter (PHI node). 3442 // The old induction's phi node in the scalar body needs the truncated 3443 // value. 3444 for (BasicBlock *BB : LoopBypassBlocks) 3445 BCResumeVal->addIncoming(II.getStartValue(), BB); 3446 3447 if (AdditionalBypass.first) 3448 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3449 EndValueFromAdditionalBypass); 3450 3451 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3452 } 3453 } 3454 3455 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, 3456 MDNode *OrigLoopID) { 3457 assert(L && "Expected valid loop."); 3458 3459 // The trip counts should be cached by now. 3460 Value *Count = getOrCreateTripCount(L); 3461 Value *VectorTripCount = getOrCreateVectorTripCount(L); 3462 3463 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3464 3465 // Add a check in the middle block to see if we have completed 3466 // all of the iterations in the first vector loop. 3467 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3468 // If tail is to be folded, we know we don't need to run the remainder. 3469 if (!Cost->foldTailByMasking()) { 3470 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 3471 Count, VectorTripCount, "cmp.n", 3472 LoopMiddleBlock->getTerminator()); 3473 3474 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3475 // of the corresponding compare because they may have ended up with 3476 // different line numbers and we want to avoid awkward line stepping while 3477 // debugging. Eg. if the compare has got a line number inside the loop. 
3478 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3479 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN); 3480 } 3481 3482 // Get ready to start creating new instructions into the vectorized body. 3483 assert(LoopVectorPreHeader == L->getLoopPreheader() && 3484 "Inconsistent vector loop preheader"); 3485 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3486 3487 Optional<MDNode *> VectorizedLoopID = 3488 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3489 LLVMLoopVectorizeFollowupVectorized}); 3490 if (VectorizedLoopID.hasValue()) { 3491 L->setLoopID(VectorizedLoopID.getValue()); 3492 3493 // Do not setAlreadyVectorized if loop attributes have been defined 3494 // explicitly. 3495 return LoopVectorPreHeader; 3496 } 3497 3498 // Keep all loop hints from the original loop on the vector loop (we'll 3499 // replace the vectorizer-specific hints below). 3500 if (MDNode *LID = OrigLoop->getLoopID()) 3501 L->setLoopID(LID); 3502 3503 LoopVectorizeHints Hints(L, true, *ORE); 3504 Hints.setAlreadyVectorized(); 3505 3506 #ifdef EXPENSIVE_CHECKS 3507 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3508 LI->verify(*DT); 3509 #endif 3510 3511 return LoopVectorPreHeader; 3512 } 3513 3514 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 3515 /* 3516 In this function we generate a new loop. The new loop will contain 3517 the vectorized instructions while the old loop will continue to run the 3518 scalar remainder. 3519 3520 [ ] <-- loop iteration number check. 3521 / | 3522 / v 3523 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3524 | / | 3525 | / v 3526 || [ ] <-- vector pre header. 3527 |/ | 3528 | v 3529 | [ ] \ 3530 | [ ]_| <-- vector loop. 3531 | | 3532 | v 3533 | -[ ] <--- middle-block. 3534 | / | 3535 | / v 3536 -|- >[ ] <--- new preheader. 3537 | | 3538 | v 3539 | [ ] \ 3540 | [ ]_| <-- old scalar loop to handle remainder. 3541 \ | 3542 \ v 3543 >[ ] <-- exit block. 3544 ... 3545 */ 3546 3547 // Get the metadata of the original loop before it gets modified. 3548 MDNode *OrigLoopID = OrigLoop->getLoopID(); 3549 3550 // Create an empty vector loop, and prepare basic blocks for the runtime 3551 // checks. 3552 Loop *Lp = createVectorLoopSkeleton(""); 3553 3554 // Now, compare the new count to zero. If it is zero skip the vector loop and 3555 // jump to the scalar loop. This check also covers the case where the 3556 // backedge-taken count is uint##_max: adding one to it will overflow leading 3557 // to an incorrect trip count of zero. In this (rare) case we will also jump 3558 // to the scalar loop. 3559 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3560 3561 // Generate the code to check any assumptions that we've made for SCEV 3562 // expressions. 3563 emitSCEVChecks(Lp, LoopScalarPreHeader); 3564 3565 // Generate the code that checks in runtime if arrays overlap. We put the 3566 // checks into a separate block to make the more common case of few elements 3567 // faster. 3568 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3569 3570 // Some loops have a single integer induction variable, while other loops 3571 // don't. One example is c++ iterators that often have multiple pointer 3572 // induction variables. In the code below we also support a case where we 3573 // don't have a single induction variable. 3574 // 3575 // We try to obtain an induction variable from the original loop as hard 3576 // as possible. 
However if we don't find one that: 3577 // - is an integer 3578 // - counts from zero, stepping by one 3579 // - is the size of the widest induction variable type 3580 // then we create a new one. 3581 OldInduction = Legal->getPrimaryInduction(); 3582 Type *IdxTy = Legal->getWidestInductionType(); 3583 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3584 // The loop step is equal to the vectorization factor (num of SIMD elements) 3585 // times the unroll factor (num of SIMD instructions). 3586 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); 3587 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); 3588 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3589 Induction = 3590 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3591 getDebugLocFromInstOrOperands(OldInduction)); 3592 3593 // Emit phis for the new starting index of the scalar loop. 3594 createInductionResumeValues(Lp, CountRoundDown); 3595 3596 return completeLoopSkeleton(Lp, OrigLoopID); 3597 } 3598 3599 // Fix up external users of the induction variable. At this point, we are 3600 // in LCSSA form, with all external PHIs that use the IV having one input value, 3601 // coming from the remainder loop. We need those PHIs to also have a correct 3602 // value for the IV when arriving directly from the middle block. 3603 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3604 const InductionDescriptor &II, 3605 Value *CountRoundDown, Value *EndValue, 3606 BasicBlock *MiddleBlock) { 3607 // There are two kinds of external IV usages - those that use the value 3608 // computed in the last iteration (the PHI) and those that use the penultimate 3609 // value (the value that feeds into the phi from the loop latch). 3610 // We allow both, but they, obviously, have different values. 3611 3612 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3613 3614 DenseMap<Value *, Value *> MissingVals; 3615 3616 // An external user of the last iteration's value should see the value that 3617 // the remainder loop uses to initialize its own IV. 3618 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3619 for (User *U : PostInc->users()) { 3620 Instruction *UI = cast<Instruction>(U); 3621 if (!OrigLoop->contains(UI)) { 3622 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3623 MissingVals[UI] = EndValue; 3624 } 3625 } 3626 3627 // An external user of the penultimate value need to see EndValue - Step. 3628 // The simplest way to get this is to recompute it from the constituent SCEVs, 3629 // that is Start + (Step * (CRD - 1)). 3630 for (User *U : OrigPhi->users()) { 3631 auto *UI = cast<Instruction>(U); 3632 if (!OrigLoop->contains(UI)) { 3633 const DataLayout &DL = 3634 OrigLoop->getHeader()->getModule()->getDataLayout(); 3635 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3636 3637 IRBuilder<> B(MiddleBlock->getTerminator()); 3638 Value *CountMinusOne = B.CreateSub( 3639 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3640 Value *CMO = 3641 !II.getStep()->getType()->isIntegerTy() 3642 ? 
B.CreateCast(Instruction::SIToFP, CountMinusOne, 3643 II.getStep()->getType()) 3644 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3645 CMO->setName("cast.cmo"); 3646 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3647 Escape->setName("ind.escape"); 3648 MissingVals[UI] = Escape; 3649 } 3650 } 3651 3652 for (auto &I : MissingVals) { 3653 PHINode *PHI = cast<PHINode>(I.first); 3654 // One corner case we have to handle is two IVs "chasing" each-other, 3655 // that is %IV2 = phi [...], [ %IV1, %latch ] 3656 // In this case, if IV1 has an external use, we need to avoid adding both 3657 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3658 // don't already have an incoming value for the middle block. 3659 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3660 PHI->addIncoming(I.second, MiddleBlock); 3661 } 3662 } 3663 3664 namespace { 3665 3666 struct CSEDenseMapInfo { 3667 static bool canHandle(const Instruction *I) { 3668 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3669 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3670 } 3671 3672 static inline Instruction *getEmptyKey() { 3673 return DenseMapInfo<Instruction *>::getEmptyKey(); 3674 } 3675 3676 static inline Instruction *getTombstoneKey() { 3677 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3678 } 3679 3680 static unsigned getHashValue(const Instruction *I) { 3681 assert(canHandle(I) && "Unknown instruction!"); 3682 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3683 I->value_op_end())); 3684 } 3685 3686 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3687 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3688 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3689 return LHS == RHS; 3690 return LHS->isIdenticalTo(RHS); 3691 } 3692 }; 3693 3694 } // end anonymous namespace 3695 3696 ///Perform cse of induction variable instructions. 3697 static void cse(BasicBlock *BB) { 3698 // Perform simple cse. 3699 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3700 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3701 Instruction *In = &*I++; 3702 3703 if (!CSEDenseMapInfo::canHandle(In)) 3704 continue; 3705 3706 // Check if we can replace this instruction with any of the 3707 // visited instructions. 3708 if (Instruction *V = CSEMap.lookup(In)) { 3709 In->replaceAllUsesWith(V); 3710 In->eraseFromParent(); 3711 continue; 3712 } 3713 3714 CSEMap[In] = In; 3715 } 3716 } 3717 3718 InstructionCost 3719 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, 3720 bool &NeedToScalarize) { 3721 assert(!VF.isScalable() && "scalable vectors not yet supported."); 3722 Function *F = CI->getCalledFunction(); 3723 Type *ScalarRetTy = CI->getType(); 3724 SmallVector<Type *, 4> Tys, ScalarTys; 3725 for (auto &ArgOp : CI->arg_operands()) 3726 ScalarTys.push_back(ArgOp->getType()); 3727 3728 // Estimate cost of scalarized vector call. The source operands are assumed 3729 // to be vectors, so we need to extract individual elements from there, 3730 // execute VF scalar calls, and then gather the result into the vector return 3731 // value. 3732 InstructionCost ScalarCallCost = 3733 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput); 3734 if (VF.isScalar()) 3735 return ScalarCallCost; 3736 3737 // Compute corresponding vector type for return value and arguments. 
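  // Illustrative decision (made-up costs, VF = 4): if one scalar call costs 10
  // and packing/unpacking the arguments and result costs 8, the scalarized
  // estimate below is 4 * 10 + 8 = 48; if the VFDatabase provides a vector
  // variant whose call costs 20, that cost is returned and NeedToScalarize is
  // cleared.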
3738 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3739 for (Type *ScalarTy : ScalarTys) 3740 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3741 3742 // Compute costs of unpacking argument values for the scalar calls and 3743 // packing the return values to a vector. 3744 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); 3745 3746 InstructionCost Cost = 3747 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 3748 3749 // If we can't emit a vector call for this function, then the currently found 3750 // cost is the cost we need to return. 3751 NeedToScalarize = true; 3752 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 3753 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3754 3755 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3756 return Cost; 3757 3758 // If the corresponding vector cost is cheaper, return its cost. 3759 InstructionCost VectorCallCost = 3760 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput); 3761 if (VectorCallCost < Cost) { 3762 NeedToScalarize = false; 3763 Cost = VectorCallCost; 3764 } 3765 return Cost; 3766 } 3767 3768 InstructionCost 3769 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3770 ElementCount VF) { 3771 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3772 assert(ID && "Expected intrinsic call!"); 3773 3774 IntrinsicCostAttributes CostAttrs(ID, *CI, VF); 3775 return TTI.getIntrinsicInstrCost(CostAttrs, 3776 TargetTransformInfo::TCK_RecipThroughput); 3777 } 3778 3779 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3780 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3781 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3782 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3783 } 3784 3785 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3786 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3787 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3788 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3789 } 3790 3791 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3792 // For every instruction `I` in MinBWs, truncate the operands, create a 3793 // truncated version of `I` and reextend its result. InstCombine runs 3794 // later and will remove any ext/trunc pairs. 3795 SmallPtrSet<Value *, 4> Erased; 3796 for (const auto &KV : Cost->getMinimalBitwidths()) { 3797 // If the value wasn't vectorized, we must maintain the original scalar 3798 // type. The absence of the value from VectorLoopValueMap indicates that it 3799 // wasn't vectorized. 
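  // Illustrative (assuming an i32 add whose MinBWs entry says 8 bits, with
  // VF = 4): its operands are truncated to <4 x i8>, the add is re-created in
  // i8, and the result is zero-extended back to <4 x i32>; redundant
  // trunc/zext pairs are expected to be removed by InstCombine later.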
3800 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3801 continue; 3802 for (unsigned Part = 0; Part < UF; ++Part) { 3803 Value *I = getOrCreateVectorValue(KV.first, Part); 3804 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I)) 3805 continue; 3806 Type *OriginalTy = I->getType(); 3807 Type *ScalarTruncatedTy = 3808 IntegerType::get(OriginalTy->getContext(), KV.second); 3809 auto *TruncatedTy = FixedVectorType::get( 3810 ScalarTruncatedTy, 3811 cast<FixedVectorType>(OriginalTy)->getNumElements()); 3812 if (TruncatedTy == OriginalTy) 3813 continue; 3814 3815 IRBuilder<> B(cast<Instruction>(I)); 3816 auto ShrinkOperand = [&](Value *V) -> Value * { 3817 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3818 if (ZI->getSrcTy() == TruncatedTy) 3819 return ZI->getOperand(0); 3820 return B.CreateZExtOrTrunc(V, TruncatedTy); 3821 }; 3822 3823 // The actual instruction modification depends on the instruction type, 3824 // unfortunately. 3825 Value *NewI = nullptr; 3826 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3827 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3828 ShrinkOperand(BO->getOperand(1))); 3829 3830 // Any wrapping introduced by shrinking this operation shouldn't be 3831 // considered undefined behavior. So, we can't unconditionally copy 3832 // arithmetic wrapping flags to NewI. 3833 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3834 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3835 NewI = 3836 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3837 ShrinkOperand(CI->getOperand(1))); 3838 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3839 NewI = B.CreateSelect(SI->getCondition(), 3840 ShrinkOperand(SI->getTrueValue()), 3841 ShrinkOperand(SI->getFalseValue())); 3842 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3843 switch (CI->getOpcode()) { 3844 default: 3845 llvm_unreachable("Unhandled cast!"); 3846 case Instruction::Trunc: 3847 NewI = ShrinkOperand(CI->getOperand(0)); 3848 break; 3849 case Instruction::SExt: 3850 NewI = B.CreateSExtOrTrunc( 3851 CI->getOperand(0), 3852 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3853 break; 3854 case Instruction::ZExt: 3855 NewI = B.CreateZExtOrTrunc( 3856 CI->getOperand(0), 3857 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3858 break; 3859 } 3860 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3861 auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType()) 3862 ->getNumElements(); 3863 auto *O0 = B.CreateZExtOrTrunc( 3864 SI->getOperand(0), 3865 FixedVectorType::get(ScalarTruncatedTy, Elements0)); 3866 auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType()) 3867 ->getNumElements(); 3868 auto *O1 = B.CreateZExtOrTrunc( 3869 SI->getOperand(1), 3870 FixedVectorType::get(ScalarTruncatedTy, Elements1)); 3871 3872 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask()); 3873 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3874 // Don't do anything with the operands, just extend the result. 
3875 continue; 3876 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3877 auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType()) 3878 ->getNumElements(); 3879 auto *O0 = B.CreateZExtOrTrunc( 3880 IE->getOperand(0), 3881 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3882 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3883 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3884 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3885 auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType()) 3886 ->getNumElements(); 3887 auto *O0 = B.CreateZExtOrTrunc( 3888 EE->getOperand(0), 3889 FixedVectorType::get(ScalarTruncatedTy, Elements)); 3890 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3891 } else { 3892 // If we don't know what to do, be conservative and don't do anything. 3893 continue; 3894 } 3895 3896 // Lastly, extend the result. 3897 NewI->takeName(cast<Instruction>(I)); 3898 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3899 I->replaceAllUsesWith(Res); 3900 cast<Instruction>(I)->eraseFromParent(); 3901 Erased.insert(I); 3902 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3903 } 3904 } 3905 3906 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3907 for (const auto &KV : Cost->getMinimalBitwidths()) { 3908 // If the value wasn't vectorized, we must maintain the original scalar 3909 // type. The absence of the value from VectorLoopValueMap indicates that it 3910 // wasn't vectorized. 3911 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3912 continue; 3913 for (unsigned Part = 0; Part < UF; ++Part) { 3914 Value *I = getOrCreateVectorValue(KV.first, Part); 3915 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3916 if (Inst && Inst->use_empty()) { 3917 Value *NewI = Inst->getOperand(0); 3918 Inst->eraseFromParent(); 3919 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3920 } 3921 } 3922 } 3923 } 3924 3925 void InnerLoopVectorizer::fixVectorizedLoop() { 3926 // Insert truncates and extends for any truncated instructions as hints to 3927 // InstCombine. 3928 if (VF.isVector()) 3929 truncateToMinimalBitwidths(); 3930 3931 // Fix widened non-induction PHIs by setting up the PHI operands. 3932 if (OrigPHIsToFix.size()) { 3933 assert(EnableVPlanNativePath && 3934 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3935 fixNonInductionPHIs(); 3936 } 3937 3938 // At this point every instruction in the original loop is widened to a 3939 // vector form. Now we need to fix the recurrences in the loop. These PHI 3940 // nodes are currently empty because we did not want to introduce cycles. 3941 // This is the second stage of vectorizing recurrences. 3942 fixCrossIterationPHIs(); 3943 3944 // Forget the original basic block. 3945 PSE.getSE()->forgetLoop(OrigLoop); 3946 3947 // Fix-up external users of the induction variables. 3948 for (auto &Entry : Legal->getInductionVars()) 3949 fixupIVUsers(Entry.first, Entry.second, 3950 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3951 IVEndValues[Entry.first], LoopMiddleBlock); 3952 3953 fixLCSSAPHIs(); 3954 for (Instruction *PI : PredicatedInstructions) 3955 sinkScalarOperands(&*PI); 3956 3957 // Remove redundant induction instructions. 3958 cse(LoopVectorBody); 3959 3960 // Set/update profile weights for the vector and remainder loops as original 3961 // loop iterations are now distributed among them. Note that original loop 3962 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3963 // 3964 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3965 // end up getting slightly roughened result but that should be OK since 3966 // profile is not inherently precise anyway. Note also possible bypass of 3967 // vector code caused by legality checks is ignored, assigning all the weight 3968 // to the vector loop, optimistically. 3969 // 3970 // For scalable vectorization we can't know at compile time how many iterations 3971 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3972 // vscale of '1'. 3973 setProfileInfoAfterUnrolling( 3974 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), 3975 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); 3976 } 3977 3978 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3979 // In order to support recurrences we need to be able to vectorize Phi nodes. 3980 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3981 // stage #2: We now need to fix the recurrences by adding incoming edges to 3982 // the currently empty PHI nodes. At this point every instruction in the 3983 // original loop is widened to a vector form so we can use them to construct 3984 // the incoming edges. 3985 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3986 // Handle first-order recurrences and reductions that need to be fixed. 3987 if (Legal->isFirstOrderRecurrence(&Phi)) 3988 fixFirstOrderRecurrence(&Phi); 3989 else if (Legal->isReductionVariable(&Phi)) 3990 fixReduction(&Phi); 3991 } 3992 } 3993 3994 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3995 // This is the second phase of vectorizing first-order recurrences. An 3996 // overview of the transformation is described below. Suppose we have the 3997 // following loop. 3998 // 3999 // for (int i = 0; i < n; ++i) 4000 // b[i] = a[i] - a[i - 1]; 4001 // 4002 // There is a first-order recurrence on "a". For this loop, the shorthand 4003 // scalar IR looks like: 4004 // 4005 // scalar.ph: 4006 // s_init = a[-1] 4007 // br scalar.body 4008 // 4009 // scalar.body: 4010 // i = phi [0, scalar.ph], [i+1, scalar.body] 4011 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 4012 // s2 = a[i] 4013 // b[i] = s2 - s1 4014 // br cond, scalar.body, ... 4015 // 4016 // In this example, s1 is a recurrence because it's value depends on the 4017 // previous iteration. In the first phase of vectorization, we created a 4018 // temporary value for s1. We now complete the vectorization and produce the 4019 // shorthand vector IR shown below (for VF = 4, UF = 1). 4020 // 4021 // vector.ph: 4022 // v_init = vector(..., ..., ..., a[-1]) 4023 // br vector.body 4024 // 4025 // vector.body 4026 // i = phi [0, vector.ph], [i+4, vector.body] 4027 // v1 = phi [v_init, vector.ph], [v2, vector.body] 4028 // v2 = a[i, i+1, i+2, i+3]; 4029 // v3 = vector(v1(3), v2(0, 1, 2)) 4030 // b[i, i+1, i+2, i+3] = v2 - v3 4031 // br cond, vector.body, middle.block 4032 // 4033 // middle.block: 4034 // x = v2(3) 4035 // br scalar.ph 4036 // 4037 // scalar.ph: 4038 // s_init = phi [x, middle.block], [a[-1], otherwise] 4039 // br scalar.body 4040 // 4041 // After execution completes the vector loop, we extract the next value of 4042 // the recurrence (x) to use as the initial value in the scalar loop. 4043 4044 // Get the original loop preheader and single loop latch. 
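  // For the VF = 4 example above, the shuffle built later in this function
  // uses the mask <3, 4, 5, 6>: the last lane of the previous value followed
  // by the first three lanes of the current one, i.e. v3 = vector(v1(3),
  // v2(0, 1, 2)).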
4045 auto *Preheader = OrigLoop->getLoopPreheader(); 4046 auto *Latch = OrigLoop->getLoopLatch(); 4047 4048 // Get the initial and previous values of the scalar recurrence. 4049 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 4050 auto *Previous = Phi->getIncomingValueForBlock(Latch); 4051 4052 // Create a vector from the initial value. 4053 auto *VectorInit = ScalarInit; 4054 if (VF.isVector()) { 4055 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4056 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4057 VectorInit = Builder.CreateInsertElement( 4058 PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 4059 Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); 4060 } 4061 4062 // We constructed a temporary phi node in the first phase of vectorization. 4063 // This phi node will eventually be deleted. 4064 Builder.SetInsertPoint( 4065 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 4066 4067 // Create a phi node for the new recurrence. The current value will either be 4068 // the initial value inserted into a vector or loop-varying vector value. 4069 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 4070 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 4071 4072 // Get the vectorized previous value of the last part UF - 1. It appears last 4073 // among all unrolled iterations, due to the order of their construction. 4074 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 4075 4076 // Find and set the insertion point after the previous value if it is an 4077 // instruction. 4078 BasicBlock::iterator InsertPt; 4079 // Note that the previous value may have been constant-folded so it is not 4080 // guaranteed to be an instruction in the vector loop. 4081 // FIXME: Loop invariant values do not form recurrences. We should deal with 4082 // them earlier. 4083 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 4084 InsertPt = LoopVectorBody->getFirstInsertionPt(); 4085 else { 4086 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 4087 if (isa<PHINode>(PreviousLastPart)) 4088 // If the previous value is a phi node, we should insert after all the phi 4089 // nodes in the block containing the PHI to avoid breaking basic block 4090 // verification. Note that the basic block may be different to 4091 // LoopVectorBody, in case we predicate the loop. 4092 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 4093 else 4094 InsertPt = ++PreviousInst->getIterator(); 4095 } 4096 Builder.SetInsertPoint(&*InsertPt); 4097 4098 // We will construct a vector for the recurrence by combining the values for 4099 // the current and previous iterations. This is the required shuffle mask. 4100 assert(!VF.isScalable()); 4101 SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue()); 4102 ShuffleMask[0] = VF.getKnownMinValue() - 1; 4103 for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) 4104 ShuffleMask[I] = I + VF.getKnownMinValue() - 1; 4105 4106 // The vector from which to take the initial value for the current iteration 4107 // (actual or unrolled). Initially, this is the vector phi node. 4108 Value *Incoming = VecPhi; 4109 4110 // Shuffle the current and previous vector and update the vector parts. 4111 for (unsigned Part = 0; Part < UF; ++Part) { 4112 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 4113 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 4114 auto *Shuffle = 4115 VF.isVector() 4116 ? 
Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask) 4117 : Incoming; 4118 PhiPart->replaceAllUsesWith(Shuffle); 4119 cast<Instruction>(PhiPart)->eraseFromParent(); 4120 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 4121 Incoming = PreviousPart; 4122 } 4123 4124 // Fix the latch value of the new recurrence in the vector loop. 4125 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4126 4127 // Extract the last vector element in the middle block. This will be the 4128 // initial value for the recurrence when jumping to the scalar loop. 4129 auto *ExtractForScalar = Incoming; 4130 if (VF.isVector()) { 4131 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4132 ExtractForScalar = Builder.CreateExtractElement( 4133 ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), 4134 "vector.recur.extract"); 4135 } 4136 // Extract the second last element in the middle block if the 4137 // Phi is used outside the loop. We need to extract the phi itself 4138 // and not the last element (the phi update in the current iteration). This 4139 // will be the value when jumping to the exit block from the LoopMiddleBlock, 4140 // when the scalar loop is not run at all. 4141 Value *ExtractForPhiUsedOutsideLoop = nullptr; 4142 if (VF.isVector()) 4143 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 4144 Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), 4145 "vector.recur.extract.for.phi"); 4146 // When loop is unrolled without vectorizing, initialize 4147 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 4148 // `Incoming`. This is analogous to the vectorized case above: extracting the 4149 // second last element when VF > 1. 4150 else if (UF > 1) 4151 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 4152 4153 // Fix the initial value of the original recurrence in the scalar loop. 4154 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 4155 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 4156 for (auto *BB : predecessors(LoopScalarPreHeader)) { 4157 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 4158 Start->addIncoming(Incoming, BB); 4159 } 4160 4161 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 4162 Phi->setName("scalar.recur"); 4163 4164 // Finally, fix users of the recurrence outside the loop. The users will need 4165 // either the last value of the scalar recurrence or the last value of the 4166 // vector recurrence we extracted in the middle block. Since the loop is in 4167 // LCSSA form, we just need to find all the phi nodes for the original scalar 4168 // recurrence in the exit block, and then add an edge for the middle block. 4169 // Note that LCSSA does not imply single entry when the original scalar loop 4170 // had multiple exiting edges (as we always run the last iteration in the 4171 // scalar epilogue); in that case, the exiting path through middle will be 4172 // dynamically dead and the value picked for the phi doesn't matter. 4173 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4174 if (any_of(LCSSAPhi.incoming_values(), 4175 [Phi](Value *V) { return V == Phi; })) 4176 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 4177 } 4178 4179 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 4180 // Get it's reduction variable descriptor. 
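// To illustrate what the code below produces, here is a rough sketch
// (shorthand only, not the exact IR this pass emits) for a sum reduction
//
//   for (int i = 0; i < n; ++i)
//     s += a[i];
//
// vectorized with VF = 4, UF = 1:
//
//   vector.body:
//     vec.phi  = phi [<s, 0, 0, 0>, vector.ph], [vec.add, vector.body]
//     vec.load = a[i, i+1, i+2, i+3]
//     vec.add  = vec.phi + vec.load
//     br cond, vector.body, middle.block
//
//   middle.block:
//     rdx = reduce.add(vec.add)
//
// The code below adds the missing phi operands, combines the UF unrolled
// parts, creates the final reduction in the middle block, and merges the
// result with the start value for the scalar remainder loop.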
4181 assert(Legal->isReductionVariable(Phi) && 4182 "Unable to find the reduction variable"); 4183 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4184 4185 RecurKind RK = RdxDesc.getRecurrenceKind(); 4186 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 4187 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 4188 setDebugLocFromInst(Builder, ReductionStartValue); 4189 bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi); 4190 4191 // This is the vector-clone of the value that leaves the loop. 4192 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 4193 4194 // Wrap flags are in general invalid after vectorization, clear them. 4195 clearReductionWrapFlags(RdxDesc); 4196 4197 // Fix the vector-loop phi. 4198 4199 // Reductions do not have to start at zero. They can start with 4200 // any loop invariant values. 4201 BasicBlock *Latch = OrigLoop->getLoopLatch(); 4202 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 4203 4204 for (unsigned Part = 0; Part < UF; ++Part) { 4205 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 4206 Value *Val = getOrCreateVectorValue(LoopVal, Part); 4207 cast<PHINode>(VecRdxPhi) 4208 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4209 } 4210 4211 // Before each round, move the insertion point right between 4212 // the PHIs and the values we are going to write. 4213 // This allows us to write both PHINodes and the extractelement 4214 // instructions. 4215 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4216 4217 setDebugLocFromInst(Builder, LoopExitInst); 4218 4219 // If tail is folded by masking, the vector value to leave the loop should be 4220 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 4221 // instead of the former. For an inloop reduction the reduction will already 4222 // be predicated, and does not need to be handled here. 4223 if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) { 4224 for (unsigned Part = 0; Part < UF; ++Part) { 4225 Value *VecLoopExitInst = 4226 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4227 Value *Sel = nullptr; 4228 for (User *U : VecLoopExitInst->users()) { 4229 if (isa<SelectInst>(U)) { 4230 assert(!Sel && "Reduction exit feeding two selects"); 4231 Sel = U; 4232 } else 4233 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 4234 } 4235 assert(Sel && "Reduction exit feeds no select"); 4236 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 4237 4238 // If the target can create a predicated operator for the reduction at no 4239 // extra cost in the loop (for example a predicated vadd), it can be 4240 // cheaper for the select to remain in the loop than be sunk out of it, 4241 // and so use the select value for the phi instead of the old 4242 // LoopExitValue. 4243 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 4244 if (PreferPredicatedReductionSelect || 4245 TTI->preferPredicatedReductionSelect( 4246 RdxDesc.getOpcode(), Phi->getType(), 4247 TargetTransformInfo::ReductionFlags())) { 4248 auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part)); 4249 VecRdxPhi->setIncomingValueForBlock( 4250 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); 4251 } 4252 } 4253 } 4254 4255 // If the vector reduction can be performed in a smaller type, we truncate 4256 // then extend the loop exit value to enable InstCombine to evaluate the 4257 // entire expression in the smaller type. 
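// For illustration (a sketch, not the exact IR): if an i8 reduction was
// widened to <4 x i32>, each unrolled part is rewritten as
//
//   %trunc = trunc <4 x i32> %rdx.part to <4 x i8>
//   %extnd = {s,z}ext <4 x i8> %trunc to <4 x i32>  ; other users now use %extnd
//
// and the final truncation to <4 x i8> is emitted in the middle block, so
// InstCombine can later evaluate the whole reduction in i8.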
4258 if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { 4259 assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); 4260 assert(!VF.isScalable() && "scalable vectors not yet supported."); 4261 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 4262 Builder.SetInsertPoint( 4263 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 4264 VectorParts RdxParts(UF); 4265 for (unsigned Part = 0; Part < UF; ++Part) { 4266 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4267 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4268 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 4269 : Builder.CreateZExt(Trunc, VecTy); 4270 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 4271 UI != RdxParts[Part]->user_end();) 4272 if (*UI != Trunc) { 4273 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 4274 RdxParts[Part] = Extnd; 4275 } else { 4276 ++UI; 4277 } 4278 } 4279 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 4280 for (unsigned Part = 0; Part < UF; ++Part) { 4281 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 4282 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 4283 } 4284 } 4285 4286 // Reduce all of the unrolled parts into a single vector. 4287 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 4288 unsigned Op = RecurrenceDescriptor::getOpcode(RK); 4289 4290 // The middle block terminator has already been assigned a DebugLoc here (the 4291 // OrigLoop's single latch terminator). We want the whole middle block to 4292 // appear to execute on this line because: (a) it is all compiler generated, 4293 // (b) these instructions are always executed after evaluating the latch 4294 // conditional branch, and (c) other passes may add new predecessors which 4295 // terminate on this line. This is the easiest way to ensure we don't 4296 // accidentally cause an extra step back into the loop while debugging. 4297 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 4298 { 4299 // Floating-point operations should have some FMF to enable the reduction. 4300 IRBuilderBase::FastMathFlagGuard FMFG(Builder); 4301 Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 4302 for (unsigned Part = 1; Part < UF; ++Part) { 4303 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 4304 if (Op != Instruction::ICmp && Op != Instruction::FCmp) { 4305 ReducedPartRdx = Builder.CreateBinOp( 4306 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); 4307 } else { 4308 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); 4309 } 4310 } 4311 } 4312 4313 // Create the reduction after the loop. Note that inloop reductions create the 4314 // target reduction in the loop using a Reduction recipe. 4315 if (VF.isVector() && !IsInLoopReductionPhi) { 4316 ReducedPartRdx = 4317 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx); 4318 // If the reduction can be performed in a smaller type, we need to extend 4319 // the reduction to the wider type before we branch to the original loop. 4320 if (Phi->getType() != RdxDesc.getRecurrenceType()) 4321 ReducedPartRdx = 4322 RdxDesc.isSigned() 4323 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 4324 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 4325 } 4326 4327 // Create a phi node that merges control-flow from the backedge-taken check 4328 // block and the middle block. 
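// Shorthand for what is created here (a sketch):
//
//   scalar.ph:
//     bc.merge.rdx = phi [ reduction start value, bypass blocks ],
//                        [ reduced vector result,  middle.block ]
//
// The scalar remainder loop then resumes the reduction from this phi.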
4329 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 4330 LoopScalarPreHeader->getTerminator()); 4331 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 4332 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 4333 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 4334 4335 // Now, we need to fix the users of the reduction variable 4336 // inside and outside of the scalar remainder loop. 4337 4338 // We know that the loop is in LCSSA form. We need to update the PHI nodes 4339 // in the exit blocks. See comment on analogous loop in 4340 // fixFirstOrderRecurrence for a more complete explaination of the logic. 4341 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) 4342 if (any_of(LCSSAPhi.incoming_values(), 4343 [LoopExitInst](Value *V) { return V == LoopExitInst; })) 4344 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 4345 4346 // Fix the scalar loop reduction variable with the incoming reduction sum 4347 // from the vector body and from the backedge value. 4348 int IncomingEdgeBlockIdx = 4349 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 4350 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 4351 // Pick the other block. 4352 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 4353 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 4354 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 4355 } 4356 4357 void InnerLoopVectorizer::clearReductionWrapFlags( 4358 RecurrenceDescriptor &RdxDesc) { 4359 RecurKind RK = RdxDesc.getRecurrenceKind(); 4360 if (RK != RecurKind::Add && RK != RecurKind::Mul) 4361 return; 4362 4363 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 4364 assert(LoopExitInstr && "null loop exit instruction"); 4365 SmallVector<Instruction *, 8> Worklist; 4366 SmallPtrSet<Instruction *, 8> Visited; 4367 Worklist.push_back(LoopExitInstr); 4368 Visited.insert(LoopExitInstr); 4369 4370 while (!Worklist.empty()) { 4371 Instruction *Cur = Worklist.pop_back_val(); 4372 if (isa<OverflowingBinaryOperator>(Cur)) 4373 for (unsigned Part = 0; Part < UF; ++Part) { 4374 Value *V = getOrCreateVectorValue(Cur, Part); 4375 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 4376 } 4377 4378 for (User *U : Cur->users()) { 4379 Instruction *UI = cast<Instruction>(U); 4380 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 4381 Visited.insert(UI).second) 4382 Worklist.push_back(UI); 4383 } 4384 } 4385 } 4386 4387 void InnerLoopVectorizer::fixLCSSAPHIs() { 4388 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 4389 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1) 4390 // Some phis were already hand updated by the reduction and recurrence 4391 // code above, leave them alone. 4392 continue; 4393 4394 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 4395 // Non-instruction incoming values will have only one value. 4396 unsigned LastLane = 0; 4397 if (isa<Instruction>(IncomingValue)) 4398 LastLane = Cost->isUniformAfterVectorization( 4399 cast<Instruction>(IncomingValue), VF) 4400 ? 0 4401 : VF.getKnownMinValue() - 1; 4402 assert((!VF.isScalable() || LastLane == 0) && 4403 "scalable vectors dont support non-uniform scalars yet"); 4404 // Can be a loop invariant incoming value or the last scalar value to be 4405 // extracted from the vectorized loop. 
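// For example (a sketch): a loop-defined value %t used after the loop has an
// exit phi `%t.lcssa = phi [ %t, %loop.exiting ]`. Here we add the incoming
// value `[ last scalar lane of %t, %middle.block ]` so the phi is also
// correct on the path that bypasses the scalar remainder loop.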
4406 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 4407 Value *lastIncomingValue = 4408 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 4409 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 4410 } 4411 } 4412 4413 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 4414 // The basic block and loop containing the predicated instruction. 4415 auto *PredBB = PredInst->getParent(); 4416 auto *VectorLoop = LI->getLoopFor(PredBB); 4417 4418 // Initialize a worklist with the operands of the predicated instruction. 4419 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 4420 4421 // Holds instructions that we need to analyze again. An instruction may be 4422 // reanalyzed if we don't yet know if we can sink it or not. 4423 SmallVector<Instruction *, 8> InstsToReanalyze; 4424 4425 // Returns true if a given use occurs in the predicated block. Phi nodes use 4426 // their operands in their corresponding predecessor blocks. 4427 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 4428 auto *I = cast<Instruction>(U.getUser()); 4429 BasicBlock *BB = I->getParent(); 4430 if (auto *Phi = dyn_cast<PHINode>(I)) 4431 BB = Phi->getIncomingBlock( 4432 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 4433 return BB == PredBB; 4434 }; 4435 4436 // Iteratively sink the scalarized operands of the predicated instruction 4437 // into the block we created for it. When an instruction is sunk, it's 4438 // operands are then added to the worklist. The algorithm ends after one pass 4439 // through the worklist doesn't sink a single instruction. 4440 bool Changed; 4441 do { 4442 // Add the instructions that need to be reanalyzed to the worklist, and 4443 // reset the changed indicator. 4444 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 4445 InstsToReanalyze.clear(); 4446 Changed = false; 4447 4448 while (!Worklist.empty()) { 4449 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 4450 4451 // We can't sink an instruction if it is a phi node, is already in the 4452 // predicated block, is not in the loop, or may have side effects. 4453 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 4454 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 4455 continue; 4456 4457 // It's legal to sink the instruction if all its uses occur in the 4458 // predicated block. Otherwise, there's nothing to do yet, and we may 4459 // need to reanalyze the instruction. 4460 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 4461 InstsToReanalyze.push_back(I); 4462 continue; 4463 } 4464 4465 // Move the instruction to the beginning of the predicated block, and add 4466 // it's operands to the worklist. 4467 I->moveBefore(&*PredBB->getFirstInsertionPt()); 4468 Worklist.insert(I->op_begin(), I->op_end()); 4469 4470 // The sinking may have enabled other instructions to be sunk, so we will 4471 // need to iterate. 
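// For illustration (a hypothetical sketch): once %mul = mul %add, 3 has been
// sunk into the predicated block, %add may then have all of its uses inside
// that block as well and become sinkable on the next pass over the worklist.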
4472 Changed = true; 4473 } 4474 } while (Changed); 4475 } 4476 4477 void InnerLoopVectorizer::fixNonInductionPHIs() { 4478 for (PHINode *OrigPhi : OrigPHIsToFix) { 4479 PHINode *NewPhi = 4480 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4481 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4482 4483 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4484 predecessors(OrigPhi->getParent())); 4485 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4486 predecessors(NewPhi->getParent())); 4487 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4488 "Scalar and Vector BB should have the same number of predecessors"); 4489 4490 // The insertion point in Builder may be invalidated by the time we get 4491 // here. Force the Builder insertion point to something valid so that we do 4492 // not run into issues during insertion point restore in 4493 // getOrCreateVectorValue calls below. 4494 Builder.SetInsertPoint(NewPhi); 4495 4496 // The predecessor order is preserved and we can rely on mapping between 4497 // scalar and vector block predecessors. 4498 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4499 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4500 4501 // When looking up the new scalar/vector values to fix up, use incoming 4502 // values from original phi. 4503 Value *ScIncV = 4504 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4505 4506 // Scalar incoming value may need a broadcast 4507 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4508 NewPhi->addIncoming(NewIncV, NewPredBB); 4509 } 4510 } 4511 } 4512 4513 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, 4514 VPUser &Operands, unsigned UF, 4515 ElementCount VF, bool IsPtrLoopInvariant, 4516 SmallBitVector &IsIndexLoopInvariant, 4517 VPTransformState &State) { 4518 // Construct a vector GEP by widening the operands of the scalar GEP as 4519 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4520 // results in a vector of pointers when at least one operand of the GEP 4521 // is vector-typed. Thus, to keep the representation compact, we only use 4522 // vector-typed operands for loop-varying values. 4523 4524 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4525 // If we are vectorizing, but the GEP has only loop-invariant operands, 4526 // the GEP we build (by only using vector-typed operands for 4527 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4528 // produce a vector of pointers, we need to either arbitrarily pick an 4529 // operand to broadcast, or broadcast a clone of the original GEP. 4530 // Here, we broadcast a clone of the original. 4531 // 4532 // TODO: If at some point we decide to scalarize instructions having 4533 // loop-invariant operands, this special case will no longer be 4534 // required. We would add the scalarization decision to 4535 // collectLoopScalars() and teach getVectorValue() to broadcast 4536 // the lane-zero scalar value. 4537 auto *Clone = Builder.Insert(GEP->clone()); 4538 for (unsigned Part = 0; Part < UF; ++Part) { 4539 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4540 State.set(VPDef, GEP, EntryPart, Part); 4541 addMetadata(EntryPart, GEP); 4542 } 4543 } else { 4544 // If the GEP has at least one loop-varying operand, we are sure to 4545 // produce a vector of pointers. But if we are only unrolling, we want 4546 // to produce a scalar GEP for each unroll part. 
Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
                                     : State.get(Operands.getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
        VPValue *Operand = Operands.getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, {0, 0}));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector otherwise.
      auto *NewGEP =
          GEP->isInBounds()
              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
                                          Indices)
              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
      assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(VPDef, GEP, NewGEP, Part);
      addMetadata(NewGEP, GEP);
    }
  }
}

void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                              RecurrenceDescriptor *RdxDesc,
                                              Value *StartV, unsigned UF,
                                              ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy =
        (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
    Value *Iden = nullptr;
    bool ScalarPHI =
        (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
    Type *VecTy =
        ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);

    if (RdxDesc) {
      assert(Legal->isReductionVariable(P) && StartV &&
             "RdxDesc should only be set for reduction variables; in that case "
             "a StartV is also required");
      RecurKind RK = RdxDesc->getRecurrenceKind();
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
        // MinMax reductions have the start value as their identity.
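        // For example (a sketch, VF = 4): an smax reduction started at %s gets
        // its vector phi initialized to <%s, %s, %s, %s>, whereas an add
        // reduction (handled below) uses the neutral element for the
        // remaining lanes: <%s, 0, 0, 0>.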
4624 if (ScalarPHI) { 4625 Iden = StartV; 4626 } else { 4627 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4628 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4629 StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident"); 4630 } 4631 } else { 4632 Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity( 4633 RK, VecTy->getScalarType()); 4634 Iden = IdenC; 4635 4636 if (!ScalarPHI) { 4637 Iden = ConstantVector::getSplat(VF, IdenC); 4638 IRBuilderBase::InsertPointGuard IPBuilder(Builder); 4639 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 4640 Constant *Zero = Builder.getInt32(0); 4641 StartV = Builder.CreateInsertElement(Iden, StartV, Zero); 4642 } 4643 } 4644 } 4645 4646 for (unsigned Part = 0; Part < UF; ++Part) { 4647 // This is phase one of vectorizing PHIs. 4648 Value *EntryPart = PHINode::Create( 4649 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4650 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4651 if (StartV) { 4652 // Make sure to add the reduction start value only to the 4653 // first unroll part. 4654 Value *StartVal = (Part == 0) ? StartV : Iden; 4655 cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader); 4656 } 4657 } 4658 return; 4659 } 4660 4661 assert(!Legal->isReductionVariable(P) && 4662 "reductions should be handled above"); 4663 4664 setDebugLocFromInst(Builder, P); 4665 4666 // This PHINode must be an induction variable. 4667 // Make sure that we know about it. 4668 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4669 4670 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4671 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4672 4673 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4674 // which can be found from the original scalar operations. 4675 switch (II.getKind()) { 4676 case InductionDescriptor::IK_NoInduction: 4677 llvm_unreachable("Unknown induction"); 4678 case InductionDescriptor::IK_IntInduction: 4679 case InductionDescriptor::IK_FpInduction: 4680 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4681 case InductionDescriptor::IK_PtrInduction: { 4682 // Handle the pointer induction variable case. 4683 assert(P->getType()->isPointerTy() && "Unexpected type."); 4684 4685 if (Cost->isScalarAfterVectorization(P, VF)) { 4686 // This is the normalized GEP that starts counting at zero. 4687 Value *PtrInd = 4688 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType()); 4689 // Determine the number of scalars we need to generate for each unroll 4690 // iteration. If the instruction is uniform, we only need to generate the 4691 // first lane. Otherwise, we generate all VF values. 4692 unsigned Lanes = 4693 Cost->isUniformAfterVectorization(P, VF) ? 
1 : VF.getKnownMinValue(); 4694 for (unsigned Part = 0; Part < UF; ++Part) { 4695 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4696 Constant *Idx = ConstantInt::get(PtrInd->getType(), 4697 Lane + Part * VF.getKnownMinValue()); 4698 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4699 Value *SclrGep = 4700 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4701 SclrGep->setName("next.gep"); 4702 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4703 } 4704 } 4705 return; 4706 } 4707 assert(isa<SCEVConstant>(II.getStep()) && 4708 "Induction step not a SCEV constant!"); 4709 Type *PhiType = II.getStep()->getType(); 4710 4711 // Build a pointer phi 4712 Value *ScalarStartValue = II.getStartValue(); 4713 Type *ScStValueType = ScalarStartValue->getType(); 4714 PHINode *NewPointerPhi = 4715 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction); 4716 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader); 4717 4718 // A pointer induction, performed by using a gep 4719 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 4720 Instruction *InductionLoc = LoopLatch->getTerminator(); 4721 const SCEV *ScalarStep = II.getStep(); 4722 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 4723 Value *ScalarStepValue = 4724 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); 4725 Value *InductionGEP = GetElementPtrInst::Create( 4726 ScStValueType->getPointerElementType(), NewPointerPhi, 4727 Builder.CreateMul( 4728 ScalarStepValue, 4729 ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), 4730 "ptr.ind", InductionLoc); 4731 NewPointerPhi->addIncoming(InductionGEP, LoopLatch); 4732 4733 // Create UF many actual address geps that use the pointer 4734 // phi as base and a vectorized version of the step value 4735 // (<step*0, ..., step*N>) as offset. 4736 for (unsigned Part = 0; Part < UF; ++Part) { 4737 SmallVector<Constant *, 8> Indices; 4738 // Create a vector of consecutive numbers from zero to VF. 4739 for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) 4740 Indices.push_back( 4741 ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); 4742 Constant *StartOffset = ConstantVector::get(Indices); 4743 4744 Value *GEP = Builder.CreateGEP( 4745 ScStValueType->getPointerElementType(), NewPointerPhi, 4746 Builder.CreateMul( 4747 StartOffset, 4748 Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), 4749 "vector.gep")); 4750 VectorLoopValueMap.setVectorValue(P, Part, GEP); 4751 } 4752 } 4753 } 4754 } 4755 4756 /// A helper function for checking whether an integer division-related 4757 /// instruction may divide by zero (in which case it must be predicated if 4758 /// executed conditionally in the scalar code). 4759 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4760 /// Non-zero divisors that are non compile-time constants will not be 4761 /// converted into multiplication, so we will still end up scalarizing 4762 /// the division, but can do so w/o predication. 
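/// For example: `udiv %x, 7` can never divide by zero and may be left
/// unpredicated even when executed conditionally, whereas `udiv %x, %y`
/// with a non-constant divisor must be predicated, since %y may be zero
/// at run time.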
4763 static bool mayDivideByZero(Instruction &I) { 4764 assert((I.getOpcode() == Instruction::UDiv || 4765 I.getOpcode() == Instruction::SDiv || 4766 I.getOpcode() == Instruction::URem || 4767 I.getOpcode() == Instruction::SRem) && 4768 "Unexpected instruction"); 4769 Value *Divisor = I.getOperand(1); 4770 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4771 return !CInt || CInt->isZero(); 4772 } 4773 4774 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, 4775 VPUser &User, 4776 VPTransformState &State) { 4777 switch (I.getOpcode()) { 4778 case Instruction::Call: 4779 case Instruction::Br: 4780 case Instruction::PHI: 4781 case Instruction::GetElementPtr: 4782 case Instruction::Select: 4783 llvm_unreachable("This instruction is handled by a different recipe."); 4784 case Instruction::UDiv: 4785 case Instruction::SDiv: 4786 case Instruction::SRem: 4787 case Instruction::URem: 4788 case Instruction::Add: 4789 case Instruction::FAdd: 4790 case Instruction::Sub: 4791 case Instruction::FSub: 4792 case Instruction::FNeg: 4793 case Instruction::Mul: 4794 case Instruction::FMul: 4795 case Instruction::FDiv: 4796 case Instruction::FRem: 4797 case Instruction::Shl: 4798 case Instruction::LShr: 4799 case Instruction::AShr: 4800 case Instruction::And: 4801 case Instruction::Or: 4802 case Instruction::Xor: { 4803 // Just widen unops and binops. 4804 setDebugLocFromInst(Builder, &I); 4805 4806 for (unsigned Part = 0; Part < UF; ++Part) { 4807 SmallVector<Value *, 2> Ops; 4808 for (VPValue *VPOp : User.operands()) 4809 Ops.push_back(State.get(VPOp, Part)); 4810 4811 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4812 4813 if (auto *VecOp = dyn_cast<Instruction>(V)) 4814 VecOp->copyIRFlags(&I); 4815 4816 // Use this vector value for all users of the original instruction. 4817 State.set(Def, &I, V, Part); 4818 addMetadata(V, &I); 4819 } 4820 4821 break; 4822 } 4823 case Instruction::ICmp: 4824 case Instruction::FCmp: { 4825 // Widen compares. Generate vector compares. 4826 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4827 auto *Cmp = cast<CmpInst>(&I); 4828 setDebugLocFromInst(Builder, Cmp); 4829 for (unsigned Part = 0; Part < UF; ++Part) { 4830 Value *A = State.get(User.getOperand(0), Part); 4831 Value *B = State.get(User.getOperand(1), Part); 4832 Value *C = nullptr; 4833 if (FCmp) { 4834 // Propagate fast math flags. 4835 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4836 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4837 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4838 } else { 4839 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4840 } 4841 State.set(Def, &I, C, Part); 4842 addMetadata(C, &I); 4843 } 4844 4845 break; 4846 } 4847 4848 case Instruction::ZExt: 4849 case Instruction::SExt: 4850 case Instruction::FPToUI: 4851 case Instruction::FPToSI: 4852 case Instruction::FPExt: 4853 case Instruction::PtrToInt: 4854 case Instruction::IntToPtr: 4855 case Instruction::SIToFP: 4856 case Instruction::UIToFP: 4857 case Instruction::Trunc: 4858 case Instruction::FPTrunc: 4859 case Instruction::BitCast: { 4860 auto *CI = cast<CastInst>(&I); 4861 setDebugLocFromInst(Builder, CI); 4862 4863 /// Vectorize casts. 4864 Type *DestTy = 4865 (VF.isScalar()) ? 
CI->getType() : VectorType::get(CI->getType(), VF); 4866 4867 for (unsigned Part = 0; Part < UF; ++Part) { 4868 Value *A = State.get(User.getOperand(0), Part); 4869 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4870 State.set(Def, &I, Cast, Part); 4871 addMetadata(Cast, &I); 4872 } 4873 break; 4874 } 4875 default: 4876 // This instruction is not vectorized by simple widening. 4877 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4878 llvm_unreachable("Unhandled instruction!"); 4879 } // end of switch. 4880 } 4881 4882 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def, 4883 VPUser &ArgOperands, 4884 VPTransformState &State) { 4885 assert(!isa<DbgInfoIntrinsic>(I) && 4886 "DbgInfoIntrinsic should have been dropped during VPlan construction"); 4887 setDebugLocFromInst(Builder, &I); 4888 4889 Module *M = I.getParent()->getParent()->getParent(); 4890 auto *CI = cast<CallInst>(&I); 4891 4892 SmallVector<Type *, 4> Tys; 4893 for (Value *ArgOperand : CI->arg_operands()) 4894 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); 4895 4896 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4897 4898 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4899 // version of the instruction. 4900 // Is it beneficial to perform intrinsic call compared to lib call? 4901 bool NeedToScalarize = false; 4902 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4903 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0; 4904 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 4905 assert((UseVectorIntrinsic || !NeedToScalarize) && 4906 "Instruction should be scalarized elsewhere."); 4907 assert(IntrinsicCost.isValid() && CallCost.isValid() && 4908 "Cannot have invalid costs while widening"); 4909 4910 for (unsigned Part = 0; Part < UF; ++Part) { 4911 SmallVector<Value *, 4> Args; 4912 for (auto &I : enumerate(ArgOperands.operands())) { 4913 // Some intrinsics have a scalar argument - don't replace it with a 4914 // vector. 4915 Value *Arg; 4916 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index())) 4917 Arg = State.get(I.value(), Part); 4918 else 4919 Arg = State.get(I.value(), {0, 0}); 4920 Args.push_back(Arg); 4921 } 4922 4923 Function *VectorF; 4924 if (UseVectorIntrinsic) { 4925 // Use vector version of the intrinsic. 4926 Type *TysForDecl[] = {CI->getType()}; 4927 if (VF.isVector()) { 4928 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 4929 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4930 } 4931 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4932 assert(VectorF && "Can't retrieve vector intrinsic."); 4933 } else { 4934 // Use vector version of the function call. 
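      // For example (a sketch; the exact name depends on the target and the
      // enabled vector library): a call to `sinf` widened to VF = 4 may
      // resolve through VFDatabase/TLI to a variant such as `_ZGVbN4v_sinf`,
      // which is then called with the widened arguments below.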
4935 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/); 4936 #ifndef NDEBUG 4937 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr && 4938 "Can't create vector function."); 4939 #endif 4940 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4941 } 4942 SmallVector<OperandBundleDef, 1> OpBundles; 4943 CI->getOperandBundlesAsDefs(OpBundles); 4944 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4945 4946 if (isa<FPMathOperator>(V)) 4947 V->copyFastMathFlags(CI); 4948 4949 State.set(Def, &I, V, Part); 4950 addMetadata(V, &I); 4951 } 4952 } 4953 4954 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef, 4955 VPUser &Operands, 4956 bool InvariantCond, 4957 VPTransformState &State) { 4958 setDebugLocFromInst(Builder, &I); 4959 4960 // The condition can be loop invariant but still defined inside the 4961 // loop. This means that we can't just use the original 'cond' value. 4962 // We have to take the 'vectorized' value and pick the first lane. 4963 // Instcombine will make this a no-op. 4964 auto *InvarCond = 4965 InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr; 4966 4967 for (unsigned Part = 0; Part < UF; ++Part) { 4968 Value *Cond = 4969 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part); 4970 Value *Op0 = State.get(Operands.getOperand(1), Part); 4971 Value *Op1 = State.get(Operands.getOperand(2), Part); 4972 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1); 4973 State.set(VPDef, &I, Sel, Part); 4974 addMetadata(Sel, &I); 4975 } 4976 } 4977 4978 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 4979 // We should not collect Scalars more than once per VF. Right now, this 4980 // function is called from collectUniformsAndScalars(), which already does 4981 // this check. Collecting Scalars for VF=1 does not make any sense. 4982 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && 4983 "This function should not be visited twice for the same VF"); 4984 4985 SmallSetVector<Instruction *, 8> Worklist; 4986 4987 // These sets are used to seed the analysis with pointers used by memory 4988 // accesses that will remain scalar. 4989 SmallSetVector<Instruction *, 8> ScalarPtrs; 4990 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4991 auto *Latch = TheLoop->getLoopLatch(); 4992 4993 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4994 // The pointer operands of loads and stores will be scalar as long as the 4995 // memory access is not a gather or scatter operation. The value operand of a 4996 // store will remain scalar if the store is scalarized. 4997 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4998 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4999 assert(WideningDecision != CM_Unknown && 5000 "Widening decision should be ready at this moment"); 5001 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 5002 if (Ptr == Store->getValueOperand()) 5003 return WideningDecision == CM_Scalarize; 5004 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 5005 "Ptr is neither a value or pointer operand"); 5006 return WideningDecision != CM_GatherScatter; 5007 }; 5008 5009 // A helper that returns true if the given value is a bitcast or 5010 // getelementptr instruction contained in the loop. 
5011 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 5012 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 5013 isa<GetElementPtrInst>(V)) && 5014 !TheLoop->isLoopInvariant(V); 5015 }; 5016 5017 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) { 5018 if (!isa<PHINode>(Ptr) || 5019 !Legal->getInductionVars().count(cast<PHINode>(Ptr))) 5020 return false; 5021 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)]; 5022 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction) 5023 return false; 5024 return isScalarUse(MemAccess, Ptr); 5025 }; 5026 5027 // A helper that evaluates a memory access's use of a pointer. If the 5028 // pointer is actually the pointer induction of a loop, it is being 5029 // inserted into Worklist. If the use will be a scalar use, and the 5030 // pointer is only used by memory accesses, we place the pointer in 5031 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs. 5032 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 5033 if (isScalarPtrInduction(MemAccess, Ptr)) { 5034 Worklist.insert(cast<Instruction>(Ptr)); 5035 Instruction *Update = cast<Instruction>( 5036 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch)); 5037 Worklist.insert(Update); 5038 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr 5039 << "\n"); 5040 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update 5041 << "\n"); 5042 return; 5043 } 5044 // We only care about bitcast and getelementptr instructions contained in 5045 // the loop. 5046 if (!isLoopVaryingBitCastOrGEP(Ptr)) 5047 return; 5048 5049 // If the pointer has already been identified as scalar (e.g., if it was 5050 // also identified as uniform), there's nothing to do. 5051 auto *I = cast<Instruction>(Ptr); 5052 if (Worklist.count(I)) 5053 return; 5054 5055 // If the use of the pointer will be a scalar use, and all users of the 5056 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 5057 // place the pointer in PossibleNonScalarPtrs. 5058 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 5059 return isa<LoadInst>(U) || isa<StoreInst>(U); 5060 })) 5061 ScalarPtrs.insert(I); 5062 else 5063 PossibleNonScalarPtrs.insert(I); 5064 }; 5065 5066 // We seed the scalars analysis with three classes of instructions: (1) 5067 // instructions marked uniform-after-vectorization and (2) bitcast, 5068 // getelementptr and (pointer) phi instructions used by memory accesses 5069 // requiring a scalar use. 5070 // 5071 // (1) Add to the worklist all instructions that have been identified as 5072 // uniform-after-vectorization. 5073 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 5074 5075 // (2) Add to the worklist all bitcast and getelementptr instructions used by 5076 // memory accesses requiring a scalar use. The pointer operands of loads and 5077 // stores will be scalar as long as the memory accesses is not a gather or 5078 // scatter operation. The value operand of a store will remain scalar if the 5079 // store is scalarized. 
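  // For example (a sketch): the GEP computing &a[i] for a consecutive store
  // `a[i] = x` is a scalar use and, if its only users are memory accesses,
  // lands in ScalarPtrs; a GEP feeding an access that will become a
  // gather/scatter is placed in PossibleNonScalarPtrs instead.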
5080 for (auto *BB : TheLoop->blocks()) 5081 for (auto &I : *BB) { 5082 if (auto *Load = dyn_cast<LoadInst>(&I)) { 5083 evaluatePtrUse(Load, Load->getPointerOperand()); 5084 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 5085 evaluatePtrUse(Store, Store->getPointerOperand()); 5086 evaluatePtrUse(Store, Store->getValueOperand()); 5087 } 5088 } 5089 for (auto *I : ScalarPtrs) 5090 if (!PossibleNonScalarPtrs.count(I)) { 5091 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 5092 Worklist.insert(I); 5093 } 5094 5095 // Insert the forced scalars. 5096 // FIXME: Currently widenPHIInstruction() often creates a dead vector 5097 // induction variable when the PHI user is scalarized. 5098 auto ForcedScalar = ForcedScalars.find(VF); 5099 if (ForcedScalar != ForcedScalars.end()) 5100 for (auto *I : ForcedScalar->second) 5101 Worklist.insert(I); 5102 5103 // Expand the worklist by looking through any bitcasts and getelementptr 5104 // instructions we've already identified as scalar. This is similar to the 5105 // expansion step in collectLoopUniforms(); however, here we're only 5106 // expanding to include additional bitcasts and getelementptr instructions. 5107 unsigned Idx = 0; 5108 while (Idx != Worklist.size()) { 5109 Instruction *Dst = Worklist[Idx++]; 5110 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 5111 continue; 5112 auto *Src = cast<Instruction>(Dst->getOperand(0)); 5113 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 5114 auto *J = cast<Instruction>(U); 5115 return !TheLoop->contains(J) || Worklist.count(J) || 5116 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 5117 isScalarUse(J, Src)); 5118 })) { 5119 Worklist.insert(Src); 5120 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 5121 } 5122 } 5123 5124 // An induction variable will remain scalar if all users of the induction 5125 // variable and induction variable update remain scalar. 5126 for (auto &Induction : Legal->getInductionVars()) { 5127 auto *Ind = Induction.first; 5128 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5129 5130 // If tail-folding is applied, the primary induction variable will be used 5131 // to feed a vector compare. 5132 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) 5133 continue; 5134 5135 // Determine if all users of the induction variable are scalar after 5136 // vectorization. 5137 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5138 auto *I = cast<Instruction>(U); 5139 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 5140 }); 5141 if (!ScalarInd) 5142 continue; 5143 5144 // Determine if all users of the induction variable update instruction are 5145 // scalar after vectorization. 5146 auto ScalarIndUpdate = 5147 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5148 auto *I = cast<Instruction>(U); 5149 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 5150 }); 5151 if (!ScalarIndUpdate) 5152 continue; 5153 5154 // The induction variable and its update instruction will remain scalar. 
5155 Worklist.insert(Ind); 5156 Worklist.insert(IndUpdate); 5157 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 5158 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 5159 << "\n"); 5160 } 5161 5162 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 5163 } 5164 5165 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, 5166 ElementCount VF) { 5167 if (!blockNeedsPredication(I->getParent())) 5168 return false; 5169 switch(I->getOpcode()) { 5170 default: 5171 break; 5172 case Instruction::Load: 5173 case Instruction::Store: { 5174 if (!Legal->isMaskRequired(I)) 5175 return false; 5176 auto *Ptr = getLoadStorePointerOperand(I); 5177 auto *Ty = getMemInstValueType(I); 5178 // We have already decided how to vectorize this instruction, get that 5179 // result. 5180 if (VF.isVector()) { 5181 InstWidening WideningDecision = getWideningDecision(I, VF); 5182 assert(WideningDecision != CM_Unknown && 5183 "Widening decision should be ready at this moment"); 5184 return WideningDecision == CM_Scalarize; 5185 } 5186 const Align Alignment = getLoadStoreAlignment(I); 5187 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 5188 isLegalMaskedGather(Ty, Alignment)) 5189 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 5190 isLegalMaskedScatter(Ty, Alignment)); 5191 } 5192 case Instruction::UDiv: 5193 case Instruction::SDiv: 5194 case Instruction::SRem: 5195 case Instruction::URem: 5196 return mayDivideByZero(*I); 5197 } 5198 return false; 5199 } 5200 5201 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( 5202 Instruction *I, ElementCount VF) { 5203 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 5204 assert(getWideningDecision(I, VF) == CM_Unknown && 5205 "Decision should not be set yet."); 5206 auto *Group = getInterleavedAccessGroup(I); 5207 assert(Group && "Must have a group."); 5208 5209 // If the instruction's allocated size doesn't equal it's type size, it 5210 // requires padding and will be scalarized. 5211 auto &DL = I->getModule()->getDataLayout(); 5212 auto *ScalarTy = getMemInstValueType(I); 5213 if (hasIrregularType(ScalarTy, DL, VF)) 5214 return false; 5215 5216 // Check if masking is required. 5217 // A Group may need masking for one of two reasons: it resides in a block that 5218 // needs predication, or it was decided to use masking to deal with gaps. 5219 bool PredicatedAccessRequiresMasking = 5220 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 5221 bool AccessWithGapsRequiresMasking = 5222 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5223 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 5224 return true; 5225 5226 // If masked interleaving is required, we expect that the user/target had 5227 // enabled it, because otherwise it either wouldn't have been created or 5228 // it should have been invalidated by the CostModel. 5229 assert(useMaskedInterleavedAccesses(TTI) && 5230 "Masked interleave-groups for predicated accesses are not enabled."); 5231 5232 auto *Ty = getMemInstValueType(I); 5233 const Align Alignment = getLoadStoreAlignment(I); 5234 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 5235 : TTI.isLegalMaskedStore(Ty, Alignment); 5236 } 5237 5238 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( 5239 Instruction *I, ElementCount VF) { 5240 // Get and ensure we have a valid memory instruction. 
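  // For illustration (a sketch): a unit-stride access such as a[i] passes the
  // checks below and can be widened into a single wide load/store, whereas a
  // non-consecutive access such as a[2*i], a store that must be predicated,
  // or an access whose allocated size differs from its type size cannot.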
5241 LoadInst *LI = dyn_cast<LoadInst>(I); 5242 StoreInst *SI = dyn_cast<StoreInst>(I); 5243 assert((LI || SI) && "Invalid memory instruction"); 5244 5245 auto *Ptr = getLoadStorePointerOperand(I); 5246 5247 // In order to be widened, the pointer should be consecutive, first of all. 5248 if (!Legal->isConsecutivePtr(Ptr)) 5249 return false; 5250 5251 // If the instruction is a store located in a predicated block, it will be 5252 // scalarized. 5253 if (isScalarWithPredication(I)) 5254 return false; 5255 5256 // If the instruction's allocated size doesn't equal it's type size, it 5257 // requires padding and will be scalarized. 5258 auto &DL = I->getModule()->getDataLayout(); 5259 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 5260 if (hasIrregularType(ScalarTy, DL, VF)) 5261 return false; 5262 5263 return true; 5264 } 5265 5266 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { 5267 // We should not collect Uniforms more than once per VF. Right now, 5268 // this function is called from collectUniformsAndScalars(), which 5269 // already does this check. Collecting Uniforms for VF=1 does not make any 5270 // sense. 5271 5272 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && 5273 "This function should not be visited twice for the same VF"); 5274 5275 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 5276 // not analyze again. Uniforms.count(VF) will return 1. 5277 Uniforms[VF].clear(); 5278 5279 // We now know that the loop is vectorizable! 5280 // Collect instructions inside the loop that will remain uniform after 5281 // vectorization. 5282 5283 // Global values, params and instructions outside of current loop are out of 5284 // scope. 5285 auto isOutOfScope = [&](Value *V) -> bool { 5286 Instruction *I = dyn_cast<Instruction>(V); 5287 return (!I || !TheLoop->contains(I)); 5288 }; 5289 5290 SetVector<Instruction *> Worklist; 5291 BasicBlock *Latch = TheLoop->getLoopLatch(); 5292 5293 // Instructions that are scalar with predication must not be considered 5294 // uniform after vectorization, because that would create an erroneous 5295 // replicating region where only a single instance out of VF should be formed. 5296 // TODO: optimize such seldom cases if found important, see PR40816. 5297 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 5298 if (isOutOfScope(I)) { 5299 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " 5300 << *I << "\n"); 5301 return; 5302 } 5303 if (isScalarWithPredication(I, VF)) { 5304 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 5305 << *I << "\n"); 5306 return; 5307 } 5308 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 5309 Worklist.insert(I); 5310 }; 5311 5312 // Start with the conditional branch. If the branch condition is an 5313 // instruction contained in the loop that is only used by the branch, it is 5314 // uniform. 5315 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 5316 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 5317 addToWorklistIfAllowed(Cmp); 5318 5319 auto isUniformDecision = [&](Instruction *I, ElementCount VF) { 5320 InstWidening WideningDecision = getWideningDecision(I, VF); 5321 assert(WideningDecision != CM_Unknown && 5322 "Widening decision should be ready at this moment"); 5323 5324 // A uniform memory op is itself uniform. We exclude uniform stores 5325 // here as they demand the last lane, not the first one. 
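    // For example (a sketch): a load from an invariant address, `v = *p`,
    // produces the same value for every lane and is uniform, while an
    // invariant store `*p = x` must keep the value of the last lane and is
    // deliberately not treated as uniform here.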
5326 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) { 5327 assert(WideningDecision == CM_Scalarize); 5328 return true; 5329 } 5330 5331 return (WideningDecision == CM_Widen || 5332 WideningDecision == CM_Widen_Reverse || 5333 WideningDecision == CM_Interleave); 5334 }; 5335 5336 5337 // Returns true if Ptr is the pointer operand of a memory access instruction 5338 // I, and I is known to not require scalarization. 5339 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 5340 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 5341 }; 5342 5343 // Holds a list of values which are known to have at least one uniform use. 5344 // Note that there may be other uses which aren't uniform. A "uniform use" 5345 // here is something which only demands lane 0 of the unrolled iterations; 5346 // it does not imply that all lanes produce the same value (e.g. this is not 5347 // the usual meaning of uniform) 5348 SmallPtrSet<Value *, 8> HasUniformUse; 5349 5350 // Scan the loop for instructions which are either a) known to have only 5351 // lane 0 demanded or b) are uses which demand only lane 0 of their operand. 5352 for (auto *BB : TheLoop->blocks()) 5353 for (auto &I : *BB) { 5354 // If there's no pointer operand, there's nothing to do. 5355 auto *Ptr = getLoadStorePointerOperand(&I); 5356 if (!Ptr) 5357 continue; 5358 5359 // A uniform memory op is itself uniform. We exclude uniform stores 5360 // here as they demand the last lane, not the first one. 5361 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I)) 5362 addToWorklistIfAllowed(&I); 5363 5364 if (isUniformDecision(&I, VF)) { 5365 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check"); 5366 HasUniformUse.insert(Ptr); 5367 } 5368 } 5369 5370 // Add to the worklist any operands which have *only* uniform (e.g. lane 0 5371 // demanding) users. Since loops are assumed to be in LCSSA form, this 5372 // disallows uses outside the loop as well. 5373 for (auto *V : HasUniformUse) { 5374 if (isOutOfScope(V)) 5375 continue; 5376 auto *I = cast<Instruction>(V); 5377 auto UsersAreMemAccesses = 5378 llvm::all_of(I->users(), [&](User *U) -> bool { 5379 return isVectorizedMemAccessUse(cast<Instruction>(U), V); 5380 }); 5381 if (UsersAreMemAccesses) 5382 addToWorklistIfAllowed(I); 5383 } 5384 5385 // Expand Worklist in topological order: whenever a new instruction 5386 // is added , its users should be already inside Worklist. It ensures 5387 // a uniform instruction will only be used by uniform instructions. 5388 unsigned idx = 0; 5389 while (idx != Worklist.size()) { 5390 Instruction *I = Worklist[idx++]; 5391 5392 for (auto OV : I->operand_values()) { 5393 // isOutOfScope operands cannot be uniform instructions. 5394 if (isOutOfScope(OV)) 5395 continue; 5396 // First order recurrence Phi's should typically be considered 5397 // non-uniform. 5398 auto *OP = dyn_cast<PHINode>(OV); 5399 if (OP && Legal->isFirstOrderRecurrence(OP)) 5400 continue; 5401 // If all the users of the operand are uniform, then add the 5402 // operand into the uniform worklist. 5403 auto *OI = cast<Instruction>(OV); 5404 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 5405 auto *J = cast<Instruction>(U); 5406 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); 5407 })) 5408 addToWorklistIfAllowed(OI); 5409 } 5410 } 5411 5412 // For an instruction to be added into Worklist above, all its users inside 5413 // the loop should also be in Worklist. 
However, this condition cannot be 5414 // true for phi nodes that form a cyclic dependence. We must process phi 5415 // nodes separately. An induction variable will remain uniform if all users 5416 // of the induction variable and induction variable update remain uniform. 5417 // The code below handles both pointer and non-pointer induction variables. 5418 for (auto &Induction : Legal->getInductionVars()) { 5419 auto *Ind = Induction.first; 5420 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 5421 5422 // Determine if all users of the induction variable are uniform after 5423 // vectorization. 5424 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 5425 auto *I = cast<Instruction>(U); 5426 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 5427 isVectorizedMemAccessUse(I, Ind); 5428 }); 5429 if (!UniformInd) 5430 continue; 5431 5432 // Determine if all users of the induction variable update instruction are 5433 // uniform after vectorization. 5434 auto UniformIndUpdate = 5435 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 5436 auto *I = cast<Instruction>(U); 5437 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 5438 isVectorizedMemAccessUse(I, IndUpdate); 5439 }); 5440 if (!UniformIndUpdate) 5441 continue; 5442 5443 // The induction variable and its update instruction will remain uniform. 5444 addToWorklistIfAllowed(Ind); 5445 addToWorklistIfAllowed(IndUpdate); 5446 } 5447 5448 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 5449 } 5450 5451 bool LoopVectorizationCostModel::runtimeChecksRequired() { 5452 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 5453 5454 if (Legal->getRuntimePointerChecking()->Need) { 5455 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 5456 "runtime pointer checks needed. Enable vectorization of this " 5457 "loop with '#pragma clang loop vectorize(enable)' when " 5458 "compiling with -Os/-Oz", 5459 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5460 return true; 5461 } 5462 5463 if (!PSE.getUnionPredicate().getPredicates().empty()) { 5464 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 5465 "runtime SCEV checks needed. Enable vectorization of this " 5466 "loop with '#pragma clang loop vectorize(enable)' when " 5467 "compiling with -Os/-Oz", 5468 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5469 return true; 5470 } 5471 5472 // FIXME: Avoid specializing for stride==1 instead of bailing out. 5473 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 5474 reportVectorizationFailure("Runtime stride check for small trip count", 5475 "runtime stride == 1 checks needed. Enable vectorization of " 5476 "this loop without such check by compiling with -Os/-Oz", 5477 "CantVersionLoopWithOptForSize", ORE, TheLoop); 5478 return true; 5479 } 5480 5481 return false; 5482 } 5483 5484 Optional<ElementCount> 5485 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { 5486 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 5487 // TODO: It may by useful to do since it's still likely to be dynamically 5488 // uniform if the target can skip. 5489 reportVectorizationFailure( 5490 "Not inserting runtime ptr check for divergent target", 5491 "runtime pointer checks needed. 
Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;

    break;
  }

  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fall back to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return None;
  }

  // Now try folding the tail by masking.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
  assert(!MaxVF.isScalable() &&
         "Scalable vectors do not yet support tail folding");
  assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
         "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC =
      UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
  // Avoid tail folding if the trip count is known to be a multiple of any VF we
  // chose.
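  // Illustrative arithmetic (hypothetical values, not from the source): with a
  // known trip count of 64, MaxVF = 8 and UserIC = 2, MaxVFtimesIC is 16 and
  // the exit count divides evenly, so MaxVF is accepted below without folding
  // the tail. With a trip count of 70 the remainder is 6, so we fall through
  // to the tail-folding logic instead.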
5571 ScalarEvolution *SE = PSE.getSE(); 5572 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 5573 const SCEV *ExitCount = SE->getAddExpr( 5574 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 5575 const SCEV *Rem = SE->getURemExpr( 5576 SE->applyLoopGuards(ExitCount, TheLoop), 5577 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); 5578 if (Rem->isZero()) { 5579 // Accept MaxVF if we do not have a tail. 5580 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 5581 return MaxVF; 5582 } 5583 5584 // If we don't know the precise trip count, or if the trip count that we 5585 // found modulo the vectorization factor is not zero, try to fold the tail 5586 // by masking. 5587 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 5588 if (Legal->prepareToFoldTailByMasking()) { 5589 FoldTailByMasking = true; 5590 return MaxVF; 5591 } 5592 5593 // If there was a tail-folding hint/switch, but we can't fold the tail by 5594 // masking, fallback to a vectorization with a scalar epilogue. 5595 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 5596 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 5597 "scalar epilogue instead.\n"); 5598 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 5599 return MaxVF; 5600 } 5601 5602 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 5603 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 5604 return None; 5605 } 5606 5607 if (TC == 0) { 5608 reportVectorizationFailure( 5609 "Unable to calculate the loop count due to complex control flow", 5610 "unable to calculate the loop count due to complex control flow", 5611 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5612 return None; 5613 } 5614 5615 reportVectorizationFailure( 5616 "Cannot optimize for size and vectorize at the same time.", 5617 "cannot optimize for size and vectorize at the same time. " 5618 "Enable vectorization of this loop with '#pragma clang loop " 5619 "vectorize(enable)' when compiling with -Os/-Oz", 5620 "NoTailLoopWithOptForSize", ORE, TheLoop); 5621 return None; 5622 } 5623 5624 ElementCount 5625 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, 5626 ElementCount UserVF) { 5627 bool IgnoreScalableUserVF = UserVF.isScalable() && 5628 !TTI.supportsScalableVectors() && 5629 !ForceTargetSupportsScalableVectors; 5630 if (IgnoreScalableUserVF) { 5631 LLVM_DEBUG( 5632 dbgs() << "LV: Ignoring VF=" << UserVF 5633 << " because target does not support scalable vectors.\n"); 5634 ORE->emit([&]() { 5635 return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF", 5636 TheLoop->getStartLoc(), 5637 TheLoop->getHeader()) 5638 << "Ignoring VF=" << ore::NV("UserVF", UserVF) 5639 << " because target does not support scalable vectors."; 5640 }); 5641 } 5642 5643 // Beyond this point two scenarios are handled. If UserVF isn't specified 5644 // then a suitable VF is chosen. If UserVF is specified and there are 5645 // dependencies, check if it's legal. However, if a UserVF is specified and 5646 // there are no dependencies, then there's nothing to do. 
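  // As an illustration (hypothetical numbers): a user hint of VF=8 on a loop
  // with no unsafe memory dependencies is returned unchanged just below; the
  // same hint on a loop whose maximum safe dependence distance only permits 4
  // lanes is clamped to VF=4 further down; with no hint at all, a VF is
  // derived from the widest register and the widest scalar type in the loop.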
  if (UserVF.isNonZero() && !IgnoreScalableUserVF &&
      Legal->isSafeForAnyVectorWidth())
    return UserVF;

  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();

  // If the user vectorization factor is legally unsafe, clamp it to a safe
  // value. Otherwise, return as is.
  if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
    unsigned MaxSafeElements =
        PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
    ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);

    if (UserVF.isScalable()) {
      Optional<unsigned> MaxVScale = TTI.getMaxVScale();

      // Scale VF by vscale before checking if it's safe.
      MaxSafeVF = ElementCount::getScalable(
          MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);

      if (MaxSafeVF.isZero()) {
        // The dependence distance is too small to use scalable vectors; fall
        // back on fixed-width vectorization.
        LLVM_DEBUG(
            dbgs()
            << "LV: Max legal vector width too small, scalable vectorization "
               "unfeasible. Using fixed-width vectorization instead.\n");
        ORE->emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
                                            TheLoop->getStartLoc(),
                                            TheLoop->getHeader())
                 << "Max legal vector width too small, scalable vectorization "
                 << "unfeasible. Using fixed-width vectorization instead.";
        });
        return computeFeasibleMaxVF(
            ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
      }
    }

    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");

    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
      return UserVF;

    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
                      << ".\n");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                        TheLoop->getStartLoc(),
                                        TheLoop->getHeader())
             << "User-specified vectorization factor "
             << ore::NV("UserVectorizationFactor", UserVF)
             << " is unsafe, clamping to maximum safe vectorization factor "
             << ore::NV("VectorizationFactor", MaxSafeVF);
    });
    return MaxSafeVF;
  }

  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
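  // Worked example (illustrative numbers only): a 512-bit widest register
  // clamped by a 384-bit maximum safe dependence width, with a widest scalar
  // type of 64 bits, gives 384 / 64 = 6 elements, which PowerOf2Floor rounds
  // down to a MaxVectorSize of 4.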
5719 unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType); 5720 5721 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5722 << " / " << WidestType << " bits.\n"); 5723 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5724 << WidestRegister << " bits.\n"); 5725 5726 assert(MaxVectorSize <= WidestRegister && 5727 "Did not expect to pack so many elements" 5728 " into one vector!"); 5729 if (MaxVectorSize == 0) { 5730 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5731 MaxVectorSize = 1; 5732 return ElementCount::getFixed(MaxVectorSize); 5733 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5734 isPowerOf2_32(ConstTripCount)) { 5735 // We need to clamp the VF to be the ConstTripCount. There is no point in 5736 // choosing a higher viable VF as done in the loop below. 5737 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5738 << ConstTripCount << "\n"); 5739 MaxVectorSize = ConstTripCount; 5740 return ElementCount::getFixed(MaxVectorSize); 5741 } 5742 5743 unsigned MaxVF = MaxVectorSize; 5744 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5745 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5746 // Collect all viable vectorization factors larger than the default MaxVF 5747 // (i.e. MaxVectorSize). 5748 SmallVector<ElementCount, 8> VFs; 5749 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5750 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5751 VFs.push_back(ElementCount::getFixed(VS)); 5752 5753 // For each VF calculate its register usage. 5754 auto RUs = calculateRegisterUsage(VFs); 5755 5756 // Select the largest VF which doesn't require more registers than existing 5757 // ones. 5758 for (int i = RUs.size() - 1; i >= 0; --i) { 5759 bool Selected = true; 5760 for (auto& pair : RUs[i].MaxLocalUsers) { 5761 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5762 if (pair.second > TargetNumRegisters) 5763 Selected = false; 5764 } 5765 if (Selected) { 5766 MaxVF = VFs[i].getKnownMinValue(); 5767 break; 5768 } 5769 } 5770 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5771 if (MaxVF < MinVF) { 5772 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5773 << ") with target's minimum: " << MinVF << '\n'); 5774 MaxVF = MinVF; 5775 } 5776 } 5777 } 5778 return ElementCount::getFixed(MaxVF); 5779 } 5780 5781 VectorizationFactor 5782 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { 5783 // FIXME: This can be fixed for scalable vectors later, because at this stage 5784 // the LoopVectorizer will only consider vectorizing a loop with scalable 5785 // vectors when the loop has a hint to enable vectorization for a given VF. 5786 assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); 5787 5788 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; 5789 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5790 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5791 5792 unsigned Width = 1; 5793 const float ScalarCost = *ExpectedCost.getValue(); 5794 float Cost = ScalarCost; 5795 5796 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5797 if (ForceVectorization && MaxVF.isVector()) { 5798 // Ignore scalar width, because the user explicitly wants vectorization. 5799 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5800 // evaluation. 
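  // (Illustrative numbers, not from the source: with a scalar cost of 16 and
  // a VF=4 body cost of 20, the candidate below is scored as 20 / 4 = 5 per
  // lane, so it is recorded in ProfitableVFs and, being cheaper than 16,
  // becomes the current best width unless a wider VF beats it.)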
5801 Cost = std::numeric_limits<float>::max(); 5802 } 5803 5804 for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) { 5805 // Notice that the vector loop needs to be executed less times, so 5806 // we need to divide the cost of the vector loops by the width of 5807 // the vector elements. 5808 VectorizationCostTy C = expectedCost(ElementCount::getFixed(i)); 5809 assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); 5810 float VectorCost = *C.first.getValue() / (float)i; 5811 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5812 << " costs: " << (int)VectorCost << ".\n"); 5813 if (!C.second && !ForceVectorization) { 5814 LLVM_DEBUG( 5815 dbgs() << "LV: Not considering vector loop of width " << i 5816 << " because it will not generate any vector instructions.\n"); 5817 continue; 5818 } 5819 5820 // If profitable add it to ProfitableVF list. 5821 if (VectorCost < ScalarCost) { 5822 ProfitableVFs.push_back(VectorizationFactor( 5823 {ElementCount::getFixed(i), (unsigned)VectorCost})); 5824 } 5825 5826 if (VectorCost < Cost) { 5827 Cost = VectorCost; 5828 Width = i; 5829 } 5830 } 5831 5832 if (!EnableCondStoresVectorization && NumPredStores) { 5833 reportVectorizationFailure("There are conditional stores.", 5834 "store that is conditionally executed prevents vectorization", 5835 "ConditionalStore", ORE, TheLoop); 5836 Width = 1; 5837 Cost = ScalarCost; 5838 } 5839 5840 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5841 << "LV: Vectorization seems to be not beneficial, " 5842 << "but was forced by a user.\n"); 5843 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5844 VectorizationFactor Factor = {ElementCount::getFixed(Width), 5845 (unsigned)(Width * Cost)}; 5846 return Factor; 5847 } 5848 5849 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( 5850 const Loop &L, ElementCount VF) const { 5851 // Cross iteration phis such as reductions need special handling and are 5852 // currently unsupported. 5853 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) { 5854 return Legal->isFirstOrderRecurrence(&Phi) || 5855 Legal->isReductionVariable(&Phi); 5856 })) 5857 return false; 5858 5859 // Phis with uses outside of the loop require special handling and are 5860 // currently unsupported. 5861 for (auto &Entry : Legal->getInductionVars()) { 5862 // Look for uses of the value of the induction at the last iteration. 5863 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch()); 5864 for (User *U : PostInc->users()) 5865 if (!L.contains(cast<Instruction>(U))) 5866 return false; 5867 // Look for uses of penultimate value of the induction. 5868 for (User *U : Entry.first->users()) 5869 if (!L.contains(cast<Instruction>(U))) 5870 return false; 5871 } 5872 5873 // Induction variables that are widened require special handling that is 5874 // currently not supported. 5875 if (any_of(Legal->getInductionVars(), [&](auto &Entry) { 5876 return !(this->isScalarAfterVectorization(Entry.first, VF) || 5877 this->isProfitableToScalarize(Entry.first, VF)); 5878 })) 5879 return false; 5880 5881 return true; 5882 } 5883 5884 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5885 const ElementCount VF) const { 5886 // FIXME: We need a much better cost-model to take different parameters such 5887 // as register pressure, code size increase and cost of extra branches into 5888 // account. 
For now we apply a very crude heuristic and only consider loops 5889 // with vectorization factors larger than a certain value. 5890 // We also consider epilogue vectorization unprofitable for targets that don't 5891 // consider interleaving beneficial (eg. MVE). 5892 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) 5893 return false; 5894 if (VF.getFixedValue() >= EpilogueVectorizationMinVF) 5895 return true; 5896 return false; 5897 } 5898 5899 VectorizationFactor 5900 LoopVectorizationCostModel::selectEpilogueVectorizationFactor( 5901 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { 5902 VectorizationFactor Result = VectorizationFactor::Disabled(); 5903 if (!EnableEpilogueVectorization) { 5904 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); 5905 return Result; 5906 } 5907 5908 if (!isScalarEpilogueAllowed()) { 5909 LLVM_DEBUG( 5910 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " 5911 "allowed.\n";); 5912 return Result; 5913 } 5914 5915 // FIXME: This can be fixed for scalable vectors later, because at this stage 5916 // the LoopVectorizer will only consider vectorizing a loop with scalable 5917 // vectors when the loop has a hint to enable vectorization for a given VF. 5918 if (MainLoopVF.isScalable()) { 5919 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not " 5920 "yet supported.\n"); 5921 return Result; 5922 } 5923 5924 // Not really a cost consideration, but check for unsupported cases here to 5925 // simplify the logic. 5926 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { 5927 LLVM_DEBUG( 5928 dbgs() << "LEV: Unable to vectorize epilogue because the loop is " 5929 "not a supported candidate.\n";); 5930 return Result; 5931 } 5932 5933 if (EpilogueVectorizationForceVF > 1) { 5934 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); 5935 if (LVP.hasPlanWithVFs( 5936 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)})) 5937 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0}; 5938 else { 5939 LLVM_DEBUG( 5940 dbgs() 5941 << "LEV: Epilogue vectorization forced factor is not viable.\n";); 5942 return Result; 5943 } 5944 } 5945 5946 if (TheLoop->getHeader()->getParent()->hasOptSize() || 5947 TheLoop->getHeader()->getParent()->hasMinSize()) { 5948 LLVM_DEBUG( 5949 dbgs() 5950 << "LEV: Epilogue vectorization skipped due to opt for size.\n";); 5951 return Result; 5952 } 5953 5954 if (!isEpilogueVectorizationProfitable(MainLoopVF)) 5955 return Result; 5956 5957 for (auto &NextVF : ProfitableVFs) 5958 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) && 5959 (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) && 5960 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width})) 5961 Result = NextVF; 5962 5963 if (Result != VectorizationFactor::Disabled()) 5964 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5965 << Result.Width.getFixedValue() << "\n";); 5966 return Result; 5967 } 5968 5969 std::pair<unsigned, unsigned> 5970 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5971 unsigned MinWidth = -1U; 5972 unsigned MaxWidth = 8; 5973 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5974 5975 // For each block. 5976 for (BasicBlock *BB : TheLoop->blocks()) { 5977 // For each instruction in the loop. 5978 for (Instruction &I : BB->instructionsWithoutDebug()) { 5979 Type *T = I.getType(); 5980 5981 // Skip ignored values. 
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
        if (PreferInLoopReductions ||
            TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                      RdxDesc.getRecurrenceType(),
                                      TargetTransformInfo::ReductionFlags()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    }
  }

  return {MinWidth, MaxWidth};
}

unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // A finite maximum safe dependence distance was already used to limit the
  // VF; do not interleave on top of that.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
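  // For example (hypothetical): with an estimated trip count of 8, which is
  // below TinyTripCountInterleaveThreshold, a vectorized loop (VF > 1) is not
  // interleaved here; a scalar loop that carries a reduction may still be
  // interleaved when InterleaveSmallLoopScalarReduction is set.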
6060 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 6061 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 6062 return 1; 6063 6064 RegisterUsage R = calculateRegisterUsage({VF})[0]; 6065 // We divide by these constants so assume that we have at least one 6066 // instruction that uses at least one register. 6067 for (auto& pair : R.MaxLocalUsers) { 6068 pair.second = std::max(pair.second, 1U); 6069 } 6070 6071 // We calculate the interleave count using the following formula. 6072 // Subtract the number of loop invariants from the number of available 6073 // registers. These registers are used by all of the interleaved instances. 6074 // Next, divide the remaining registers by the number of registers that is 6075 // required by the loop, in order to estimate how many parallel instances 6076 // fit without causing spills. All of this is rounded down if necessary to be 6077 // a power of two. We want power of two interleave count to simplify any 6078 // addressing operations or alignment considerations. 6079 // We also want power of two interleave counts to ensure that the induction 6080 // variable of the vector loop wraps to zero, when tail is folded by masking; 6081 // this currently happens when OptForSize, in which case IC is set to 1 above. 6082 unsigned IC = UINT_MAX; 6083 6084 for (auto& pair : R.MaxLocalUsers) { 6085 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 6086 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 6087 << " registers of " 6088 << TTI.getRegisterClassName(pair.first) << " register class\n"); 6089 if (VF.isScalar()) { 6090 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 6091 TargetNumRegisters = ForceTargetNumScalarRegs; 6092 } else { 6093 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 6094 TargetNumRegisters = ForceTargetNumVectorRegs; 6095 } 6096 unsigned MaxLocalUsers = pair.second; 6097 unsigned LoopInvariantRegs = 0; 6098 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 6099 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 6100 6101 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 6102 // Don't count the induction variable as interleaved. 6103 if (EnableIndVarRegisterHeur) { 6104 TmpIC = 6105 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 6106 std::max(1U, (MaxLocalUsers - 1))); 6107 } 6108 6109 IC = std::min(IC, TmpIC); 6110 } 6111 6112 // Clamp the interleave ranges to reasonable counts. 6113 unsigned MaxInterleaveCount = 6114 TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); 6115 6116 // Check if the user has overridden the max. 6117 if (VF.isScalar()) { 6118 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 6119 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 6120 } else { 6121 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 6122 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 6123 } 6124 6125 // If trip count is known or estimated compile time constant, limit the 6126 // interleave count to be less than the trip count divided by VF, provided it 6127 // is at least 1. 6128 // 6129 // For scalable vectors we can't know if interleaving is beneficial. It may 6130 // not be beneficial for small loops if none of the lanes in the second vector 6131 // iterations is enabled. However, for larger loops, there is likely to be a 6132 // similar benefit as for fixed-width vectors. 
For now, we choose to leave 6133 // the InterleaveCount as if vscale is '1', although if some information about 6134 // the vector is known (e.g. min vector size), we can make a better decision. 6135 if (BestKnownTC) { 6136 MaxInterleaveCount = 6137 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); 6138 // Make sure MaxInterleaveCount is greater than 0. 6139 MaxInterleaveCount = std::max(1u, MaxInterleaveCount); 6140 } 6141 6142 assert(MaxInterleaveCount > 0 && 6143 "Maximum interleave count must be greater than 0"); 6144 6145 // Clamp the calculated IC to be between the 1 and the max interleave count 6146 // that the target and trip count allows. 6147 if (IC > MaxInterleaveCount) 6148 IC = MaxInterleaveCount; 6149 else 6150 // Make sure IC is greater than 0. 6151 IC = std::max(1u, IC); 6152 6153 assert(IC > 0 && "Interleave count must be greater than 0."); 6154 6155 // If we did not calculate the cost for VF (because the user selected the VF) 6156 // then we calculate the cost of VF here. 6157 if (LoopCost == 0) { 6158 assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); 6159 LoopCost = *expectedCost(VF).first.getValue(); 6160 } 6161 6162 assert(LoopCost && "Non-zero loop cost expected"); 6163 6164 // Interleave if we vectorized this loop and there is a reduction that could 6165 // benefit from interleaving. 6166 if (VF.isVector() && HasReductions) { 6167 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 6168 return IC; 6169 } 6170 6171 // Note that if we've already vectorized the loop we will have done the 6172 // runtime check and so interleaving won't require further checks. 6173 bool InterleavingRequiresRuntimePointerCheck = 6174 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 6175 6176 // We want to interleave small loops in order to reduce the loop overhead and 6177 // potentially expose ILP opportunities. 6178 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 6179 << "LV: IC is " << IC << '\n' 6180 << "LV: VF is " << VF << '\n'); 6181 const bool AggressivelyInterleaveReductions = 6182 TTI.enableAggressiveInterleaving(HasReductions); 6183 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 6184 // We assume that the cost overhead is 1 and we use the cost model 6185 // to estimate the cost of the loop and interleave until the cost of the 6186 // loop overhead is about 5% of the cost of the loop. 6187 unsigned SmallIC = 6188 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 6189 6190 // Interleave until store/load ports (estimated by max interleave count) are 6191 // saturated. 6192 unsigned NumStores = Legal->getNumStores(); 6193 unsigned NumLoads = Legal->getNumLoads(); 6194 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 6195 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 6196 6197 // If we have a scalar reduction (vector reductions are already dealt with 6198 // by this point), we can increase the critical path length if the loop 6199 // we're interleaving is inside another loop. Limit, by default to 2, so the 6200 // critical path only gets increased by one reduction operation. 
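    // Worked example (illustrative numbers only): with SmallLoopCost = 20 and
    // a loop cost of 4, PowerOf2Floor(20 / 4) = 4 above, so SmallIC is at most
    // 4; with 2 stores and 1 load, StoresIC = IC / 2 and LoadsIC = IC; and if
    // this loop is nested and has a scalar reduction, all three are clamped
    // below to MaxNestedScalarReductionIC.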
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multimap that holds the list of
  // intervals that *end* at a specific location. This multimap allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multimap, then we remove it from the set. The max register
  // usage is the maximum size of the set. We also search for instructions that
  // are defined outside the loop, but are used inside the loop. We need this
  // number separately from the max-interval usage number because, when we
  // unroll, loop-invariant values do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
6274 SmallPtrSet<Instruction *, 8> Ends; 6275 // Saves the list of values that are used in the loop but are 6276 // defined outside the loop, such as arguments and constants. 6277 SmallPtrSet<Value *, 8> LoopInvariants; 6278 6279 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6280 for (Instruction &I : BB->instructionsWithoutDebug()) { 6281 IdxToInstr.push_back(&I); 6282 6283 // Save the end location of each USE. 6284 for (Value *U : I.operands()) { 6285 auto *Instr = dyn_cast<Instruction>(U); 6286 6287 // Ignore non-instruction values such as arguments, constants, etc. 6288 if (!Instr) 6289 continue; 6290 6291 // If this instruction is outside the loop then record it and continue. 6292 if (!TheLoop->contains(Instr)) { 6293 LoopInvariants.insert(Instr); 6294 continue; 6295 } 6296 6297 // Overwrite previous end points. 6298 EndPoint[Instr] = IdxToInstr.size(); 6299 Ends.insert(Instr); 6300 } 6301 } 6302 } 6303 6304 // Saves the list of intervals that end with the index in 'key'. 6305 using InstrList = SmallVector<Instruction *, 2>; 6306 DenseMap<unsigned, InstrList> TransposeEnds; 6307 6308 // Transpose the EndPoints to a list of values that end at each index. 6309 for (auto &Interval : EndPoint) 6310 TransposeEnds[Interval.second].push_back(Interval.first); 6311 6312 SmallPtrSet<Instruction *, 8> OpenIntervals; 6313 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 6314 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 6315 6316 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 6317 6318 // A lambda that gets the register usage for the given type and VF. 6319 const auto &TTICapture = TTI; 6320 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) { 6321 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 6322 return 0U; 6323 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 6324 }; 6325 6326 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 6327 Instruction *I = IdxToInstr[i]; 6328 6329 // Remove all of the instructions that end at this location. 6330 InstrList &List = TransposeEnds[i]; 6331 for (Instruction *ToRemove : List) 6332 OpenIntervals.erase(ToRemove); 6333 6334 // Ignore instructions that are never used within the loop. 6335 if (!Ends.count(I)) 6336 continue; 6337 6338 // Skip ignored values. 6339 if (ValuesToIgnore.count(I)) 6340 continue; 6341 6342 // For each VF find the maximum usage of registers. 6343 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 6344 // Count the number of live intervals. 6345 SmallMapVector<unsigned, unsigned, 4> RegUsage; 6346 6347 if (VFs[j].isScalar()) { 6348 for (auto Inst : OpenIntervals) { 6349 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6350 if (RegUsage.find(ClassID) == RegUsage.end()) 6351 RegUsage[ClassID] = 1; 6352 else 6353 RegUsage[ClassID] += 1; 6354 } 6355 } else { 6356 collectUniformsAndScalars(VFs[j]); 6357 for (auto Inst : OpenIntervals) { 6358 // Skip ignored values for VF > 1. 
6359 if (VecValuesToIgnore.count(Inst)) 6360 continue; 6361 if (isScalarAfterVectorization(Inst, VFs[j])) { 6362 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 6363 if (RegUsage.find(ClassID) == RegUsage.end()) 6364 RegUsage[ClassID] = 1; 6365 else 6366 RegUsage[ClassID] += 1; 6367 } else { 6368 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 6369 if (RegUsage.find(ClassID) == RegUsage.end()) 6370 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 6371 else 6372 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 6373 } 6374 } 6375 } 6376 6377 for (auto& pair : RegUsage) { 6378 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 6379 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 6380 else 6381 MaxUsages[j][pair.first] = pair.second; 6382 } 6383 } 6384 6385 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 6386 << OpenIntervals.size() << '\n'); 6387 6388 // Add the current instruction to the list of open intervals. 6389 OpenIntervals.insert(I); 6390 } 6391 6392 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 6393 SmallMapVector<unsigned, unsigned, 4> Invariant; 6394 6395 for (auto Inst : LoopInvariants) { 6396 unsigned Usage = 6397 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 6398 unsigned ClassID = 6399 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); 6400 if (Invariant.find(ClassID) == Invariant.end()) 6401 Invariant[ClassID] = Usage; 6402 else 6403 Invariant[ClassID] += Usage; 6404 } 6405 6406 LLVM_DEBUG({ 6407 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 6408 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 6409 << " item\n"; 6410 for (const auto &pair : MaxUsages[i]) { 6411 dbgs() << "LV(REG): RegisterClass: " 6412 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6413 << " registers\n"; 6414 } 6415 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 6416 << " item\n"; 6417 for (const auto &pair : Invariant) { 6418 dbgs() << "LV(REG): RegisterClass: " 6419 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 6420 << " registers\n"; 6421 } 6422 }); 6423 6424 RU.LoopInvariantRegs = Invariant; 6425 RU.MaxLocalUsers = MaxUsages[i]; 6426 RUs[i] = RU; 6427 } 6428 6429 return RUs; 6430 } 6431 6432 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 6433 // TODO: Cost model for emulated masked load/store is completely 6434 // broken. This hack guides the cost model to use an artificially 6435 // high enough value to practically disable vectorization with such 6436 // operations, except where previously deployed legality hack allowed 6437 // using very low cost values. This is to avoid regressions coming simply 6438 // from moving "masked load/store" check from legality to cost model. 6439 // Masked Load/Gather emulation was previously never allowed. 6440 // Limited number of Masked Store/Scatter emulation was allowed. 6441 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 6442 return isa<LoadInst>(I) || 6443 (isa<StoreInst>(I) && 6444 NumPredStores > NumberOfStoresToPredicate); 6445 } 6446 6447 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 6448 // If we aren't vectorizing the loop, or if we've already collected the 6449 // instructions to scalarize, there's nothing to do. Collection may already 6450 // have occurred if we have a user-selected VF and are now computing the 6451 // expected cost for interleaving. 
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply the discount logic if the hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
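    // For instance (hypothetical IR): in a predicated block containing
    //   %a = add i32 %x, 1
    //   %d = udiv i32 %y, %a
    // the add has a single in-block use and is not uniform, so it can be
    // folded into the scalarized chain rooted at the udiv; an operand that is
    // uniform after vectorization stops the walk right here instead.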
6521 for (Use &U : I->operands()) 6522 if (auto *J = dyn_cast<Instruction>(U.get())) 6523 if (isUniformAfterVectorization(J, VF)) 6524 return false; 6525 6526 // Otherwise, we can scalarize the instruction. 6527 return true; 6528 }; 6529 6530 // Compute the expected cost discount from scalarizing the entire expression 6531 // feeding the predicated instruction. We currently only consider expressions 6532 // that are single-use instruction chains. 6533 Worklist.push_back(PredInst); 6534 while (!Worklist.empty()) { 6535 Instruction *I = Worklist.pop_back_val(); 6536 6537 // If we've already analyzed the instruction, there's nothing to do. 6538 if (ScalarCosts.find(I) != ScalarCosts.end()) 6539 continue; 6540 6541 // Compute the cost of the vector instruction. Note that this cost already 6542 // includes the scalarization overhead of the predicated instruction. 6543 InstructionCost VectorCost = getInstructionCost(I, VF).first; 6544 6545 // Compute the cost of the scalarized instruction. This cost is the cost of 6546 // the instruction as if it wasn't if-converted and instead remained in the 6547 // predicated block. We will scale this cost by block probability after 6548 // computing the scalarization overhead. 6549 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6550 InstructionCost ScalarCost = 6551 VF.getKnownMinValue() * 6552 getInstructionCost(I, ElementCount::getFixed(1)).first; 6553 6554 // Compute the scalarization overhead of needed insertelement instructions 6555 // and phi nodes. 6556 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 6557 ScalarCost += TTI.getScalarizationOverhead( 6558 cast<VectorType>(ToVectorTy(I->getType(), VF)), 6559 APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); 6560 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6561 ScalarCost += 6562 VF.getKnownMinValue() * 6563 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); 6564 } 6565 6566 // Compute the scalarization overhead of needed extractelement 6567 // instructions. For each of the instruction's operands, if the operand can 6568 // be scalarized, add it to the worklist; otherwise, account for the 6569 // overhead. 6570 for (Use &U : I->operands()) 6571 if (auto *J = dyn_cast<Instruction>(U.get())) { 6572 assert(VectorType::isValidElementType(J->getType()) && 6573 "Instruction has non-scalar type"); 6574 if (canBeScalarized(J)) 6575 Worklist.push_back(J); 6576 else if (needsExtract(J, VF)) { 6577 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6578 ScalarCost += TTI.getScalarizationOverhead( 6579 cast<VectorType>(ToVectorTy(J->getType(), VF)), 6580 APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); 6581 } 6582 } 6583 6584 // Scale the total scalar cost by block probability. 6585 ScalarCost /= getReciprocalPredBlockProb(); 6586 6587 // Compute the discount. A non-negative discount means the vector version 6588 // of the instruction costs more, and scalarizing would be beneficial. 6589 Discount += VectorCost - ScalarCost; 6590 ScalarCosts[I] = ScalarCost; 6591 } 6592 6593 return *Discount.getValue(); 6594 } 6595 6596 LoopVectorizationCostModel::VectorizationCostTy 6597 LoopVectorizationCostModel::expectedCost(ElementCount VF) { 6598 VectorizationCostTy Cost; 6599 6600 // For each block. 6601 for (BasicBlock *BB : TheLoop->blocks()) { 6602 VectorizationCostTy BlockCost; 6603 6604 // For each instruction in the old loop. 6605 for (Instruction &I : BB->instructionsWithoutDebug()) { 6606 // Skip ignored values. 
6607 if (ValuesToIgnore.count(&I) || 6608 (VF.isVector() && VecValuesToIgnore.count(&I))) 6609 continue; 6610 6611 VectorizationCostTy C = getInstructionCost(&I, VF); 6612 6613 // Check if we should override the cost. 6614 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 6615 C.first = InstructionCost(ForceTargetInstructionCost); 6616 6617 BlockCost.first += C.first; 6618 BlockCost.second |= C.second; 6619 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6620 << " for VF " << VF << " For instruction: " << I 6621 << '\n'); 6622 } 6623 6624 // If we are vectorizing a predicated block, it will have been 6625 // if-converted. This means that the block's instructions (aside from 6626 // stores and instructions that may divide by zero) will now be 6627 // unconditionally executed. For the scalar case, we may not always execute 6628 // the predicated block, if it is an if-else block. Thus, scale the block's 6629 // cost by the probability of executing it. blockNeedsPredication from 6630 // Legal is used so as to not include all blocks in tail folded loops. 6631 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6632 BlockCost.first /= getReciprocalPredBlockProb(); 6633 6634 Cost.first += BlockCost.first; 6635 Cost.second |= BlockCost.second; 6636 } 6637 6638 return Cost; 6639 } 6640 6641 /// Gets Address Access SCEV after verifying that the access pattern 6642 /// is loop invariant except the induction variable dependence. 6643 /// 6644 /// This SCEV can be sent to the Target in order to estimate the address 6645 /// calculation cost. 6646 static const SCEV *getAddressAccessSCEV( 6647 Value *Ptr, 6648 LoopVectorizationLegality *Legal, 6649 PredicatedScalarEvolution &PSE, 6650 const Loop *TheLoop) { 6651 6652 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6653 if (!Gep) 6654 return nullptr; 6655 6656 // We are looking for a gep with all loop invariant indices except for one 6657 // which should be an induction variable. 6658 auto SE = PSE.getSE(); 6659 unsigned NumOperands = Gep->getNumOperands(); 6660 for (unsigned i = 1; i < NumOperands; ++i) { 6661 Value *Opd = Gep->getOperand(i); 6662 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6663 !Legal->isInductionVariable(Opd)) 6664 return nullptr; 6665 } 6666 6667 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6668 return PSE.getSCEV(Ptr); 6669 } 6670 6671 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 6672 return Legal->hasStride(I->getOperand(0)) || 6673 Legal->hasStride(I->getOperand(1)); 6674 } 6675 6676 InstructionCost 6677 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6678 ElementCount VF) { 6679 assert(VF.isVector() && 6680 "Scalarization cost of instruction implies vectorization."); 6681 assert(!VF.isScalable() && "scalable vectors not yet supported."); 6682 Type *ValTy = getMemInstValueType(I); 6683 auto SE = PSE.getSE(); 6684 6685 unsigned AS = getLoadStoreAddressSpace(I); 6686 Value *Ptr = getLoadStorePointerOperand(I); 6687 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6688 6689 // Figure out whether the access is strided and get the stride value 6690 // if it's known in compile time 6691 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6692 6693 // Get the cost of the scalar memory instruction and address computation. 
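  // Rough shape of the result (illustrative only, actual numbers come from
  // TTI): for VF = 4 this is 4 * C_addr + 4 * C_mem plus the insert/extract
  // overhead, and for a predicated access the total is then divided by the
  // reciprocal block probability from getReciprocalPredBlockProb().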
6694 InstructionCost Cost = 6695 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6696 6697 // Don't pass *I here, since it is scalar but will actually be part of a 6698 // vectorized loop where the user of it is a vectorized instruction. 6699 const Align Alignment = getLoadStoreAlignment(I); 6700 Cost += VF.getKnownMinValue() * 6701 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 6702 AS, TTI::TCK_RecipThroughput); 6703 6704 // Get the overhead of the extractelement and insertelement instructions 6705 // we might create due to scalarization. 6706 Cost += getScalarizationOverhead(I, VF); 6707 6708 // If we have a predicated store, it may not be executed for each vector 6709 // lane. Scale the cost by the probability of executing the predicated 6710 // block. 6711 if (isPredicatedInst(I)) { 6712 Cost /= getReciprocalPredBlockProb(); 6713 6714 if (useEmulatedMaskMemRefHack(I)) 6715 // Artificially setting to a high enough value to practically disable 6716 // vectorization with such operations. 6717 Cost = 3000000; 6718 } 6719 6720 return Cost; 6721 } 6722 6723 InstructionCost 6724 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6725 ElementCount VF) { 6726 Type *ValTy = getMemInstValueType(I); 6727 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6728 Value *Ptr = getLoadStorePointerOperand(I); 6729 unsigned AS = getLoadStoreAddressSpace(I); 6730 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 6731 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6732 6733 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6734 "Stride should be 1 or -1 for consecutive memory access"); 6735 const Align Alignment = getLoadStoreAlignment(I); 6736 InstructionCost Cost = 0; 6737 if (Legal->isMaskRequired(I)) 6738 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6739 CostKind); 6740 else 6741 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6742 CostKind, I); 6743 6744 bool Reverse = ConsecutiveStride < 0; 6745 if (Reverse) 6746 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 6747 return Cost; 6748 } 6749 6750 InstructionCost 6751 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6752 ElementCount VF) { 6753 assert(Legal->isUniformMemOp(*I)); 6754 6755 Type *ValTy = getMemInstValueType(I); 6756 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6757 const Align Alignment = getLoadStoreAlignment(I); 6758 unsigned AS = getLoadStoreAddressSpace(I); 6759 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6760 if (isa<LoadInst>(I)) { 6761 return TTI.getAddressComputationCost(ValTy) + 6762 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6763 CostKind) + 6764 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6765 } 6766 StoreInst *SI = cast<StoreInst>(I); 6767 6768 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 6769 return TTI.getAddressComputationCost(ValTy) + 6770 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6771 CostKind) + 6772 (isLoopInvariantStoreValue 6773 ? 
0
                 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                          VF.getKnownMinValue() - 1));
}

InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}

InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  }
  return Cost;
}

InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  // Early exit if there are no in-loop reductions.
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return InstructionCost::getInvalid();
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for one of the following patterns, choosing the minimal
  // acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return an invalid cost specifying that the original cost
  // method should be used.
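  // Example of the most profitable pattern (illustrative IR only):
  //   %ae = sext i16 %a to i32
  //   %be = sext i16 %b to i32
  //   %m  = mul i32 %ae, %be
  //   %r  = add i32 %m, %red.phi   ; in-loop reduction add
  // Starting from either extend, the walk below reaches %r; if the target
  // reports a cheaper extended multiply-add reduction than the sum of the
  // individual sext/mul/add costs, the whole cost is attributed to the
  // reduction instruction and the other pattern members report 0.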
6855 Instruction *RetI = I; 6856 if ((RetI->getOpcode() == Instruction::SExt || 6857 RetI->getOpcode() == Instruction::ZExt)) { 6858 if (!RetI->hasOneUser()) 6859 return InstructionCost::getInvalid(); 6860 RetI = RetI->user_back(); 6861 } 6862 if (RetI->getOpcode() == Instruction::Mul && 6863 RetI->user_back()->getOpcode() == Instruction::Add) { 6864 if (!RetI->hasOneUser()) 6865 return InstructionCost::getInvalid(); 6866 RetI = RetI->user_back(); 6867 } 6868 6869 // Test if the found instruction is a reduction, and if not return an invalid 6870 // cost specifying the parent to use the original cost modelling. 6871 if (!InLoopReductionImmediateChains.count(RetI)) 6872 return InstructionCost::getInvalid(); 6873 6874 // Find the reduction this chain is a part of and calculate the basic cost of 6875 // the reduction on its own. 6876 Instruction *LastChain = InLoopReductionImmediateChains[RetI]; 6877 Instruction *ReductionPhi = LastChain; 6878 while (!isa<PHINode>(ReductionPhi)) 6879 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; 6880 6881 RecurrenceDescriptor RdxDesc = 6882 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; 6883 unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(), 6884 VectorTy, false, CostKind); 6885 6886 // Get the operand that was not the reduction chain and match it to one of the 6887 // patterns, returning the better cost if it is found. 6888 Instruction *RedOp = RetI->getOperand(1) == LastChain 6889 ? dyn_cast<Instruction>(RetI->getOperand(0)) 6890 : dyn_cast<Instruction>(RetI->getOperand(1)); 6891 6892 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6893 6894 if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) && 6895 !TheLoop->isLoopInvariant(RedOp)) { 6896 bool IsUnsigned = isa<ZExtInst>(RedOp); 6897 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6898 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6899 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6900 CostKind); 6901 6902 unsigned ExtCost = 6903 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6904 TTI::CastContextHint::None, CostKind, RedOp); 6905 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6906 return I == RetI ? *RedCost.getValue() : 0; 6907 } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) { 6908 Instruction *Mul = RedOp; 6909 Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0)); 6910 Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1)); 6911 if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) && 6912 Op0->getOpcode() == Op1->getOpcode() && 6913 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6914 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6915 bool IsUnsigned = isa<ZExtInst>(Op0); 6916 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6917 // reduce(mul(ext, ext)) 6918 unsigned ExtCost = 6919 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, 6920 TTI::CastContextHint::None, CostKind, Op0); 6921 unsigned MulCost = 6922 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6923 6924 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6925 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6926 CostKind); 6927 6928 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) 6929 return I == RetI ? 
*RedCost.getValue() : 0; 6930 } else { 6931 unsigned MulCost = 6932 TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); 6933 6934 InstructionCost RedCost = TTI.getExtendedAddReductionCost( 6935 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy, 6936 CostKind); 6937 6938 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6939 return I == RetI ? *RedCost.getValue() : 0; 6940 } 6941 } 6942 6943 return I == RetI ? BaseCost : InstructionCost::getInvalid(); 6944 } 6945 6946 InstructionCost 6947 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6948 ElementCount VF) { 6949 // Calculate scalar cost only. Vectorization cost should be ready at this 6950 // moment. 6951 if (VF.isScalar()) { 6952 Type *ValTy = getMemInstValueType(I); 6953 const Align Alignment = getLoadStoreAlignment(I); 6954 unsigned AS = getLoadStoreAddressSpace(I); 6955 6956 return TTI.getAddressComputationCost(ValTy) + 6957 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6958 TTI::TCK_RecipThroughput, I); 6959 } 6960 return getWideningCost(I, VF); 6961 } 6962 6963 LoopVectorizationCostModel::VectorizationCostTy 6964 LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6965 ElementCount VF) { 6966 // If we know that this instruction will remain uniform, check the cost of 6967 // the scalar version. 6968 if (isUniformAfterVectorization(I, VF)) 6969 VF = ElementCount::getFixed(1); 6970 6971 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6972 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6973 6974 // Forced scalars do not have any scalarization overhead. 6975 auto ForcedScalar = ForcedScalars.find(VF); 6976 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6977 auto InstSet = ForcedScalar->second; 6978 if (InstSet.count(I)) 6979 return VectorizationCostTy( 6980 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6981 VF.getKnownMinValue()), 6982 false); 6983 } 6984 6985 Type *VectorTy; 6986 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6987 6988 bool TypeNotScalarized = 6989 VF.isVector() && VectorTy->isVectorTy() && 6990 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); 6991 return VectorizationCostTy(C, TypeNotScalarized); 6992 } 6993 6994 InstructionCost 6995 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 6996 ElementCount VF) { 6997 6998 assert(!VF.isScalable() && 6999 "cannot compute scalarization overhead for scalable vectorization"); 7000 if (VF.isScalar()) 7001 return 0; 7002 7003 InstructionCost Cost = 0; 7004 Type *RetTy = ToVectorTy(I->getType(), VF); 7005 if (!RetTy->isVoidTy() && 7006 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 7007 Cost += TTI.getScalarizationOverhead( 7008 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), 7009 true, false); 7010 7011 // Some targets keep addresses scalar. 7012 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 7013 return Cost; 7014 7015 // Some targets support efficient element stores. 7016 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 7017 return Cost; 7018 7019 // Collect operands to consider. 7020 CallInst *CI = dyn_cast<CallInst>(I); 7021 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 7022 7023 // Skip operands that do not require extraction/scalarization and do not incur 7024 // any overhead. 
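  // Illustrative sketch (added commentary, not part of the original source):
  // with VF = 4, scalarizing an instruction typically pays for four
  // insertelements to rebuild its vector result and, for every vector operand
  // that must be read per lane, four extractelements, e.g.
  //   %e0 = extractelement <4 x i32> %op, i32 0
  //   %r0 = insertelement <4 x i32> %acc, i32 %s0, i32 0
  // The exact numbers come from the TTI scalarization hooks used here.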
7025 return Cost + TTI.getOperandsScalarizationOverhead( 7026 filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); 7027 } 7028 7029 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 7030 if (VF.isScalar()) 7031 return; 7032 NumPredStores = 0; 7033 for (BasicBlock *BB : TheLoop->blocks()) { 7034 // For each instruction in the old loop. 7035 for (Instruction &I : *BB) { 7036 Value *Ptr = getLoadStorePointerOperand(&I); 7037 if (!Ptr) 7038 continue; 7039 7040 // TODO: We should generate better code and update the cost model for 7041 // predicated uniform stores. Today they are treated as any other 7042 // predicated store (see added test cases in 7043 // invariant-store-vectorization.ll). 7044 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 7045 NumPredStores++; 7046 7047 if (Legal->isUniformMemOp(I)) { 7048 // TODO: Avoid replicating loads and stores instead of 7049 // relying on instcombine to remove them. 7050 // Load: Scalar load + broadcast 7051 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 7052 InstructionCost Cost = getUniformMemOpCost(&I, VF); 7053 setWideningDecision(&I, VF, CM_Scalarize, Cost); 7054 continue; 7055 } 7056 7057 // We assume that widening is the best solution when possible. 7058 if (memoryInstructionCanBeWidened(&I, VF)) { 7059 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 7060 int ConsecutiveStride = 7061 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 7062 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 7063 "Expected consecutive stride."); 7064 InstWidening Decision = 7065 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 7066 setWideningDecision(&I, VF, Decision, Cost); 7067 continue; 7068 } 7069 7070 // Choose between Interleaving, Gather/Scatter or Scalarization. 7071 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 7072 unsigned NumAccesses = 1; 7073 if (isAccessInterleaved(&I)) { 7074 auto Group = getInterleavedAccessGroup(&I); 7075 assert(Group && "Fail to get an interleaved access group."); 7076 7077 // Make one decision for the whole group. 7078 if (getWideningDecision(&I, VF) != CM_Unknown) 7079 continue; 7080 7081 NumAccesses = Group->getNumMembers(); 7082 if (interleavedAccessCanBeWidened(&I, VF)) 7083 InterleaveCost = getInterleaveGroupCost(&I, VF); 7084 } 7085 7086 InstructionCost GatherScatterCost = 7087 isLegalGatherOrScatter(&I) 7088 ? getGatherScatterCost(&I, VF) * NumAccesses 7089 : InstructionCost::getInvalid(); 7090 7091 InstructionCost ScalarizationCost = 7092 !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses 7093 : InstructionCost::getInvalid(); 7094 7095 // Choose better solution for the current VF, 7096 // write down this decision and use it during vectorization. 7097 InstructionCost Cost; 7098 InstWidening Decision; 7099 if (InterleaveCost <= GatherScatterCost && 7100 InterleaveCost < ScalarizationCost) { 7101 Decision = CM_Interleave; 7102 Cost = InterleaveCost; 7103 } else if (GatherScatterCost < ScalarizationCost) { 7104 Decision = CM_GatherScatter; 7105 Cost = GatherScatterCost; 7106 } else { 7107 assert(!VF.isScalable() && 7108 "We cannot yet scalarise for scalable vectors"); 7109 Decision = CM_Scalarize; 7110 Cost = ScalarizationCost; 7111 } 7112 // If the instructions belongs to an interleave group, the whole group 7113 // receives the same decision. The whole group receives the cost, but 7114 // the cost will actually be assigned to one instruction. 
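      // Illustrative example (added commentary, not part of the original
      // source): for an array-of-structs access such as
      //   for (i = 0; i < n; i++) sum += A[i].x + A[i].y;
      // the loads of A[i].x and A[i].y form one interleave group of factor 2;
      // the group-wide cost computed above is recorded on a single member, and
      // every member of the group observes the same widening decision.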
7115 if (auto Group = getInterleavedAccessGroup(&I)) 7116 setWideningDecision(Group, VF, Decision, Cost); 7117 else 7118 setWideningDecision(&I, VF, Decision, Cost); 7119 } 7120 } 7121 7122 // Make sure that any load of address and any other address computation 7123 // remains scalar unless there is gather/scatter support. This avoids 7124 // inevitable extracts into address registers, and also has the benefit of 7125 // activating LSR more, since that pass can't optimize vectorized 7126 // addresses. 7127 if (TTI.prefersVectorizedAddressing()) 7128 return; 7129 7130 // Start with all scalar pointer uses. 7131 SmallPtrSet<Instruction *, 8> AddrDefs; 7132 for (BasicBlock *BB : TheLoop->blocks()) 7133 for (Instruction &I : *BB) { 7134 Instruction *PtrDef = 7135 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 7136 if (PtrDef && TheLoop->contains(PtrDef) && 7137 getWideningDecision(&I, VF) != CM_GatherScatter) 7138 AddrDefs.insert(PtrDef); 7139 } 7140 7141 // Add all instructions used to generate the addresses. 7142 SmallVector<Instruction *, 4> Worklist; 7143 append_range(Worklist, AddrDefs); 7144 while (!Worklist.empty()) { 7145 Instruction *I = Worklist.pop_back_val(); 7146 for (auto &Op : I->operands()) 7147 if (auto *InstOp = dyn_cast<Instruction>(Op)) 7148 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 7149 AddrDefs.insert(InstOp).second) 7150 Worklist.push_back(InstOp); 7151 } 7152 7153 for (auto *I : AddrDefs) { 7154 if (isa<LoadInst>(I)) { 7155 // Setting the desired widening decision should ideally be handled in 7156 // by cost functions, but since this involves the task of finding out 7157 // if the loaded register is involved in an address computation, it is 7158 // instead changed here when we know this is the case. 7159 InstWidening Decision = getWideningDecision(I, VF); 7160 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 7161 // Scalarize a widened load of address. 7162 setWideningDecision( 7163 I, VF, CM_Scalarize, 7164 (VF.getKnownMinValue() * 7165 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 7166 else if (auto Group = getInterleavedAccessGroup(I)) { 7167 // Scalarize an interleave group of address loads. 7168 for (unsigned I = 0; I < Group->getFactor(); ++I) { 7169 if (Instruction *Member = Group->getMember(I)) 7170 setWideningDecision( 7171 Member, VF, CM_Scalarize, 7172 (VF.getKnownMinValue() * 7173 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 7174 } 7175 } 7176 } else 7177 // Make sure I gets scalarized and a cost estimate without 7178 // scalarization overhead. 7179 ForcedScalars[VF].insert(I); 7180 } 7181 } 7182 7183 InstructionCost 7184 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 7185 Type *&VectorTy) { 7186 Type *RetTy = I->getType(); 7187 if (canTruncateToMinimalBitwidth(I, VF)) 7188 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 7189 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 7190 auto SE = PSE.getSE(); 7191 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 7192 7193 // TODO: We need to estimate the cost of intrinsic calls. 7194 switch (I->getOpcode()) { 7195 case Instruction::GetElementPtr: 7196 // We mark this instruction as zero-cost because the cost of GEPs in 7197 // vectorized code depends on whether the corresponding memory instruction 7198 // is scalarized or not. Therefore, we handle GEPs with the memory 7199 // instruction cost. 
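    // Illustrative note (added commentary, not part of the original source):
    // for
    //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
    //   %v   = load i32, i32* %gep
    // a widened load keeps a single address computation while a scalarized or
    // gathered load replicates it; in either case that cost is already folded
    // into the memory-op cost (e.g. via getAddressComputationCost), so
    // charging the GEP again here would double-count it.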
7200 return 0; 7201 case Instruction::Br: { 7202 // In cases of scalarized and predicated instructions, there will be VF 7203 // predicated blocks in the vectorized loop. Each branch around these 7204 // blocks requires also an extract of its vector compare i1 element. 7205 bool ScalarPredicatedBB = false; 7206 BranchInst *BI = cast<BranchInst>(I); 7207 if (VF.isVector() && BI->isConditional() && 7208 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) || 7209 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1)))) 7210 ScalarPredicatedBB = true; 7211 7212 if (ScalarPredicatedBB) { 7213 // Return cost for branches around scalarized and predicated blocks. 7214 assert(!VF.isScalable() && "scalable vectors not yet supported."); 7215 auto *Vec_i1Ty = 7216 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 7217 return (TTI.getScalarizationOverhead( 7218 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), 7219 false, true) + 7220 (TTI.getCFInstrCost(Instruction::Br, CostKind) * 7221 VF.getKnownMinValue())); 7222 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 7223 // The back-edge branch will remain, as will all scalar branches. 7224 return TTI.getCFInstrCost(Instruction::Br, CostKind); 7225 else 7226 // This branch will be eliminated by if-conversion. 7227 return 0; 7228 // Note: We currently assume zero cost for an unconditional branch inside 7229 // a predicated block since it will become a fall-through, although we 7230 // may decide in the future to call TTI for all branches. 7231 } 7232 case Instruction::PHI: { 7233 auto *Phi = cast<PHINode>(I); 7234 7235 // First-order recurrences are replaced by vector shuffles inside the loop. 7236 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 7237 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) 7238 return TTI.getShuffleCost( 7239 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy), 7240 VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); 7241 7242 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 7243 // converted into select instructions. We require N - 1 selects per phi 7244 // node, where N is the number of incoming values. 7245 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 7246 return (Phi->getNumIncomingValues() - 1) * 7247 TTI.getCmpSelInstrCost( 7248 Instruction::Select, ToVectorTy(Phi->getType(), VF), 7249 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 7250 CmpInst::BAD_ICMP_PREDICATE, CostKind); 7251 7252 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 7253 } 7254 case Instruction::UDiv: 7255 case Instruction::SDiv: 7256 case Instruction::URem: 7257 case Instruction::SRem: 7258 // If we have a predicated instruction, it may not be executed for each 7259 // vector lane. Get the scalarization cost and scale this amount by the 7260 // probability of executing the predicated block. If the instruction is not 7261 // predicated, we fall through to the next case. 7262 if (VF.isVector() && isScalarWithPredication(I)) { 7263 InstructionCost Cost = 0; 7264 7265 // These instructions have a non-void type, so account for the phi nodes 7266 // that we will create. This cost is likely to be zero. The phi node 7267 // cost, if any, should be scaled by the block probability because it 7268 // models a copy at the end of each predicated block. 7269 Cost += VF.getKnownMinValue() * 7270 TTI.getCFInstrCost(Instruction::PHI, CostKind); 7271 7272 // The cost of the non-predicated instruction. 
7273 Cost += VF.getKnownMinValue() * 7274 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); 7275 7276 // The cost of insertelement and extractelement instructions needed for 7277 // scalarization. 7278 Cost += getScalarizationOverhead(I, VF); 7279 7280 // Scale the cost by the probability of executing the predicated blocks. 7281 // This assumes the predicated block for each vector lane is equally 7282 // likely. 7283 return Cost / getReciprocalPredBlockProb(); 7284 } 7285 LLVM_FALLTHROUGH; 7286 case Instruction::Add: 7287 case Instruction::FAdd: 7288 case Instruction::Sub: 7289 case Instruction::FSub: 7290 case Instruction::Mul: 7291 case Instruction::FMul: 7292 case Instruction::FDiv: 7293 case Instruction::FRem: 7294 case Instruction::Shl: 7295 case Instruction::LShr: 7296 case Instruction::AShr: 7297 case Instruction::And: 7298 case Instruction::Or: 7299 case Instruction::Xor: { 7300 // Since we will replace the stride by 1 the multiplication should go away. 7301 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 7302 return 0; 7303 7304 // Detect reduction patterns 7305 InstructionCost RedCost; 7306 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7307 .isValid()) 7308 return RedCost; 7309 7310 // Certain instructions can be cheaper to vectorize if they have a constant 7311 // second vector operand. One example of this are shifts on x86. 7312 Value *Op2 = I->getOperand(1); 7313 TargetTransformInfo::OperandValueProperties Op2VP; 7314 TargetTransformInfo::OperandValueKind Op2VK = 7315 TTI.getOperandInfo(Op2, Op2VP); 7316 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 7317 Op2VK = TargetTransformInfo::OK_UniformValue; 7318 7319 SmallVector<const Value *, 4> Operands(I->operand_values()); 7320 unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; 7321 return N * TTI.getArithmeticInstrCost( 7322 I->getOpcode(), VectorTy, CostKind, 7323 TargetTransformInfo::OK_AnyValue, 7324 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 7325 } 7326 case Instruction::FNeg: { 7327 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 7328 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; 7329 return N * TTI.getArithmeticInstrCost( 7330 I->getOpcode(), VectorTy, CostKind, 7331 TargetTransformInfo::OK_AnyValue, 7332 TargetTransformInfo::OK_AnyValue, 7333 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 7334 I->getOperand(0), I); 7335 } 7336 case Instruction::Select: { 7337 SelectInst *SI = cast<SelectInst>(I); 7338 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7339 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7340 Type *CondTy = SI->getCondition()->getType(); 7341 if (!ScalarCond) 7342 CondTy = VectorType::get(CondTy, VF); 7343 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, 7344 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7345 } 7346 case Instruction::ICmp: 7347 case Instruction::FCmp: { 7348 Type *ValTy = I->getOperand(0)->getType(); 7349 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7350 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7351 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7352 VectorTy = ToVectorTy(ValTy, VF); 7353 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7354 CmpInst::BAD_ICMP_PREDICATE, CostKind, I); 7355 } 7356 case Instruction::Store: 7357 case Instruction::Load: { 7358 ElementCount Width = VF; 7359 if (Width.isVector()) { 7360 InstWidening Decision = getWideningDecision(I, Width); 7361 assert(Decision != CM_Unknown && 7362 "CM decision should be taken at this point"); 7363 if (Decision == CM_Scalarize) 7364 Width = ElementCount::getFixed(1); 7365 } 7366 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 7367 return getMemoryInstructionCost(I, VF); 7368 } 7369 case Instruction::ZExt: 7370 case Instruction::SExt: 7371 case Instruction::FPToUI: 7372 case Instruction::FPToSI: 7373 case Instruction::FPExt: 7374 case Instruction::PtrToInt: 7375 case Instruction::IntToPtr: 7376 case Instruction::SIToFP: 7377 case Instruction::UIToFP: 7378 case Instruction::Trunc: 7379 case Instruction::FPTrunc: 7380 case Instruction::BitCast: { 7381 // Computes the CastContextHint from a Load/Store instruction. 7382 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7383 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7384 "Expected a load or a store!"); 7385 7386 if (VF.isScalar() || !TheLoop->contains(I)) 7387 return TTI::CastContextHint::Normal; 7388 7389 switch (getWideningDecision(I, VF)) { 7390 case LoopVectorizationCostModel::CM_GatherScatter: 7391 return TTI::CastContextHint::GatherScatter; 7392 case LoopVectorizationCostModel::CM_Interleave: 7393 return TTI::CastContextHint::Interleave; 7394 case LoopVectorizationCostModel::CM_Scalarize: 7395 case LoopVectorizationCostModel::CM_Widen: 7396 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7397 : TTI::CastContextHint::Normal; 7398 case LoopVectorizationCostModel::CM_Widen_Reverse: 7399 return TTI::CastContextHint::Reversed; 7400 case LoopVectorizationCostModel::CM_Unknown: 7401 llvm_unreachable("Instr did not go through cost modelling?"); 7402 } 7403 7404 llvm_unreachable("Unhandled case!"); 7405 }; 7406 7407 unsigned Opcode = I->getOpcode(); 7408 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7409 // For Trunc, the context is the only user, which must be a StoreInst. 
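    // Illustrative example (added commentary, not part of the original
    // source):
    //   %t = trunc i32 %v to i8
    //   store i8 %t, i8* %p
    // Here the truncate is effectively part of a narrowing store, so its cast
    // cost is queried with a context hint (Normal, Masked, Reversed, ...)
    // derived from the widening decision already taken for that store.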
7410 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7411 if (I->hasOneUse()) 7412 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7413 CCH = ComputeCCH(Store); 7414 } 7415 // For Z/Sext, the context is the operand, which must be a LoadInst. 7416 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7417 Opcode == Instruction::FPExt) { 7418 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7419 CCH = ComputeCCH(Load); 7420 } 7421 7422 // We optimize the truncation of induction variables having constant 7423 // integer steps. The cost of these truncations is the same as the scalar 7424 // operation. 7425 if (isOptimizableIVTruncate(I, VF)) { 7426 auto *Trunc = cast<TruncInst>(I); 7427 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7428 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7429 } 7430 7431 // Detect reduction patterns 7432 InstructionCost RedCost; 7433 if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7434 .isValid()) 7435 return RedCost; 7436 7437 Type *SrcScalarTy = I->getOperand(0)->getType(); 7438 Type *SrcVecTy = 7439 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7440 if (canTruncateToMinimalBitwidth(I, VF)) { 7441 // This cast is going to be shrunk. This may remove the cast or it might 7442 // turn it into slightly different cast. For example, if MinBW == 16, 7443 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7444 // 7445 // Calculate the modified src and dest types. 7446 Type *MinVecTy = VectorTy; 7447 if (Opcode == Instruction::Trunc) { 7448 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7449 VectorTy = 7450 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7451 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7452 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 7453 VectorTy = 7454 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7455 } 7456 } 7457 7458 unsigned N; 7459 if (isScalarAfterVectorization(I, VF)) { 7460 assert(!VF.isScalable() && "VF is assumed to be non scalable"); 7461 N = VF.getKnownMinValue(); 7462 } else 7463 N = 1; 7464 return N * 7465 TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7466 } 7467 case Instruction::Call: { 7468 bool NeedToScalarize; 7469 CallInst *CI = cast<CallInst>(I); 7470 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 7471 if (getVectorIntrinsicIDForCall(CI, TLI)) { 7472 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 7473 return std::min(CallCost, IntrinsicCost); 7474 } 7475 return CallCost; 7476 } 7477 case Instruction::ExtractValue: 7478 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7479 default: 7480 // The cost of executing VF copies of the scalar instruction. This opcode 7481 // is unknown. Assume that it is the same as 'mul'. 7482 return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( 7483 Instruction::Mul, VectorTy, CostKind) + 7484 getScalarizationOverhead(I, VF); 7485 } // end of switch. 
7486 } 7487 7488 char LoopVectorize::ID = 0; 7489 7490 static const char lv_name[] = "Loop Vectorization"; 7491 7492 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 7493 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 7494 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 7495 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 7496 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 7497 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 7498 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 7499 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 7500 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 7501 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 7502 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 7503 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 7504 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 7505 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 7506 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 7507 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 7508 7509 namespace llvm { 7510 7511 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 7512 7513 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 7514 bool VectorizeOnlyWhenForced) { 7515 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 7516 } 7517 7518 } // end namespace llvm 7519 7520 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 7521 // Check if the pointer operand of a load or store instruction is 7522 // consecutive. 7523 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 7524 return Legal->isConsecutivePtr(Ptr); 7525 return false; 7526 } 7527 7528 void LoopVectorizationCostModel::collectValuesToIgnore() { 7529 // Ignore ephemeral values. 7530 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7531 7532 // Ignore type-promoting instructions we identified during reduction 7533 // detection. 7534 for (auto &Reduction : Legal->getReductionVars()) { 7535 RecurrenceDescriptor &RedDes = Reduction.second; 7536 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7537 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7538 } 7539 // Ignore type-casting instructions we identified during induction 7540 // detection. 7541 for (auto &Induction : Legal->getInductionVars()) { 7542 InductionDescriptor &IndDes = Induction.second; 7543 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7544 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7545 } 7546 } 7547 7548 void LoopVectorizationCostModel::collectInLoopReductions() { 7549 for (auto &Reduction : Legal->getReductionVars()) { 7550 PHINode *Phi = Reduction.first; 7551 RecurrenceDescriptor &RdxDesc = Reduction.second; 7552 7553 // We don't collect reductions that are type promoted (yet). 7554 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7555 continue; 7556 7557 // If the target would prefer this reduction to happen "in-loop", then we 7558 // want to record it as such. 7559 unsigned Opcode = RdxDesc.getOpcode(); 7560 if (!PreferInLoopReductions && 7561 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7562 TargetTransformInfo::ReductionFlags())) 7563 continue; 7564 7565 // Check that we can correctly put the reductions into the loop, by 7566 // finding the chain of operations that leads from the phi to the loop 7567 // exit value. 
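    // Illustrative example (added commentary, not part of the original
    // source): for a plain integer sum
    //   %phi = phi i32 [ 0, %preheader ], [ %sum, %latch ]
    //   %sum = add i32 %phi, %val
    // the chain returned below would be { %sum }. An empty chain means the
    // reduction cannot safely be performed in-loop and it stays out-of-loop.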
7568 SmallVector<Instruction *, 4> ReductionOperations = 7569 RdxDesc.getReductionOpChain(Phi, TheLoop); 7570 bool InLoop = !ReductionOperations.empty(); 7571 if (InLoop) { 7572 InLoopReductionChains[Phi] = ReductionOperations; 7573 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7574 Instruction *LastChain = Phi; 7575 for (auto *I : ReductionOperations) { 7576 InLoopReductionImmediateChains[I] = LastChain; 7577 LastChain = I; 7578 } 7579 } 7580 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop") 7581 << " reduction for phi: " << *Phi << "\n"); 7582 } 7583 } 7584 7585 // TODO: we could return a pair of values that specify the max VF and 7586 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7587 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7588 // doesn't have a cost model that can choose which plan to execute if 7589 // more than one is generated. 7590 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 7591 LoopVectorizationCostModel &CM) { 7592 unsigned WidestType; 7593 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7594 return WidestVectorRegBits / WidestType; 7595 } 7596 7597 VectorizationFactor 7598 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7599 assert(!UserVF.isScalable() && "scalable vectors not yet supported"); 7600 ElementCount VF = UserVF; 7601 // Outer loop handling: They may require CFG and instruction level 7602 // transformations before even evaluating whether vectorization is profitable. 7603 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7604 // the vectorization pipeline. 7605 if (!OrigLoop->isInnermost()) { 7606 // If the user doesn't provide a vectorization factor, determine a 7607 // reasonable one. 7608 if (UserVF.isZero()) { 7609 VF = ElementCount::getFixed( 7610 determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM)); 7611 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7612 7613 // Make sure we have a VF > 1 for stress testing. 7614 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7615 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7616 << "overriding computed VF.\n"); 7617 VF = ElementCount::getFixed(4); 7618 } 7619 } 7620 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7621 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7622 "VF needs to be a power of two"); 7623 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7624 << "VF " << VF << " to build VPlans.\n"); 7625 buildVPlans(VF, VF); 7626 7627 // For VPlan build stress testing, we bail out after VPlan construction. 7628 if (VPlanBuildStressTest) 7629 return VectorizationFactor::Disabled(); 7630 7631 return {VF, 0 /*Cost*/}; 7632 } 7633 7634 LLVM_DEBUG( 7635 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 7636 "VPlan-native path.\n"); 7637 return VectorizationFactor::Disabled(); 7638 } 7639 7640 Optional<VectorizationFactor> 7641 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7642 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7643 Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); 7644 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 7645 return None; 7646 7647 // Invalidate interleave groups if all blocks of loop will be predicated. 
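  // Illustrative note (added commentary, not part of the original source):
  // when the tail is folded by masking, even the header executes under a
  // predicate, so interleaved accesses would have to become *masked*
  // interleaved accesses. If the target does not support those, it is better
  // to drop the groups now than to carry decisions that codegen cannot honor.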
7648 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 7649 !useMaskedInterleavedAccesses(*TTI)) { 7650 LLVM_DEBUG( 7651 dbgs() 7652 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7653 "which requires masked-interleaved support.\n"); 7654 if (CM.InterleaveInfo.invalidateGroups()) 7655 // Invalidating interleave groups also requires invalidating all decisions 7656 // based on them, which includes widening decisions and uniform and scalar 7657 // values. 7658 CM.invalidateCostModelingDecisions(); 7659 } 7660 7661 ElementCount MaxVF = MaybeMaxVF.getValue(); 7662 assert(MaxVF.isNonZero() && "MaxVF is zero."); 7663 7664 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF); 7665 if (!UserVF.isZero() && 7666 (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) { 7667 // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable 7668 // VFs here, this should be reverted to only use legal UserVFs once the 7669 // loop below supports scalable VFs. 7670 ElementCount VF = UserVFIsLegal ? UserVF : MaxVF; 7671 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") 7672 << " VF " << VF << ".\n"); 7673 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7674 "VF needs to be a power of two"); 7675 // Collect the instructions (and their associated costs) that will be more 7676 // profitable to scalarize. 7677 CM.selectUserVectorizationFactor(VF); 7678 CM.collectInLoopReductions(); 7679 buildVPlansWithVPRecipes(VF, VF); 7680 LLVM_DEBUG(printPlans(dbgs())); 7681 return {{VF, 0}}; 7682 } 7683 7684 assert(!MaxVF.isScalable() && 7685 "Scalable vectors not yet supported beyond this point"); 7686 7687 for (ElementCount VF = ElementCount::getFixed(1); 7688 ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { 7689 // Collect Uniform and Scalar instructions after vectorization with VF. 7690 CM.collectUniformsAndScalars(VF); 7691 7692 // Collect the instructions (and their associated costs) that will be more 7693 // profitable to scalarize. 7694 if (VF.isVector()) 7695 CM.collectInstsToScalarize(VF); 7696 } 7697 7698 CM.collectInLoopReductions(); 7699 7700 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); 7701 LLVM_DEBUG(printPlans(dbgs())); 7702 if (MaxVF.isScalar()) 7703 return VectorizationFactor::Disabled(); 7704 7705 // Select the optimal vectorization factor. 7706 return CM.selectVectorizationFactor(MaxVF); 7707 } 7708 7709 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { 7710 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 7711 << '\n'); 7712 BestVF = VF; 7713 BestUF = UF; 7714 7715 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 7716 return !Plan->hasVF(VF); 7717 }); 7718 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 7719 } 7720 7721 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 7722 DominatorTree *DT) { 7723 // Perform the actual loop transformation. 7724 7725 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 
7726 VPCallbackILV CallbackILV(ILV); 7727 7728 assert(BestVF.hasValue() && "Vectorization Factor is missing"); 7729 7730 VPTransformState State{*BestVF, 7731 BestUF, 7732 OrigLoop, 7733 LI, 7734 DT, 7735 ILV.Builder, 7736 ILV.VectorLoopValueMap, 7737 &ILV, 7738 CallbackILV}; 7739 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 7740 State.TripCount = ILV.getOrCreateTripCount(nullptr); 7741 State.CanonicalIV = ILV.Induction; 7742 7743 ILV.printDebugTracesAtStart(); 7744 7745 //===------------------------------------------------===// 7746 // 7747 // Notice: any optimization or new instruction that go 7748 // into the code below should also be implemented in 7749 // the cost-model. 7750 // 7751 //===------------------------------------------------===// 7752 7753 // 2. Copy and widen instructions from the old loop into the new loop. 7754 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 7755 VPlans.front()->execute(&State); 7756 7757 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7758 // predication, updating analyses. 7759 ILV.fixVectorizedLoop(); 7760 7761 ILV.printDebugTracesAtEnd(); 7762 } 7763 7764 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 7765 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7766 7767 // We create new control-flow for the vectorized loop, so the original exit 7768 // conditions will be dead after vectorization if it's only used by the 7769 // terminator 7770 SmallVector<BasicBlock*> ExitingBlocks; 7771 OrigLoop->getExitingBlocks(ExitingBlocks); 7772 for (auto *BB : ExitingBlocks) { 7773 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0)); 7774 if (!Cmp || !Cmp->hasOneUse()) 7775 continue; 7776 7777 // TODO: we should introduce a getUniqueExitingBlocks on Loop 7778 if (!DeadInstructions.insert(Cmp).second) 7779 continue; 7780 7781 // The operands of the icmp is often a dead trunc, used by IndUpdate. 7782 // TODO: can recurse through operands in general 7783 for (Value *Op : Cmp->operands()) { 7784 if (isa<TruncInst>(Op) && Op->hasOneUse()) 7785 DeadInstructions.insert(cast<Instruction>(Op)); 7786 } 7787 } 7788 7789 // We create new "steps" for induction variable updates to which the original 7790 // induction variables map. An original update instruction will be dead if 7791 // all its users except the induction variable are dead. 7792 auto *Latch = OrigLoop->getLoopLatch(); 7793 for (auto &Induction : Legal->getInductionVars()) { 7794 PHINode *Ind = Induction.first; 7795 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 7796 7797 // If the tail is to be folded by masking, the primary induction variable, 7798 // if exists, isn't dead: it will be used for masking. Don't kill it. 7799 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction()) 7800 continue; 7801 7802 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 7803 return U == Ind || DeadInstructions.count(cast<Instruction>(U)); 7804 })) 7805 DeadInstructions.insert(IndUpdate); 7806 7807 // We record as "Dead" also the type-casting instructions we had identified 7808 // during induction analysis. We don't need any handling for them in the 7809 // vectorized loop because we have proven that, under a proper runtime 7810 // test guarding the vectorized loop, the value of the phi, and the casted 7811 // value of the phi, are the same. 
The last instruction in this casting chain 7812 // will get its scalar/vector/widened def from the scalar/vector/widened def 7813 // of the respective phi node. Any other casts in the induction def-use chain 7814 // have no other uses outside the phi update chain, and will be ignored. 7815 InductionDescriptor &IndDes = Induction.second; 7816 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7817 DeadInstructions.insert(Casts.begin(), Casts.end()); 7818 } 7819 } 7820 7821 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 7822 7823 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 7824 7825 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 7826 Instruction::BinaryOps BinOp) { 7827 // When unrolling and the VF is 1, we only need to add a simple scalar. 7828 Type *Ty = Val->getType(); 7829 assert(!Ty->isVectorTy() && "Val must be a scalar"); 7830 7831 if (Ty->isFloatingPointTy()) { 7832 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 7833 7834 // Floating point operations had to be 'fast' to enable the unrolling. 7835 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 7836 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 7837 } 7838 Constant *C = ConstantInt::get(Ty, StartIdx); 7839 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 7840 } 7841 7842 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7843 SmallVector<Metadata *, 4> MDs; 7844 // Reserve first location for self reference to the LoopID metadata node. 7845 MDs.push_back(nullptr); 7846 bool IsUnrollMetadata = false; 7847 MDNode *LoopID = L->getLoopID(); 7848 if (LoopID) { 7849 // First find existing loop unrolling disable metadata. 7850 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7851 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7852 if (MD) { 7853 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7854 IsUnrollMetadata = 7855 S && S->getString().startswith("llvm.loop.unroll.disable"); 7856 } 7857 MDs.push_back(LoopID->getOperand(i)); 7858 } 7859 } 7860 7861 if (!IsUnrollMetadata) { 7862 // Add runtime unroll disable metadata. 7863 LLVMContext &Context = L->getHeader()->getContext(); 7864 SmallVector<Metadata *, 1> DisableOperands; 7865 DisableOperands.push_back( 7866 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7867 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7868 MDs.push_back(DisableNode); 7869 MDNode *NewLoopID = MDNode::get(Context, MDs); 7870 // Set operand 0 to refer to the loop id itself. 7871 NewLoopID->replaceOperandWith(0, NewLoopID); 7872 L->setLoopID(NewLoopID); 7873 } 7874 } 7875 7876 //===--------------------------------------------------------------------===// 7877 // EpilogueVectorizerMainLoop 7878 //===--------------------------------------------------------------------===// 7879 7880 /// This function is partially responsible for generating the control flow 7881 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7882 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { 7883 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7884 Loop *Lp = createVectorLoopSkeleton(""); 7885 7886 // Generate the code to check the minimum iteration count of the vector 7887 // epilogue (see below). 
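  // Rough sketch (added commentary, not part of the original source; the
  // epilogue-vectorization diagram in the LLVM Vectorizers documentation is
  // authoritative): this first pass emits, in order, the "iter.check" guarding
  // the epilogue VF, the SCEV and memory runtime checks, and the
  // "vector.main.loop.iter.check" guarding VF * UF of the main loop, each
  // branching to the scalar preheader on failure; those branch targets are
  // rewired when the epilogue loop itself is vectorized in the second pass.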
  EPI.EpilogueIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  BasicBlock *SavedPreHeader = LoopVectorPreHeader;
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // If a safety check was generated, save it.
  if (SavedPreHeader != LoopVectorPreHeader)
    EPI.SCEVSafetyCheck = SavedPreHeader;

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  SavedPreHeader = LoopVectorPreHeader;
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // If a safety check was generated, save/overwrite it.
  if (SavedPreHeader != LoopVectorPreHeader)
    EPI.MemSafetyCheck = SavedPreHeader;

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  EPI.VectorTripCount = CountRoundDown;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Skip induction resume value creation here; the resume values will be
  // created in the second pass. If we created them here, they wouldn't be used
  // anyway, because the vplan in the second pass still contains the inductions
  // from the original loop.

  return completeLoopSkeleton(Lp, OrigLoopID);
}

void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
           << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
           << ", Main Loop UF:" << EPI.MainLoopUF
           << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
  });
}

BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
    Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
  assert(L && "Expected valid Loop.");
  assert(Bypass && "Expected valid bypass basic block.");
  unsigned VFactor =
      ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
7965 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7966 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7967 7968 // Generate code to check if the loop's trip count is less than VF * UF of the 7969 // main vector loop. 7970 auto P = 7971 Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; 7972 7973 Value *CheckMinIters = Builder.CreateICmp( 7974 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor), 7975 "min.iters.check"); 7976 7977 if (!ForEpilogue) 7978 TCCheckBlock->setName("vector.main.loop.iter.check"); 7979 7980 // Create new preheader for vector loop. 7981 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7982 DT, LI, nullptr, "vector.ph"); 7983 7984 if (ForEpilogue) { 7985 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7986 DT->getNode(Bypass)->getIDom()) && 7987 "TC check is expected to dominate Bypass"); 7988 7989 // Update dominator for Bypass & LoopExit. 7990 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7991 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7992 7993 LoopBypassBlocks.push_back(TCCheckBlock); 7994 7995 // Save the trip count so we don't have to regenerate it in the 7996 // vec.epilog.iter.check. This is safe to do because the trip count 7997 // generated here dominates the vector epilog iter check. 7998 EPI.TripCount = Count; 7999 } 8000 8001 ReplaceInstWithInst( 8002 TCCheckBlock->getTerminator(), 8003 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 8004 8005 return TCCheckBlock; 8006 } 8007 8008 //===--------------------------------------------------------------------===// 8009 // EpilogueVectorizerEpilogueLoop 8010 //===--------------------------------------------------------------------===// 8011 8012 /// This function is partially responsible for generating the control flow 8013 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 8014 BasicBlock * 8015 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { 8016 MDNode *OrigLoopID = OrigLoop->getLoopID(); 8017 Loop *Lp = createVectorLoopSkeleton("vec.epilog."); 8018 8019 // Now, compare the remaining count and if there aren't enough iterations to 8020 // execute the vectorized epilogue skip to the scalar part. 8021 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; 8022 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); 8023 LoopVectorPreHeader = 8024 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 8025 LI, nullptr, "vec.epilog.ph"); 8026 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader, 8027 VecEpilogueIterationCountCheck); 8028 8029 // Adjust the control flow taking the state info from the main loop 8030 // vectorization into account. 
8031 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && 8032 "expected this to be saved from the previous pass."); 8033 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( 8034 VecEpilogueIterationCountCheck, LoopVectorPreHeader); 8035 8036 DT->changeImmediateDominator(LoopVectorPreHeader, 8037 EPI.MainLoopIterationCountCheck); 8038 8039 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( 8040 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8041 8042 if (EPI.SCEVSafetyCheck) 8043 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( 8044 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8045 if (EPI.MemSafetyCheck) 8046 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( 8047 VecEpilogueIterationCountCheck, LoopScalarPreHeader); 8048 8049 DT->changeImmediateDominator( 8050 VecEpilogueIterationCountCheck, 8051 VecEpilogueIterationCountCheck->getSinglePredecessor()); 8052 8053 DT->changeImmediateDominator(LoopScalarPreHeader, 8054 EPI.EpilogueIterationCountCheck); 8055 DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck); 8056 8057 // Keep track of bypass blocks, as they feed start values to the induction 8058 // phis in the scalar loop preheader. 8059 if (EPI.SCEVSafetyCheck) 8060 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); 8061 if (EPI.MemSafetyCheck) 8062 LoopBypassBlocks.push_back(EPI.MemSafetyCheck); 8063 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); 8064 8065 // Generate a resume induction for the vector epilogue and put it in the 8066 // vector epilogue preheader 8067 Type *IdxTy = Legal->getWidestInductionType(); 8068 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val", 8069 LoopVectorPreHeader->getFirstNonPHI()); 8070 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck); 8071 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0), 8072 EPI.MainLoopIterationCountCheck); 8073 8074 // Generate the induction variable. 8075 OldInduction = Legal->getPrimaryInduction(); 8076 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 8077 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); 8078 Value *StartIdx = EPResumeVal; 8079 Induction = 8080 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 8081 getDebugLocFromInstOrOperands(OldInduction)); 8082 8083 // Generate induction resume values. These variables save the new starting 8084 // indexes for the scalar loop. They are used to test if there are any tail 8085 // iterations left once the vector loop has completed. 8086 // Note that when the vectorized epilogue is skipped due to iteration count 8087 // check, then the resume value for the induction variable comes from 8088 // the trip count of the main vector loop, hence passing the AdditionalBypass 8089 // argument. 
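  // Illustrative note (added commentary, not part of the original source): if,
  // say, the main loop covered a multiple of VF * UF = 8 iterations and fewer
  // than the epilogue's VF remain, the scalar loop must resume from the main
  // loop's vector trip count rather than from 0; that is what the
  // AdditionalBypass pair passed below provides.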
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "Saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  auto P =
      Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, Count,
      ConstantInt::get(Count->getType(),
                       EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
      "min.epilog.iters.check");

  ReplaceInstWithInst(
      Insert->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  LoopBypassBlocks.push_back(Insert);
  return Insert;
}

void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
           << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
           << ", Main Loop UF:" << EPI.MainLoopUF
           << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
  });
}

bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
  assert(!Range.isEmpty() && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);

  for (ElementCount TmpVF = Range.Start * 2;
       ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}

/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VFs starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
                                           ElementCount MaxVF) {
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(buildVPlan(SubRange));
    VF = SubRange.End;
  }
}

VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
8182 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 8183 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 8184 if (ECEntryIt != EdgeMaskCache.end()) 8185 return ECEntryIt->second; 8186 8187 VPValue *SrcMask = createBlockInMask(Src, Plan); 8188 8189 // The terminator has to be a branch inst! 8190 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 8191 assert(BI && "Unexpected terminator found"); 8192 8193 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 8194 return EdgeMaskCache[Edge] = SrcMask; 8195 8196 // If source is an exiting block, we know the exit edge is dynamically dead 8197 // in the vector loop, and thus we don't need to restrict the mask. Avoid 8198 // adding uses of an otherwise potentially dead instruction. 8199 if (OrigLoop->isLoopExiting(Src)) 8200 return EdgeMaskCache[Edge] = SrcMask; 8201 8202 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition()); 8203 assert(EdgeMask && "No Edge Mask found for condition"); 8204 8205 if (BI->getSuccessor(0) != Dst) 8206 EdgeMask = Builder.createNot(EdgeMask); 8207 8208 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 8209 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 8210 8211 return EdgeMaskCache[Edge] = EdgeMask; 8212 } 8213 8214 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 8215 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 8216 8217 // Look for cached value. 8218 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 8219 if (BCEntryIt != BlockMaskCache.end()) 8220 return BCEntryIt->second; 8221 8222 // All-one mask is modelled as no-mask following the convention for masked 8223 // load/store/gather/scatter. Initialize BlockMask to no-mask. 8224 VPValue *BlockMask = nullptr; 8225 8226 if (OrigLoop->getHeader() == BB) { 8227 if (!CM.blockNeedsPredication(BB)) 8228 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 8229 8230 // Create the block in mask as the first non-phi instruction in the block. 8231 VPBuilder::InsertPointGuard Guard(Builder); 8232 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); 8233 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); 8234 8235 // Introduce the early-exit compare IV <= BTC to form header block mask. 8236 // This is used instead of IV < TC because TC may wrap, unlike BTC. 8237 // Start by constructing the desired canonical IV. 8238 VPValue *IV = nullptr; 8239 if (Legal->getPrimaryInduction()) 8240 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); 8241 else { 8242 auto IVRecipe = new VPWidenCanonicalIVRecipe(); 8243 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); 8244 IV = IVRecipe->getVPValue(); 8245 } 8246 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 8247 bool TailFolded = !CM.isScalarEpilogueAllowed(); 8248 8249 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) { 8250 // While ActiveLaneMask is a binary op that consumes the loop tripcount 8251 // as a second argument, we only pass the IV here and extract the 8252 // tripcount from the transform state where codegen of the VP instructions 8253 // happen. 8254 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV}); 8255 } else { 8256 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 8257 } 8258 return BlockMaskCache[BB] = BlockMask; 8259 } 8260 8261 // This is the block mask. We OR all incoming edges. 
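  // Illustrative example (added commentary, not part of the original source):
  // for a block BB with in-loop predecessors P1 and P2,
  //   BlockMask(BB) = EdgeMask(P1 -> BB) | EdgeMask(P2 -> BB)
  // and a null (all-one) edge mask short-circuits the block mask to all-one,
  // following the same no-mask convention used above.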
8262 for (auto *Predecessor : predecessors(BB)) { 8263 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 8264 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 8265 return BlockMaskCache[BB] = EdgeMask; 8266 8267 if (!BlockMask) { // BlockMask has its initialized nullptr value. 8268 BlockMask = EdgeMask; 8269 continue; 8270 } 8271 8272 BlockMask = Builder.createOr(BlockMask, EdgeMask); 8273 } 8274 8275 return BlockMaskCache[BB] = BlockMask; 8276 } 8277 8278 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 8279 VPlanPtr &Plan) { 8280 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 8281 "Must be called with either a load or store"); 8282 8283 auto willWiden = [&](ElementCount VF) -> bool { 8284 if (VF.isScalar()) 8285 return false; 8286 LoopVectorizationCostModel::InstWidening Decision = 8287 CM.getWideningDecision(I, VF); 8288 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 8289 "CM decision should be taken at this point."); 8290 if (Decision == LoopVectorizationCostModel::CM_Interleave) 8291 return true; 8292 if (CM.isScalarAfterVectorization(I, VF) || 8293 CM.isProfitableToScalarize(I, VF)) 8294 return false; 8295 return Decision != LoopVectorizationCostModel::CM_Scalarize; 8296 }; 8297 8298 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8299 return nullptr; 8300 8301 VPValue *Mask = nullptr; 8302 if (Legal->isMaskRequired(I)) 8303 Mask = createBlockInMask(I->getParent(), Plan); 8304 8305 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 8306 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 8307 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 8308 8309 StoreInst *Store = cast<StoreInst>(I); 8310 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 8311 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 8312 } 8313 8314 VPWidenIntOrFpInductionRecipe * 8315 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { 8316 // Check if this is an integer or fp induction. If so, build the recipe that 8317 // produces its scalar and vector values. 8318 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 8319 if (II.getKind() == InductionDescriptor::IK_IntInduction || 8320 II.getKind() == InductionDescriptor::IK_FpInduction) { 8321 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8322 return new VPWidenIntOrFpInductionRecipe(Phi, Start); 8323 } 8324 8325 return nullptr; 8326 } 8327 8328 VPWidenIntOrFpInductionRecipe * 8329 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range, 8330 VPlan &Plan) const { 8331 // Optimize the special case where the source is a constant integer 8332 // induction variable. Notice that we can only optimize the 'trunc' case 8333 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8334 // (c) other casts depend on pointer size. 8335 8336 // Determine whether \p K is a truncation based on an induction variable that 8337 // can be optimized. 
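// For example (illustrative, assuming a 64-bit primary induction):
//   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
//   %t  = trunc i64 %iv to i32
// can be widened as a single <VF x i32> induction instead of a wide i64
// induction followed by a vector truncate, for every VF the cost model
// accepts.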
8338 auto isOptimizableIVTruncate = 8339 [&](Instruction *K) -> std::function<bool(ElementCount)> { 8340 return [=](ElementCount VF) -> bool { 8341 return CM.isOptimizableIVTruncate(K, VF); 8342 }; 8343 }; 8344 8345 if (LoopVectorizationPlanner::getDecisionAndClampRange( 8346 isOptimizableIVTruncate(I), Range)) { 8347 8348 InductionDescriptor II = 8349 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); 8350 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); 8351 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 8352 Start, I); 8353 } 8354 return nullptr; 8355 } 8356 8357 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) { 8358 // We know that all PHIs in non-header blocks are converted into selects, so 8359 // we don't have to worry about the insertion order and we can just use the 8360 // builder. At this point we generate the predication tree. There may be 8361 // duplications since this is a simple recursive scan, but future 8362 // optimizations will clean it up. 8363 8364 SmallVector<VPValue *, 2> Operands; 8365 unsigned NumIncoming = Phi->getNumIncomingValues(); 8366 for (unsigned In = 0; In < NumIncoming; In++) { 8367 VPValue *EdgeMask = 8368 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 8369 assert((EdgeMask || NumIncoming == 1) && 8370 "Multiple predecessors with one having a full mask"); 8371 Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In))); 8372 if (EdgeMask) 8373 Operands.push_back(EdgeMask); 8374 } 8375 return new VPBlendRecipe(Phi, Operands); 8376 } 8377 8378 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range, 8379 VPlan &Plan) const { 8380 8381 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8382 [this, CI](ElementCount VF) { 8383 return CM.isScalarWithPredication(CI, VF); 8384 }, 8385 Range); 8386 8387 if (IsPredicated) 8388 return nullptr; 8389 8390 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8391 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 8392 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || 8393 ID == Intrinsic::pseudoprobe || 8394 ID == Intrinsic::experimental_noalias_scope_decl)) 8395 return nullptr; 8396 8397 auto willWiden = [&](ElementCount VF) -> bool { 8398 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 8399 // The following case may be scalarized depending on the VF. 8400 // The flag shows whether we use Intrinsic or a usual Call for vectorized 8401 // version of the instruction. 8402 // Is it beneficial to perform intrinsic call compared to lib call? 8403 bool NeedToScalarize = false; 8404 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 8405 InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; 8406 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; 8407 assert(IntrinsicCost.isValid() && CallCost.isValid() && 8408 "Cannot have invalid costs while widening"); 8409 return UseVectorIntrinsic || !NeedToScalarize; 8410 }; 8411 8412 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 8413 return nullptr; 8414 8415 return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands())); 8416 } 8417 8418 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { 8419 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && 8420 !isa<StoreInst>(I) && "Instruction should have been handled earlier"); 8421 // Instruction should be widened, unless it is scalar after vectorization, 8422 // scalarization is profitable or it is predicated. 8423 auto WillScalarize = [this, I](ElementCount VF) -> bool { 8424 return CM.isScalarAfterVectorization(I, VF) || 8425 CM.isProfitableToScalarize(I, VF) || 8426 CM.isScalarWithPredication(I, VF); 8427 }; 8428 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, 8429 Range); 8430 } 8431 8432 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { 8433 auto IsVectorizableOpcode = [](unsigned Opcode) { 8434 switch (Opcode) { 8435 case Instruction::Add: 8436 case Instruction::And: 8437 case Instruction::AShr: 8438 case Instruction::BitCast: 8439 case Instruction::FAdd: 8440 case Instruction::FCmp: 8441 case Instruction::FDiv: 8442 case Instruction::FMul: 8443 case Instruction::FNeg: 8444 case Instruction::FPExt: 8445 case Instruction::FPToSI: 8446 case Instruction::FPToUI: 8447 case Instruction::FPTrunc: 8448 case Instruction::FRem: 8449 case Instruction::FSub: 8450 case Instruction::ICmp: 8451 case Instruction::IntToPtr: 8452 case Instruction::LShr: 8453 case Instruction::Mul: 8454 case Instruction::Or: 8455 case Instruction::PtrToInt: 8456 case Instruction::SDiv: 8457 case Instruction::Select: 8458 case Instruction::SExt: 8459 case Instruction::Shl: 8460 case Instruction::SIToFP: 8461 case Instruction::SRem: 8462 case Instruction::Sub: 8463 case Instruction::Trunc: 8464 case Instruction::UDiv: 8465 case Instruction::UIToFP: 8466 case Instruction::URem: 8467 case Instruction::Xor: 8468 case Instruction::ZExt: 8469 return true; 8470 } 8471 return false; 8472 }; 8473 8474 if (!IsVectorizableOpcode(I->getOpcode())) 8475 return nullptr; 8476 8477 // Success: widen this instruction. 8478 return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); 8479 } 8480 8481 VPBasicBlock *VPRecipeBuilder::handleReplication( 8482 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 8483 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 8484 VPlanPtr &Plan) { 8485 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 8486 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, 8487 Range); 8488 8489 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 8490 [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); }, 8491 Range); 8492 8493 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), 8494 IsUniform, IsPredicated); 8495 setRecipe(I, Recipe); 8496 Plan->addVPValue(I, Recipe); 8497 8498 // Find if I uses a predicated instruction. If so, it will use its scalar 8499 // value. Avoid hoisting the insert-element which packs the scalar value into 8500 // a vector value, as that happens iff all users use the vector value. 
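// For instance (illustrative): if a predicated %div is used by this
// replicated recipe, the replica reads %div's per-lane scalar values
// directly, so packing %div's results into a vector is no longer required and
// its AlsoPack flag is cleared below.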
8501 for (auto &Op : I->operands()) 8502 if (auto *PredInst = dyn_cast<Instruction>(Op)) 8503 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 8504 PredInst2Recipe[PredInst]->setAlsoPack(false); 8505 8506 // Finalize the recipe for Instr, first if it is not predicated. 8507 if (!IsPredicated) { 8508 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 8509 VPBB->appendRecipe(Recipe); 8510 return VPBB; 8511 } 8512 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 8513 assert(VPBB->getSuccessors().empty() && 8514 "VPBB has successors when handling predicated replication."); 8515 // Record predicated instructions for above packing optimizations. 8516 PredInst2Recipe[I] = Recipe; 8517 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 8518 VPBlockUtils::insertBlockAfter(Region, VPBB); 8519 auto *RegSucc = new VPBasicBlock(); 8520 VPBlockUtils::insertBlockAfter(RegSucc, Region); 8521 return RegSucc; 8522 } 8523 8524 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 8525 VPRecipeBase *PredRecipe, 8526 VPlanPtr &Plan) { 8527 // Instructions marked for predication are replicated and placed under an 8528 // if-then construct to prevent side-effects. 8529 8530 // Generate recipes to compute the block mask for this region. 8531 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 8532 8533 // Build the triangular if-then region. 8534 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 8535 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 8536 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 8537 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 8538 auto *PHIRecipe = Instr->getType()->isVoidTy() 8539 ? nullptr 8540 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr)); 8541 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 8542 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 8543 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 8544 8545 // Note: first set Entry as region entry and then connect successors starting 8546 // from it in order, to propagate the "parent" of each VPBasicBlock. 8547 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 8548 VPBlockUtils::connectBlocks(Pred, Exit); 8549 8550 return Region; 8551 } 8552 8553 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, 8554 VFRange &Range, 8555 VPlanPtr &Plan) { 8556 // First, check for specific widening recipes that deal with calls, memory 8557 // operations, inductions and Phi nodes. 
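// As a concrete illustration (simplified, names arbitrary): a loop body such
// as
//   %l = load i32, i32* %gep.b   ; handled by tryToWidenMemory
//   %a = add i32 %l, %x          ; handled by tryToWiden
//   store i32 %a, i32* %gep.a    ; handled by tryToWidenMemory
// is mapped below, the i32 induction phi becomes a
// VPWidenIntOrFpInductionRecipe, and anything that fails every check returns
// nullptr here and is replicated by handleReplication() in the caller.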
8558 if (auto *CI = dyn_cast<CallInst>(Instr))
8559 return tryToWidenCall(CI, Range, *Plan);
8560
8561 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8562 return tryToWidenMemory(Instr, Range, Plan);
8563
8564 VPRecipeBase *Recipe;
8565 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8566 if (Phi->getParent() != OrigLoop->getHeader())
8567 return tryToBlend(Phi, Plan);
8568 if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
8569 return Recipe;
8570
8571 if (Legal->isReductionVariable(Phi)) {
8572 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8573 VPValue *StartV =
8574 Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
8575 return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
8576 }
8577
8578 return new VPWidenPHIRecipe(Phi);
8579 }
8580
8581 if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8582 cast<TruncInst>(Instr), Range, *Plan)))
8583 return Recipe;
8584
8585 if (!shouldWiden(Instr, Range))
8586 return nullptr;
8587
8588 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8589 return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8590 OrigLoop);
8591
8592 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8593 bool InvariantCond =
8594 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8595 return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8596 InvariantCond);
8597 }
8598
8599 return tryToWiden(Instr, *Plan);
8600 }
8601
8602 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8603 ElementCount MaxVF) {
8604 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8605
8606 // Collect instructions from the original loop that will become trivially dead
8607 // in the vectorized loop. We don't need to vectorize these instructions. For
8608 // example, original induction update instructions can become dead because we
8609 // separately emit induction "steps" when generating code for the new loop.
8610 // Similarly, we create a new latch condition when setting up the structure
8611 // of the new loop, so the old one can become dead.
8612 SmallPtrSet<Instruction *, 4> DeadInstructions;
8613 collectTriviallyDeadInstructions(DeadInstructions);
8614
8615 // Add assume instructions we need to drop to DeadInstructions, to prevent
8616 // them from being added to the VPlan.
8617 // TODO: We only need to drop assumes in blocks that get flattened. If the
8618 // control flow is preserved, we should keep them.
8619 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8620 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8621
8622 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8623 // Dead instructions do not need sinking. Remove them from SinkAfter.
8624 for (Instruction *I : DeadInstructions)
8625 SinkAfter.erase(I);
8626
8627 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8628 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8629 VFRange SubRange = {VF, MaxVFPlusOne};
8630 VPlans.push_back(
8631 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8632 VF = SubRange.End;
8633 }
8634 }
8635
8636 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8637 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8638 const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8639
8640 // Hold a mapping from predicated instructions to their recipes, in order to
8641 // fix their AlsoPack behavior if a user is determined to replicate and use a
8642 // scalar instead of vector value.
8643 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8644
8645 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8646
8647 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8648
8649 // ---------------------------------------------------------------------------
8650 // Pre-construction: record ingredients whose recipes we'll need to further
8651 // process after constructing the initial VPlan.
8652 // ---------------------------------------------------------------------------
8653
8654 // Mark instructions we'll need to sink later and their targets as
8655 // ingredients whose recipe we'll need to record.
8656 for (auto &Entry : SinkAfter) {
8657 RecipeBuilder.recordRecipeOf(Entry.first);
8658 RecipeBuilder.recordRecipeOf(Entry.second);
8659 }
8660 for (auto &Reduction : CM.getInLoopReductionChains()) {
8661 PHINode *Phi = Reduction.first;
8662 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8663 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8664
8665 RecipeBuilder.recordRecipeOf(Phi);
8666 for (auto &R : ReductionOperations) {
8667 RecipeBuilder.recordRecipeOf(R);
8668 // For min/max reductions, where we have a pair of icmp/select, we also
8669 // need to record the ICmp recipe, so it can be removed later.
8670 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8671 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8672 }
8673 }
8674
8675 // For each interleave group which is relevant for this (possibly trimmed)
8676 // Range, add it to the set of groups to be later applied to the VPlan and add
8677 // placeholders for its members' Recipes which we'll be replacing with a
8678 // single VPInterleaveRecipe.
8679 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8680 auto applyIG = [IG, this](ElementCount VF) -> bool {
8681 return (VF.isVector() && // Query is illegal for VF == 1
8682 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8683 LoopVectorizationCostModel::CM_Interleave);
8684 };
8685 if (!getDecisionAndClampRange(applyIG, Range))
8686 continue;
8687 InterleaveGroups.insert(IG);
8688 for (unsigned i = 0; i < IG->getFactor(); i++)
8689 if (Instruction *Member = IG->getMember(i))
8690 RecipeBuilder.recordRecipeOf(Member);
8691 }
8692
8693 // ---------------------------------------------------------------------------
8694 // Build initial VPlan: Scan the body of the loop in a topological order to
8695 // visit each basic block after having visited its predecessor basic blocks.
8696 // ---------------------------------------------------------------------------
8697
8698 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
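// Illustrative shape of the initial VPlan for a single-block loop body
// (simplified): Pre-Entry -> <BB name> [recipes...]; the dummy Pre-Entry
// block only seeds the construction and is disconnected and deleted once the
// scan below finishes.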
8699 auto Plan = std::make_unique<VPlan>();
8700 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8701 Plan->setEntry(VPBB);
8702
8703 // Scan the body of the loop in a topological order to visit each basic block
8704 // after having visited its predecessor basic blocks.
8705 LoopBlocksDFS DFS(OrigLoop);
8706 DFS.perform(LI);
8707
8708 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8709 // Relevant instructions from basic block BB will be grouped into VPRecipe
8710 // ingredients and fill a new VPBasicBlock.
8711 unsigned VPBBsForBB = 0;
8712 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8713 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8714 VPBB = FirstVPBBForBB;
8715 Builder.setInsertPoint(VPBB);
8716
8717 // Introduce each ingredient into VPlan.
8718 // TODO: Model and preserve debug intrinsics in VPlan.
8719 for (Instruction &I : BB->instructionsWithoutDebug()) {
8720 Instruction *Instr = &I;
8721
8722 // First filter out irrelevant instructions, to ensure no recipes are
8723 // built for them.
8724 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8725 continue;
8726
8727 if (auto Recipe =
8728 RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8729 for (auto *Def : Recipe->definedValues()) {
8730 auto *UV = Def->getUnderlyingValue();
8731 Plan->addVPValue(UV, Def);
8732 }
8733
8734 RecipeBuilder.setRecipe(Instr, Recipe);
8735 VPBB->appendRecipe(Recipe);
8736 continue;
8737 }
8738
8739 // Otherwise, if all widening options failed, the instruction is to be
8740 // replicated. This may create a successor for VPBB.
8741 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8742 Instr, Range, VPBB, PredInst2Recipe, Plan);
8743 if (NextVPBB != VPBB) {
8744 VPBB = NextVPBB;
8745 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8746 : "");
8747 }
8748 }
8749 }
8750
8751 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8752 // may also be empty, such as the last one (VPBB), reflecting original
8753 // basic-blocks with no recipes.
8754 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8755 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8756 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8757 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8758 delete PreEntry;
8759
8760 // ---------------------------------------------------------------------------
8761 // Transform initial VPlan: Apply previously taken decisions, in order, to
8762 // bring the VPlan to its final state.
8763 // ---------------------------------------------------------------------------
8764
8765 // Apply Sink-After legal constraints.
8766 for (auto &Entry : SinkAfter) {
8767 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8768 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8769 // If the target is in a replication region, make sure to move Sink to the
8770 // block after it, not into the replication region itself.
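// Illustrative (block names follow createReplicateRegion): if Target sits in
// a triangular region made of pred.udiv.entry, pred.udiv.if and
// pred.udiv.continue, then Sink is moved to the start of the VPBasicBlock
// that follows the whole region, not into any of the region's own blocks.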
8771 if (auto *Region = 8772 dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) { 8773 if (Region->isReplicator()) { 8774 assert(Region->getNumSuccessors() == 1 && "Expected SESE region!"); 8775 VPBasicBlock *NextBlock = 8776 cast<VPBasicBlock>(Region->getSuccessors().front()); 8777 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); 8778 continue; 8779 } 8780 } 8781 Sink->moveAfter(Target); 8782 } 8783 8784 // Interleave memory: for each Interleave Group we marked earlier as relevant 8785 // for this VPlan, replace the Recipes widening its memory instructions with a 8786 // single VPInterleaveRecipe at its insertion point. 8787 for (auto IG : InterleaveGroups) { 8788 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8789 RecipeBuilder.getRecipe(IG->getInsertPos())); 8790 SmallVector<VPValue *, 4> StoredValues; 8791 for (unsigned i = 0; i < IG->getFactor(); ++i) 8792 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) 8793 StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0))); 8794 8795 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8796 Recipe->getMask()); 8797 VPIG->insertBefore(Recipe); 8798 unsigned J = 0; 8799 for (unsigned i = 0; i < IG->getFactor(); ++i) 8800 if (Instruction *Member = IG->getMember(i)) { 8801 if (!Member->getType()->isVoidTy()) { 8802 VPValue *OriginalV = Plan->getVPValue(Member); 8803 Plan->removeVPValueFor(Member); 8804 Plan->addVPValue(Member, VPIG->getVPValue(J)); 8805 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8806 J++; 8807 } 8808 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 8809 } 8810 } 8811 8812 // Adjust the recipes for any inloop reductions. 8813 if (Range.Start.isVector()) 8814 adjustRecipesForInLoopReductions(Plan, RecipeBuilder); 8815 8816 // Finally, if tail is folded by masking, introduce selects between the phi 8817 // and the live-out instruction of each reduction, at the end of the latch. 8818 if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) { 8819 Builder.setInsertPoint(VPBB); 8820 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 8821 for (auto &Reduction : Legal->getReductionVars()) { 8822 if (CM.isInLoopReduction(Reduction.first)) 8823 continue; 8824 VPValue *Phi = Plan->getOrAddVPValue(Reduction.first); 8825 VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr()); 8826 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 8827 } 8828 } 8829 8830 std::string PlanName; 8831 raw_string_ostream RSO(PlanName); 8832 ElementCount VF = Range.Start; 8833 Plan->addVF(VF); 8834 RSO << "Initial VPlan for VF={" << VF; 8835 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) { 8836 Plan->addVF(VF); 8837 RSO << "," << VF; 8838 } 8839 RSO << "},UF>=1"; 8840 RSO.flush(); 8841 Plan->setName(PlanName); 8842 8843 return Plan; 8844 } 8845 8846 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8847 // Outer loop handling: They may require CFG and instruction level 8848 // transformations before even evaluating whether vectorization is profitable. 8849 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8850 // the vectorization pipeline. 
8851 assert(!OrigLoop->isInnermost());
8852 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8853
8854 // Create new empty VPlan
8855 auto Plan = std::make_unique<VPlan>();
8856
8857 // Build hierarchical CFG
8858 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8859 HCFGBuilder.buildHierarchicalCFG();
8860
8861 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8862 VF *= 2)
8863 Plan->addVF(VF);
8864
8865 if (EnableVPlanPredication) {
8866 VPlanPredicator VPP(*Plan);
8867 VPP.predicate();
8868
8869 // Avoid running transformation to recipes until masked code generation in
8870 // VPlan-native path is in place.
8871 return Plan;
8872 }
8873
8874 SmallPtrSet<Instruction *, 1> DeadInstructions;
8875 VPlanTransforms::VPInstructionsToVPRecipes(
8876 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8877 return Plan;
8878 }
8879
8880 // Adjust the recipes for any inloop reductions. The chain of instructions
8881 // leading from the loop exit instr to the phi needs to be converted to
8882 // reductions, with one operand being vector and the other being the scalar
8883 // reduction chain.
8884 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8885 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8886 for (auto &Reduction : CM.getInLoopReductionChains()) {
8887 PHINode *Phi = Reduction.first;
8888 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8889 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8890
8891 // ReductionOperations are ordered top-down from the phi's use to the
8892 // LoopExitValue. We keep track of the previous item (the Chain) to tell
8893 // which of the two operands will remain scalar and which will be reduced.
8894 // For minmax the chain will be the select instructions.
8895 Instruction *Chain = Phi;
8896 for (Instruction *R : ReductionOperations) {
8897 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8898 RecurKind Kind = RdxDesc.getRecurrenceKind();
8899
8900 VPValue *ChainOp = Plan->getVPValue(Chain);
8901 unsigned FirstOpId;
8902 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8903 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8904 "Expected to replace a VPWidenSelectSC");
8905 FirstOpId = 1;
8906 } else {
8907 assert(isa<VPWidenRecipe>(WidenRecipe) &&
8908 "Expected to replace a VPWidenSC");
8909 FirstOpId = 0;
8910 }
8911 unsigned VecOpId =
8912 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8913 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8914
8915 auto *CondOp = CM.foldTailByMasking()
8916 ?
RecipeBuilder.createBlockInMask(R->getParent(), Plan) 8917 : nullptr; 8918 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 8919 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI); 8920 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8921 Plan->removeVPValueFor(R); 8922 Plan->addVPValue(R, RedRecipe); 8923 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator()); 8924 WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe); 8925 WidenRecipe->eraseFromParent(); 8926 8927 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 8928 VPRecipeBase *CompareRecipe = 8929 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0))); 8930 assert(isa<VPWidenRecipe>(CompareRecipe) && 8931 "Expected to replace a VPWidenSC"); 8932 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 && 8933 "Expected no remaining users"); 8934 CompareRecipe->eraseFromParent(); 8935 } 8936 Chain = R; 8937 } 8938 } 8939 } 8940 8941 Value* LoopVectorizationPlanner::VPCallbackILV:: 8942 getOrCreateVectorValues(Value *V, unsigned Part) { 8943 return ILV.getOrCreateVectorValue(V, Part); 8944 } 8945 8946 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 8947 Value *V, const VPIteration &Instance) { 8948 return ILV.getOrCreateScalarValue(V, Instance); 8949 } 8950 8951 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 8952 VPSlotTracker &SlotTracker) const { 8953 O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 8954 IG->getInsertPos()->printAsOperand(O, false); 8955 O << ", "; 8956 getAddr()->printAsOperand(O, SlotTracker); 8957 VPValue *Mask = getMask(); 8958 if (Mask) { 8959 O << ", "; 8960 Mask->printAsOperand(O, SlotTracker); 8961 } 8962 for (unsigned i = 0; i < IG->getFactor(); ++i) 8963 if (Instruction *I = IG->getMember(i)) 8964 O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; 8965 } 8966 8967 void VPWidenCallRecipe::execute(VPTransformState &State) { 8968 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, 8969 *this, State); 8970 } 8971 8972 void VPWidenSelectRecipe::execute(VPTransformState &State) { 8973 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()), 8974 this, *this, InvariantCond, State); 8975 } 8976 8977 void VPWidenRecipe::execute(VPTransformState &State) { 8978 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); 8979 } 8980 8981 void VPWidenGEPRecipe::execute(VPTransformState &State) { 8982 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this, 8983 *this, State.UF, State.VF, IsPtrLoopInvariant, 8984 IsIndexLoopInvariant, State); 8985 } 8986 8987 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 8988 assert(!State.Instance && "Int or FP induction being replicated."); 8989 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), 8990 Trunc); 8991 } 8992 8993 void VPWidenPHIRecipe::execute(VPTransformState &State) { 8994 Value *StartV = 8995 getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr; 8996 State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF); 8997 } 8998 8999 void VPBlendRecipe::execute(VPTransformState &State) { 9000 State.ILV->setDebugLocFromInst(State.Builder, Phi); 9001 // We know that all PHIs in non-header blocks are converted into 9002 // selects, so we don't have to worry about the insertion order and we 9003 // can just use the builder. 9004 // At this point we generate the predication tree. 
There may be 9005 // duplications since this is a simple recursive scan, but future 9006 // optimizations will clean it up. 9007 9008 unsigned NumIncoming = getNumIncomingValues(); 9009 9010 // Generate a sequence of selects of the form: 9011 // SELECT(Mask3, In3, 9012 // SELECT(Mask2, In2, 9013 // SELECT(Mask1, In1, 9014 // In0))) 9015 // Note that Mask0 is never used: lanes for which no path reaches this phi and 9016 // are essentially undef are taken from In0. 9017 InnerLoopVectorizer::VectorParts Entry(State.UF); 9018 for (unsigned In = 0; In < NumIncoming; ++In) { 9019 for (unsigned Part = 0; Part < State.UF; ++Part) { 9020 // We might have single edge PHIs (blocks) - use an identity 9021 // 'select' for the first PHI operand. 9022 Value *In0 = State.get(getIncomingValue(In), Part); 9023 if (In == 0) 9024 Entry[Part] = In0; // Initialize with the first incoming value. 9025 else { 9026 // Select between the current value and the previous incoming edge 9027 // based on the incoming mask. 9028 Value *Cond = State.get(getMask(In), Part); 9029 Entry[Part] = 9030 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 9031 } 9032 } 9033 } 9034 for (unsigned Part = 0; Part < State.UF; ++Part) 9035 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 9036 } 9037 9038 void VPInterleaveRecipe::execute(VPTransformState &State) { 9039 assert(!State.Instance && "Interleave group being replicated."); 9040 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9041 getStoredValues(), getMask()); 9042 } 9043 9044 void VPReductionRecipe::execute(VPTransformState &State) { 9045 assert(!State.Instance && "Reduction being replicated."); 9046 for (unsigned Part = 0; Part < State.UF; ++Part) { 9047 RecurKind Kind = RdxDesc->getRecurrenceKind(); 9048 Value *NewVecOp = State.get(getVecOp(), Part); 9049 if (VPValue *Cond = getCondOp()) { 9050 Value *NewCond = State.get(Cond, Part); 9051 VectorType *VecTy = cast<VectorType>(NewVecOp->getType()); 9052 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 9053 Kind, VecTy->getElementType()); 9054 Constant *IdenVec = 9055 ConstantVector::getSplat(VecTy->getElementCount(), Iden); 9056 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec); 9057 NewVecOp = Select; 9058 } 9059 Value *NewRed = 9060 createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp); 9061 Value *PrevInChain = State.get(getChainOp(), Part); 9062 Value *NextInChain; 9063 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9064 NextInChain = 9065 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(), 9066 NewRed, PrevInChain); 9067 } else { 9068 NextInChain = State.Builder.CreateBinOp( 9069 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed, 9070 PrevInChain); 9071 } 9072 State.set(this, getUnderlyingInstr(), NextInChain, Part); 9073 } 9074 } 9075 9076 void VPReplicateRecipe::execute(VPTransformState &State) { 9077 if (State.Instance) { // Generate a single instance. 9078 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9079 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, 9080 *State.Instance, IsPredicated, State); 9081 // Insert scalar instance packing it into a vector. 9082 if (AlsoPack && State.VF.isVector()) { 9083 // If we're constructing lane 0, initialize to start from poison. 
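// The packing below builds the usual insertelement chain (illustrative,
// VF = 4, element type i32):
//   %v0 = insertelement <4 x i32> poison, i32 %s0, i32 0
//   %v1 = insertelement <4 x i32> %v0,    i32 %s1, i32 1
//   ...
// so the lane-0 instance must first seed the chain with a poison vector.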
9084 if (State.Instance->Lane == 0) {
9085 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9086 Value *Poison = PoisonValue::get(
9087 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9088 State.ValueMap.setVectorValue(getUnderlyingInstr(),
9089 State.Instance->Part, Poison);
9090 }
9091 State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
9092 *State.Instance);
9093 }
9094 return;
9095 }
9096
9097 // Generate scalar instances for all VF lanes of all UF parts, unless the
9098 // instruction is uniform in which case generate only the first lane for each
9099 // of the UF parts.
9100 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9101 assert((!State.VF.isScalable() || IsUniform) &&
9102 "Can't scalarize a scalable vector");
9103 for (unsigned Part = 0; Part < State.UF; ++Part)
9104 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9105 State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
9106 IsPredicated, State);
9107 }
9108
9109 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9110 assert(State.Instance && "Branch on Mask works only on single instance.");
9111
9112 unsigned Part = State.Instance->Part;
9113 unsigned Lane = State.Instance->Lane;
9114
9115 Value *ConditionBit = nullptr;
9116 VPValue *BlockInMask = getMask();
9117 if (BlockInMask) {
9118 ConditionBit = State.get(BlockInMask, Part);
9119 if (ConditionBit->getType()->isVectorTy())
9120 ConditionBit = State.Builder.CreateExtractElement(
9121 ConditionBit, State.Builder.getInt32(Lane));
9122 } else // Block in mask is all-one.
9123 ConditionBit = State.Builder.getTrue();
9124
9125 // Replace the temporary unreachable terminator with a new conditional branch,
9126 // whose two destinations will be set later when they are created.
9127 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9128 assert(isa<UnreachableInst>(CurrentTerminator) &&
9129 "Expected to replace unreachable terminator with conditional branch.");
9130 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9131 CondBr->setSuccessor(0, nullptr);
9132 ReplaceInstWithInst(CurrentTerminator, CondBr);
9133 }
9134
9135 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9136 assert(State.Instance && "Predicated instruction PHI works per instance.");
9137 Instruction *ScalarPredInst =
9138 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9139 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9140 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9141 assert(PredicatingBB && "Predicated block has no single predecessor.");
9142
9143 // By current pack/unpack logic we need to generate only a single phi node: if
9144 // a vector value for the predicated instruction exists at this point it means
9145 // the instruction has vector users only, and a phi for the vector value is
9146 // needed. In this case the recipe of the predicated instruction is marked to
9147 // also do that packing, thereby "hoisting" the insert-element sequence.
9148 // Otherwise, a phi node for the scalar value is needed.
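// In IR terms (illustrative, i32 element type), the two cases below produce
// either
//   %vphi = phi <VF x i32> [ %vec.unmodified, %predicating.bb ],
//                          [ %vec.with.new.elt, %pred.bb ]
// for a packed vector value, or
//   %sphi = phi i32 [ poison, %predicating.bb ], [ %scalar.val, %pred.bb ]
// for a scalar-only value.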
9149 unsigned Part = State.Instance->Part; 9150 Instruction *PredInst = 9151 cast<Instruction>(getOperand(0)->getUnderlyingValue()); 9152 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 9153 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 9154 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 9155 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 9156 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 9157 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 9158 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 9159 } else { 9160 Type *PredInstType = PredInst->getType(); 9161 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 9162 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB); 9163 Phi->addIncoming(ScalarPredInst, PredicatedBB); 9164 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 9165 } 9166 } 9167 9168 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9169 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9170 State.ILV->vectorizeMemoryInstruction(&Ingredient, State, 9171 StoredValue ? nullptr : getVPValue(), 9172 getAddr(), StoredValue, getMask()); 9173 } 9174 9175 // Determine how to lower the scalar epilogue, which depends on 1) optimising 9176 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9177 // predication, and 4) a TTI hook that analyses whether the loop is suitable 9178 // for predication. 9179 static ScalarEpilogueLowering getScalarEpilogueLowering( 9180 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9181 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9182 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 9183 LoopVectorizationLegality &LVL) { 9184 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9185 // don't look at hints or options, and don't request a scalar epilogue. 9186 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9187 // LoopAccessInfo (due to code dependency and not being able to reliably get 9188 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9189 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9190 // versioning when the vectorization is forced, unlike hasOptSize. So revert 9191 // back to the old way and vectorize with versioning when forced. See D81345.) 
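// Worked example of the precedence below (illustrative): with -Os
// (hasOptSize) the function returns CM_ScalarEpilogueNotAllowedOptSize;
// otherwise an explicit PreferPredicateOverEpilogue=PredicateOrDontVectorize
// option yields CM_ScalarEpilogueNotAllowedUsePredicate; otherwise a
// vectorize_predicate loop hint (FK_Enabled) yields
// CM_ScalarEpilogueNotNeededUsePredicate; and only then does the TTI hook get
// to choose predication over a plain scalar epilogue.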
9192 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9193 PGSOQueryType::IRPass) && 9194 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9195 return CM_ScalarEpilogueNotAllowedOptSize; 9196 9197 // 2) If set, obey the directives 9198 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9199 switch (PreferPredicateOverEpilogue) { 9200 case PreferPredicateTy::ScalarEpilogue: 9201 return CM_ScalarEpilogueAllowed; 9202 case PreferPredicateTy::PredicateElseScalarEpilogue: 9203 return CM_ScalarEpilogueNotNeededUsePredicate; 9204 case PreferPredicateTy::PredicateOrDontVectorize: 9205 return CM_ScalarEpilogueNotAllowedUsePredicate; 9206 }; 9207 } 9208 9209 // 3) If set, obey the hints 9210 switch (Hints.getPredicate()) { 9211 case LoopVectorizeHints::FK_Enabled: 9212 return CM_ScalarEpilogueNotNeededUsePredicate; 9213 case LoopVectorizeHints::FK_Disabled: 9214 return CM_ScalarEpilogueAllowed; 9215 }; 9216 9217 // 4) if the TTI hook indicates this is profitable, request predication. 9218 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 9219 LVL.getLAI())) 9220 return CM_ScalarEpilogueNotNeededUsePredicate; 9221 9222 return CM_ScalarEpilogueAllowed; 9223 } 9224 9225 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V, 9226 unsigned Part) { 9227 set(Def, V, Part); 9228 ILV->setVectorValue(IRDef, Part, V); 9229 } 9230 9231 // Process the loop in the VPlan-native vectorization path. This path builds 9232 // VPlan upfront in the vectorization pipeline, which allows to apply 9233 // VPlan-to-VPlan transformations from the very beginning without modifying the 9234 // input LLVM IR. 9235 static bool processLoopInVPlanNativePath( 9236 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9237 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9238 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9239 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9240 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 9241 9242 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9243 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9244 return false; 9245 } 9246 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9247 Function *F = L->getHeader()->getParent(); 9248 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9249 9250 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9251 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 9252 9253 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9254 &Hints, IAI); 9255 // Use the planner for outer loop vectorization. 9256 // TODO: CM is not used at this point inside the planner. Turn CM into an 9257 // optional argument if we don't need it in the future. 9258 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE); 9259 9260 // Get user vectorization factor. 9261 ElementCount UserVF = Hints.getWidth(); 9262 9263 // Plan how to best vectorize, return the best VF and its cost. 9264 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9265 9266 // If we are stress testing VPlan builds, do not attempt to generate vector 9267 // code. Masked vector code generation support will follow soon. 9268 // Also, do not attempt to vectorize if no vector code will be produced. 
9269 if (VPlanBuildStressTest || EnableVPlanPredication || 9270 VectorizationFactor::Disabled() == VF) 9271 return false; 9272 9273 LVP.setBestPlan(VF.Width, 1); 9274 9275 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 9276 &CM, BFI, PSI); 9277 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9278 << L->getHeader()->getParent()->getName() << "\"\n"); 9279 LVP.executePlan(LB, DT); 9280 9281 // Mark the loop as already vectorized to avoid vectorizing again. 9282 Hints.setAlreadyVectorized(); 9283 9284 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9285 return true; 9286 } 9287 9288 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9289 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9290 !EnableLoopInterleaving), 9291 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9292 !EnableLoopVectorization) {} 9293 9294 bool LoopVectorizePass::processLoop(Loop *L) { 9295 assert((EnableVPlanNativePath || L->isInnermost()) && 9296 "VPlan-native path is not enabled. Only process inner loops."); 9297 9298 #ifndef NDEBUG 9299 const std::string DebugLocStr = getDebugLocString(L); 9300 #endif /* NDEBUG */ 9301 9302 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 9303 << L->getHeader()->getParent()->getName() << "\" from " 9304 << DebugLocStr << "\n"); 9305 9306 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 9307 9308 LLVM_DEBUG( 9309 dbgs() << "LV: Loop hints:" 9310 << " force=" 9311 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9312 ? "disabled" 9313 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9314 ? "enabled" 9315 : "?")) 9316 << " width=" << Hints.getWidth() 9317 << " unroll=" << Hints.getInterleave() << "\n"); 9318 9319 // Function containing loop 9320 Function *F = L->getHeader()->getParent(); 9321 9322 // Looking at the diagnostic output is the only way to determine if a loop 9323 // was vectorized (other than looking at the IR or machine code), so it 9324 // is important to generate an optimization remark for each loop. Most of 9325 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9326 // generated as OptimizationRemark and OptimizationRemarkMissed are 9327 // less verbose reporting vectorized loops and unvectorized loops that may 9328 // benefit from vectorization, respectively. 9329 9330 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9331 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9332 return false; 9333 } 9334 9335 PredicatedScalarEvolution PSE(*SE, *L); 9336 9337 // Check if it is legal to vectorize the loop. 9338 LoopVectorizationRequirements Requirements(*ORE); 9339 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 9340 &Requirements, &Hints, DB, AC, BFI, PSI); 9341 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9342 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9343 Hints.emitRemarkWithHints(); 9344 return false; 9345 } 9346 9347 // Check the function attributes and profiles to find out if this function 9348 // should be optimized for size. 9349 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 9350 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 9351 9352 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9353 // here. They may require CFG and instruction level transformations before 9354 // even evaluating whether vectorization is profitable. 
Since we cannot modify 9355 // the incoming IR, we need to build VPlan upfront in the vectorization 9356 // pipeline. 9357 if (!L->isInnermost()) 9358 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9359 ORE, BFI, PSI, Hints); 9360 9361 assert(L->isInnermost() && "Inner loop expected."); 9362 9363 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9364 // count by optimizing for size, to minimize overheads. 9365 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9366 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9367 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 9368 << "This loop is worth vectorizing only if no scalar " 9369 << "iteration overheads are incurred."); 9370 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9371 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9372 else { 9373 LLVM_DEBUG(dbgs() << "\n"); 9374 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9375 } 9376 } 9377 9378 // Check the function attributes to see if implicit floats are allowed. 9379 // FIXME: This check doesn't seem possibly correct -- what if the loop is 9380 // an integer loop and the vector instructions selected are purely integer 9381 // vector instructions? 9382 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9383 reportVectorizationFailure( 9384 "Can't vectorize when the NoImplicitFloat attribute is used", 9385 "loop not vectorized due to NoImplicitFloat attribute", 9386 "NoImplicitFloat", ORE, L); 9387 Hints.emitRemarkWithHints(); 9388 return false; 9389 } 9390 9391 // Check if the target supports potentially unsafe FP vectorization. 9392 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9393 // for the target we're vectorizing for, to make sure none of the 9394 // additional fp-math flags can help. 9395 if (Hints.isPotentiallyUnsafe() && 9396 TTI->isFPVectorizationPotentiallyUnsafe()) { 9397 reportVectorizationFailure( 9398 "Potentially unsafe FP op prevents vectorization", 9399 "loop not vectorized due to unsafe FP support.", 9400 "UnsafeFP", ORE, L); 9401 Hints.emitRemarkWithHints(); 9402 return false; 9403 } 9404 9405 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9406 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9407 9408 // If an override option has been passed in for interleaved accesses, use it. 9409 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9410 UseInterleaved = EnableInterleavedMemAccesses; 9411 9412 // Analyze interleaved memory accesses. 9413 if (UseInterleaved) { 9414 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9415 } 9416 9417 // Use the cost model. 9418 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9419 F, &Hints, IAI); 9420 CM.collectValuesToIgnore(); 9421 9422 // Use the planner for vectorization. 9423 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE); 9424 9425 // Get user vectorization factor and interleave count. 9426 ElementCount UserVF = Hints.getWidth(); 9427 unsigned UserIC = Hints.getInterleave(); 9428 9429 // Plan how to best vectorize, return the best VF and its cost. 9430 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 9431 9432 VectorizationFactor VF = VectorizationFactor::Disabled(); 9433 unsigned IC = 1; 9434 9435 if (MaybeVF) { 9436 VF = *MaybeVF; 9437 // Select the interleave count. 
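// The interleave count picked below is a cost-model suggestion; an explicit
// user count (UserIC) overrides it further down. Combined with VF, the
// decision flags computed next lead to four outcomes (simplified summary):
//   !VectorizeLoop && !InterleaveLoop : emit missed remarks and bail out.
//   !VectorizeLoop &&  InterleaveLoop : interleave only (InnerLoopUnroller).
//    VectorizeLoop && !InterleaveLoop : vectorize with the chosen VF.
//    VectorizeLoop &&  InterleaveLoop : vectorize and interleave.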
9438 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9439 }
9440
9441 // Identify the diagnostic messages that should be produced.
9442 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9443 bool VectorizeLoop = true, InterleaveLoop = true;
9444 if (Requirements.doesNotMeet(F, L, Hints)) {
9445 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9446 "requirements.\n");
9447 Hints.emitRemarkWithHints();
9448 return false;
9449 }
9450
9451 if (VF.Width.isScalar()) {
9452 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9453 VecDiagMsg = std::make_pair(
9454 "VectorizationNotBeneficial",
9455 "the cost-model indicates that vectorization is not beneficial");
9456 VectorizeLoop = false;
9457 }
9458
9459 if (!MaybeVF && UserIC > 1) {
9460 // Tell the user interleaving was avoided up-front, despite being explicitly
9461 // requested.
9462 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9463 "interleaving should be avoided up front\n");
9464 IntDiagMsg = std::make_pair(
9465 "InterleavingAvoided",
9466 "Ignoring UserIC, because interleaving was avoided up front");
9467 InterleaveLoop = false;
9468 } else if (IC == 1 && UserIC <= 1) {
9469 // Tell the user interleaving is not beneficial.
9470 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9471 IntDiagMsg = std::make_pair(
9472 "InterleavingNotBeneficial",
9473 "the cost-model indicates that interleaving is not beneficial");
9474 InterleaveLoop = false;
9475 if (UserIC == 1) {
9476 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9477 IntDiagMsg.second +=
9478 " and is explicitly disabled or interleave count is set to 1";
9479 }
9480 } else if (IC > 1 && UserIC == 1) {
9481 // Tell the user interleaving is beneficial, but it is explicitly disabled.
9482 LLVM_DEBUG(
9483 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9484 IntDiagMsg = std::make_pair(
9485 "InterleavingBeneficialButDisabled",
9486 "the cost-model indicates that interleaving is beneficial "
9487 "but is explicitly disabled or interleave count is set to 1");
9488 InterleaveLoop = false;
9489 }
9490
9491 // Override IC if user provided an interleave count.
9492 IC = UserIC > 0 ? UserIC : IC;
9493
9494 // Emit diagnostic messages, if any.
9495 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9496 if (!VectorizeLoop && !InterleaveLoop) {
9497 // Do not vectorize or interleave the loop.
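// These diagnostics reach users through the optimization-remark machinery,
// e.g. (illustrative) via clang's -Rpass-missed=loop-vectorize and
// -Rpass-analysis=loop-vectorize flags, carrying the message strings chosen
// above such as "the cost-model indicates that vectorization is not
// beneficial".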
9498 ORE->emit([&]() { 9499 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 9500 L->getStartLoc(), L->getHeader()) 9501 << VecDiagMsg.second; 9502 }); 9503 ORE->emit([&]() { 9504 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 9505 L->getStartLoc(), L->getHeader()) 9506 << IntDiagMsg.second; 9507 }); 9508 return false; 9509 } else if (!VectorizeLoop && InterleaveLoop) { 9510 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9511 ORE->emit([&]() { 9512 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 9513 L->getStartLoc(), L->getHeader()) 9514 << VecDiagMsg.second; 9515 }); 9516 } else if (VectorizeLoop && !InterleaveLoop) { 9517 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9518 << ") in " << DebugLocStr << '\n'); 9519 ORE->emit([&]() { 9520 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 9521 L->getStartLoc(), L->getHeader()) 9522 << IntDiagMsg.second; 9523 }); 9524 } else if (VectorizeLoop && InterleaveLoop) { 9525 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 9526 << ") in " << DebugLocStr << '\n'); 9527 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 9528 } 9529 9530 LVP.setBestPlan(VF.Width, IC); 9531 9532 using namespace ore; 9533 bool DisableRuntimeUnroll = false; 9534 MDNode *OrigLoopID = L->getLoopID(); 9535 9536 if (!VectorizeLoop) { 9537 assert(IC > 1 && "interleave count should not be 1 or 0"); 9538 // If we decided that it is not legal to vectorize the loop, then 9539 // interleave it. 9540 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, 9541 BFI, PSI); 9542 LVP.executePlan(Unroller, DT); 9543 9544 ORE->emit([&]() { 9545 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 9546 L->getHeader()) 9547 << "interleaved loop (interleaved count: " 9548 << NV("InterleaveCount", IC) << ")"; 9549 }); 9550 } else { 9551 // If we decided that it is *legal* to vectorize the loop, then do it. 9552 9553 // Consider vectorizing the epilogue too if it's profitable. 9554 VectorizationFactor EpilogueVF = 9555 CM.selectEpilogueVectorizationFactor(VF.Width, LVP); 9556 if (EpilogueVF.Width.isVector()) { 9557 9558 // The first pass vectorizes the main loop and creates a scalar epilogue 9559 // to be vectorized by executing the plan (potentially with a different 9560 // factor) again shortly afterwards. 9561 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, 9562 EpilogueVF.Width.getKnownMinValue(), 1); 9563 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, 9564 &LVL, &CM, BFI, PSI); 9565 9566 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); 9567 LVP.executePlan(MainILV, DT); 9568 ++LoopsVectorized; 9569 9570 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9571 formLCSSARecursively(*L, *DT, LI, SE); 9572 9573 // Second pass vectorizes the epilogue and adjusts the control flow 9574 // edges from the first pass. 
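// Resulting structure (illustrative sketch): after the two passes the
// function contains, in order, the main vector loop (MainLoopVF x
// MainLoopUF), the epilogue vector loop (EpilogueVF x EpilogueUF) and the
// original scalar loop as remainder, with the minimum-iteration checks
// emitted by the skeletons selecting between them at run time.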
9575 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); 9576 EPI.MainLoopVF = EPI.EpilogueVF; 9577 EPI.MainLoopUF = EPI.EpilogueUF; 9578 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 9579 ORE, EPI, &LVL, &CM, BFI, PSI); 9580 LVP.executePlan(EpilogILV, DT); 9581 ++LoopsEpilogueVectorized; 9582 9583 if (!MainILV.areSafetyChecksAdded()) 9584 DisableRuntimeUnroll = true; 9585 } else { 9586 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 9587 &LVL, &CM, BFI, PSI); 9588 LVP.executePlan(LB, DT); 9589 ++LoopsVectorized; 9590 9591 // Add metadata to disable runtime unrolling a scalar loop when there are 9592 // no runtime checks about strides and memory. A scalar loop that is 9593 // rarely used is not worth unrolling. 9594 if (!LB.areSafetyChecksAdded()) 9595 DisableRuntimeUnroll = true; 9596 } 9597 9598 // Report the vectorization decision. 9599 ORE->emit([&]() { 9600 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 9601 L->getHeader()) 9602 << "vectorized loop (vectorization width: " 9603 << NV("VectorizationFactor", VF.Width) 9604 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 9605 }); 9606 } 9607 9608 Optional<MDNode *> RemainderLoopID = 9609 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 9610 LLVMLoopVectorizeFollowupEpilogue}); 9611 if (RemainderLoopID.hasValue()) { 9612 L->setLoopID(RemainderLoopID.getValue()); 9613 } else { 9614 if (DisableRuntimeUnroll) 9615 AddRuntimeUnrollDisableMetaData(L); 9616 9617 // Mark the loop as already vectorized to avoid vectorizing again. 9618 Hints.setAlreadyVectorized(); 9619 } 9620 9621 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9622 return true; 9623 } 9624 9625 LoopVectorizeResult LoopVectorizePass::runImpl( 9626 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 9627 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 9628 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_, 9629 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 9630 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 9631 SE = &SE_; 9632 LI = &LI_; 9633 TTI = &TTI_; 9634 DT = &DT_; 9635 BFI = &BFI_; 9636 TLI = TLI_; 9637 AA = &AA_; 9638 AC = &AC_; 9639 GetLAA = &GetLAA_; 9640 DB = &DB_; 9641 ORE = &ORE_; 9642 PSI = PSI_; 9643 9644 // Don't attempt if 9645 // 1. the target claims to have no vector registers, and 9646 // 2. interleaving won't help ILP. 9647 // 9648 // The second condition is necessary because, even if the target has no 9649 // vector registers, loop vectorization may still enable scalar 9650 // interleaving. 9651 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 9652 TTI->getMaxInterleaveFactor(1) < 2) 9653 return LoopVectorizeResult(false, false); 9654 9655 bool Changed = false, CFGChanged = false; 9656 9657 // The vectorizer requires loops to be in simplified form. 9658 // Since simplification may add new inner loops, it has to run before the 9659 // legality and profitability checks. This means running the loop vectorizer 9660 // will simplify all loops, regardless of whether anything end up being 9661 // vectorized. 9662 for (auto &L : *LI) 9663 Changed |= CFGChanged |= 9664 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 9665 9666 // Build up a worklist of inner-loops to vectorize. This is necessary as 9667 // the act of vectorizing or partially unrolling a loop creates new loops 9668 // and can invalidate iterators across the loops. 
9669 SmallVector<Loop *, 8> Worklist; 9670 9671 for (Loop *L : *LI) 9672 collectSupportedLoops(*L, LI, ORE, Worklist); 9673 9674 LoopsAnalyzed += Worklist.size(); 9675 9676 // Now walk the identified inner loops. 9677 while (!Worklist.empty()) { 9678 Loop *L = Worklist.pop_back_val(); 9679 9680 // For the inner loops we actually process, form LCSSA to simplify the 9681 // transform. 9682 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 9683 9684 Changed |= CFGChanged |= processLoop(L); 9685 } 9686 9687 // Process each loop nest in the function. 9688 return LoopVectorizeResult(Changed, CFGChanged); 9689 } 9690 9691 PreservedAnalyses LoopVectorizePass::run(Function &F, 9692 FunctionAnalysisManager &AM) { 9693 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 9694 auto &LI = AM.getResult<LoopAnalysis>(F); 9695 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 9696 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 9697 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 9698 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 9699 auto &AA = AM.getResult<AAManager>(F); 9700 auto &AC = AM.getResult<AssumptionAnalysis>(F); 9701 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 9702 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 9703 MemorySSA *MSSA = EnableMSSALoopDependency 9704 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 9705 : nullptr; 9706 9707 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 9708 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 9709 [&](Loop &L) -> const LoopAccessInfo & { 9710 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, 9711 TLI, TTI, nullptr, MSSA}; 9712 return LAM.getResult<LoopAccessAnalysis>(L, AR); 9713 }; 9714 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 9715 ProfileSummaryInfo *PSI = 9716 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 9717 LoopVectorizeResult Result = 9718 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 9719 if (!Result.MadeAnyChange) 9720 return PreservedAnalyses::all(); 9721 PreservedAnalyses PA; 9722 9723 // We currently do not preserve loopinfo/dominator analyses with outer loop 9724 // vectorization. Until this is addressed, mark these analyses as preserved 9725 // only for non-VPlan-native path. 9726 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 9727 if (!EnableVPlanNativePath) { 9728 PA.preserve<LoopAnalysis>(); 9729 PA.preserve<DominatorTreeAnalysis>(); 9730 } 9731 PA.preserve<BasicAA>(); 9732 PA.preserve<GlobalsAA>(); 9733 if (!Result.MadeCFGChange) 9734 PA.preserveSet<CFGAnalyses>(); 9735 return PA; 9736 } 9737
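// Example usage (illustrative, not part of the pass): the pass can be
// exercised on its own with the new pass manager, e.g.
//   opt -passes=loop-vectorize -S input.ll
// where input.ll contains a straightforwardly vectorizable loop such as
//   define void @saxpy(float* noalias %a, float* readonly %b, float %x,
//                      i64 %n) {
//   entry:
//     br label %loop
//   loop:
//     %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
//     %pb = getelementptr inbounds float, float* %b, i64 %i
//     %vb = load float, float* %pb, align 4
//     %mul = fmul float %vb, %x
//     %pa = getelementptr inbounds float, float* %a, i64 %i
//     %va = load float, float* %pa, align 4
//     %add = fadd float %va, %mul
//     store float %add, float* %pa, align 4
//     %i.next = add nuw nsw i64 %i, 1
//     %cond = icmp eq i64 %i.next, %n
//     br i1 %cond, label %exit, label %loop
//   exit:
//     ret void
//   }
// With default settings on a target with vector registers this is rewritten
// into wide <VF x float> loads, fmuls, fadds and stores plus a scalar
// remainder loop.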