//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
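  // (Illustrative note, not required by the algorithm: under typical x86 data
  // layouts an x86_fp80 carries 80 bits of data but is allocated in 96 or 128
  // bits, so an array of x86_fp80 has padding between elements and the type
  // is treated as irregular here.)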
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L as defined
/// by the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
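///
/// A rough usage sketch (illustrative only; the analyses and the VF/UF values
/// shown are assumptions, the real driver derives them from the planner and
/// cost model):
/// \code
///   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE,
///                          /*VecWidth=*/4, /*UnrollFactor=*/2, &LVL, &CM);
///   LB.createVectorizedLoopSkeleton();
///   // ...widen the loop body instruction by instruction...
///   LB.fixVectorizedLoop();
/// \endcode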
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use.
  /// Otherwise, if the use is scalar, we can use the existing scalar
  /// definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to
  /// with the base address given in \p Addr, optionally masking the vector
  /// operations if \p BlockInMask is non-null. Use \p State to translate given
  /// VPValues to IR values in the vectorized loop.
  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
                                VPValue *Addr, VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr,
                                  VPValue *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi.
  /// This is the second phase of vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
  /// Applies dynamic knowledge to simplify SCEV expressions and converts them
  /// to a more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
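///
/// A rough sketch of how a driver is expected to query this model
/// (illustrative only; the local names and the VectorizationFactor fields
/// shown are assumptions about the planner's side of the interface):
/// \code
///   if (Optional<unsigned> MaxVF = CM.computeMaxVF()) {
///     VectorizationFactor VF = CM.selectVectorizationFactor(*MaxVF);
///     unsigned IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
///     (void)IC;
///   }
/// \endcode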
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
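    // (The results are memoized in the per-VF Uniforms/Scalars maps, so this
    // is a no-op for a VF that has already been analyzed, and trivial for
    // VF == 1 where every value stays scalar.)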
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    MaybeAlign Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it was not ruled out
  /// by optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either the vector version isn't available, or it is
  /// too expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
1341 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1342
1343 /// The cost computation for Gather/Scatter instruction.
1344 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1345
1346 /// The cost computation for widening instruction \p I with consecutive
1347 /// memory access.
1348 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1349
1350 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1351 /// Load: scalar load + broadcast.
1352 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1353 /// element)
1354 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1355
1356 /// Estimate the overhead of scalarizing an instruction. This is a
1357 /// convenience wrapper for the type-based getScalarizationOverhead API.
1358 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1359
1360 /// Returns whether the instruction is a load or store and will be emitted
1361 /// as a vector operation.
1362 bool isConsecutiveLoadOrStore(Instruction *I);
1363
1364 /// Returns true if an artificially high cost for emulated masked memrefs
1365 /// should be used.
1366 bool useEmulatedMaskMemRefHack(Instruction *I);
1367
1368 /// Map of scalar integer values to the smallest bitwidth they can be legally
1369 /// represented as. The vector equivalents of these values should be truncated
1370 /// to this type.
1371 MapVector<Instruction *, uint64_t> MinBWs;
1372
1373 /// A type representing the costs for instructions if they were to be
1374 /// scalarized rather than vectorized. The entries are Instruction-Cost
1375 /// pairs.
1376 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1377
1378 /// A set containing all BasicBlocks that are known to be present after
1379 /// vectorization as predicated blocks.
1380 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1381
1382 /// Records whether it is allowed to have the original scalar loop execute at
1383 /// least once. This may be needed as a fallback loop in case runtime
1384 /// aliasing/dependence checks fail, or to handle the tail/remainder
1385 /// iterations when the trip count is unknown or doesn't divide by the VF,
1386 /// or as a peel-loop to handle gaps in interleave-groups.
1387 /// Under optsize and when the trip count is very small we don't allow any
1388 /// iterations to execute in the scalar loop.
1389 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1390
1391 /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1392 bool FoldTailByMasking = false;
1393
1394 /// A map holding scalar costs for different vectorization factors. The
1395 /// presence of a cost for an instruction in the mapping indicates that the
1396 /// instruction will be scalarized when vectorizing with the associated
1397 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1398 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1399
1400 /// Holds the instructions known to be uniform after vectorization.
1401 /// The data is collected per VF.
1402 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1403
1404 /// Holds the instructions known to be scalar after vectorization.
1405 /// The data is collected per VF.
1406 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1407
1408 /// Holds the instructions (address computations) that are forced to be
1409 /// scalarized.
1410 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1411 1412 /// Returns the expected difference in cost from scalarizing the expression 1413 /// feeding a predicated instruction \p PredInst. The instructions to 1414 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1415 /// non-negative return value implies the expression will be scalarized. 1416 /// Currently, only single-use chains are considered for scalarization. 1417 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1418 unsigned VF); 1419 1420 /// Collect the instructions that are uniform after vectorization. An 1421 /// instruction is uniform if we represent it with a single scalar value in 1422 /// the vectorized loop corresponding to each vector iteration. Examples of 1423 /// uniform instructions include pointer operands of consecutive or 1424 /// interleaved memory accesses. Note that although uniformity implies an 1425 /// instruction will be scalar, the reverse is not true. In general, a 1426 /// scalarized instruction will be represented by VF scalar values in the 1427 /// vectorized loop, each corresponding to an iteration of the original 1428 /// scalar loop. 1429 void collectLoopUniforms(unsigned VF); 1430 1431 /// Collect the instructions that are scalar after vectorization. An 1432 /// instruction is scalar if it is known to be uniform or will be scalarized 1433 /// during vectorization. Non-uniform scalarized instructions will be 1434 /// represented by VF values in the vectorized loop, each corresponding to an 1435 /// iteration of the original scalar loop. 1436 void collectLoopScalars(unsigned VF); 1437 1438 /// Keeps cost model vectorization decision and cost for instructions. 1439 /// Right now it is used for memory instructions only. 1440 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1441 std::pair<InstWidening, unsigned>>; 1442 1443 DecisionList WideningDecisions; 1444 1445 /// Returns true if \p V is expected to be vectorized and it needs to be 1446 /// extracted. 1447 bool needsExtract(Value *V, unsigned VF) const { 1448 Instruction *I = dyn_cast<Instruction>(V); 1449 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1450 return false; 1451 1452 // Assume we can vectorize V (and hence we need extraction) if the 1453 // scalars are not computed yet. This can happen, because it is called 1454 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1455 // the scalars are collected. That should be a safe assumption in most 1456 // cases, because we check if the operands have vectorizable types 1457 // beforehand in LoopVectorizationLegality. 1458 return Scalars.find(VF) == Scalars.end() || 1459 !isScalarAfterVectorization(I, VF); 1460 }; 1461 1462 /// Returns a range containing only operands needing to be extracted. 1463 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1464 unsigned VF) { 1465 return SmallVector<Value *, 4>(make_filter_range( 1466 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1467 } 1468 1469 public: 1470 /// The loop that we evaluate. 1471 Loop *TheLoop; 1472 1473 /// Predicated scalar evolution analysis. 1474 PredicatedScalarEvolution &PSE; 1475 1476 /// Loop Info analysis. 1477 LoopInfo *LI; 1478 1479 /// Vectorization legality. 1480 LoopVectorizationLegality *Legal; 1481 1482 /// Vector target information. 1483 const TargetTransformInfo &TTI; 1484 1485 /// Target Library Info. 
1486 const TargetLibraryInfo *TLI; 1487 1488 /// Demanded bits analysis. 1489 DemandedBits *DB; 1490 1491 /// Assumption cache. 1492 AssumptionCache *AC; 1493 1494 /// Interface to emit optimization remarks. 1495 OptimizationRemarkEmitter *ORE; 1496 1497 const Function *TheFunction; 1498 1499 /// Loop Vectorize Hint. 1500 const LoopVectorizeHints *Hints; 1501 1502 /// The interleave access information contains groups of interleaved accesses 1503 /// with the same stride and close to each other. 1504 InterleavedAccessInfo &InterleaveInfo; 1505 1506 /// Values to ignore in the cost model. 1507 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1508 1509 /// Values to ignore in the cost model when VF > 1. 1510 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1511 }; 1512 1513 } // end namespace llvm 1514 1515 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1516 // vectorization. The loop needs to be annotated with #pragma omp simd 1517 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1518 // vector length information is not provided, vectorization is not considered 1519 // explicit. Interleave hints are not allowed either. These limitations will be 1520 // relaxed in the future. 1521 // Please, note that we are currently forced to abuse the pragma 'clang 1522 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1523 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1524 // provides *explicit vectorization hints* (LV can bypass legal checks and 1525 // assume that vectorization is legal). However, both hints are implemented 1526 // using the same metadata (llvm.loop.vectorize, processed by 1527 // LoopVectorizeHints). This will be fixed in the future when the native IR 1528 // representation for pragma 'omp simd' is introduced. 1529 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1530 OptimizationRemarkEmitter *ORE) { 1531 assert(!OuterLp->empty() && "This is not an outer loop"); 1532 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1533 1534 // Only outer loops with an explicit vectorization hint are supported. 1535 // Unannotated outer loops are ignored. 1536 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1537 return false; 1538 1539 Function *Fn = OuterLp->getHeader()->getParent(); 1540 if (!Hints.allowVectorization(Fn, OuterLp, 1541 true /*VectorizeOnlyWhenForced*/)) { 1542 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1543 return false; 1544 } 1545 1546 if (Hints.getInterleave() > 1) { 1547 // TODO: Interleave support is future work. 1548 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1549 "outer loops.\n"); 1550 Hints.emitRemarkWithHints(); 1551 return false; 1552 } 1553 1554 return true; 1555 } 1556 1557 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1558 OptimizationRemarkEmitter *ORE, 1559 SmallVectorImpl<Loop *> &V) { 1560 // Collect inner loops and outer loops without irreducible control flow. For 1561 // now, only collect outer loops that have explicit vectorization hints. If we 1562 // are stress testing the VPlan H-CFG construction, we collect the outermost 1563 // loop of every loop nest. 
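// E.g. (illustrative) for a two-deep nest
//   for (i ...)     // outer loop O
//     for (j ...)   // inner loop I
// only I is collected by default; O is collected instead when
// EnableVPlanNativePath is set and O carries an explicit hint such as
// '#pragma omp simd simdlen(4)', or when VPlanBuildStressTest is set.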
1564 if (L.empty() || VPlanBuildStressTest || 1565 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1566 LoopBlocksRPO RPOT(&L); 1567 RPOT.perform(LI); 1568 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1569 V.push_back(&L); 1570 // TODO: Collect inner loops inside marked outer loops in case 1571 // vectorization fails for the outer loop. Do not invoke 1572 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1573 // already known to be reducible. We can use an inherited attribute for 1574 // that. 1575 return; 1576 } 1577 } 1578 for (Loop *InnerL : L) 1579 collectSupportedLoops(*InnerL, LI, ORE, V); 1580 } 1581 1582 namespace { 1583 1584 /// The LoopVectorize Pass. 1585 struct LoopVectorize : public FunctionPass { 1586 /// Pass identification, replacement for typeid 1587 static char ID; 1588 1589 LoopVectorizePass Impl; 1590 1591 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1592 bool VectorizeOnlyWhenForced = false) 1593 : FunctionPass(ID) { 1594 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1595 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1596 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1597 } 1598 1599 bool runOnFunction(Function &F) override { 1600 if (skipFunction(F)) 1601 return false; 1602 1603 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1604 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1605 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1606 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1607 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1608 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1609 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1610 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1611 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1612 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1613 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1614 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1615 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1616 1617 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1618 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1619 1620 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1621 GetLAA, *ORE, PSI); 1622 } 1623 1624 void getAnalysisUsage(AnalysisUsage &AU) const override { 1625 AU.addRequired<AssumptionCacheTracker>(); 1626 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1627 AU.addRequired<DominatorTreeWrapperPass>(); 1628 AU.addRequired<LoopInfoWrapperPass>(); 1629 AU.addRequired<ScalarEvolutionWrapperPass>(); 1630 AU.addRequired<TargetTransformInfoWrapperPass>(); 1631 AU.addRequired<AAResultsWrapperPass>(); 1632 AU.addRequired<LoopAccessLegacyAnalysis>(); 1633 AU.addRequired<DemandedBitsWrapperPass>(); 1634 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1635 AU.addRequired<InjectTLIMappingsLegacy>(); 1636 1637 // We currently do not preserve loopinfo/dominator analyses with outer loop 1638 // vectorization. Until this is addressed, mark these analyses as preserved 1639 // only for non-VPlan-native path. 1640 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1641 if (!EnableVPlanNativePath) { 1642 AU.addPreserved<LoopInfoWrapperPass>(); 1643 AU.addPreserved<DominatorTreeWrapperPass>(); 1644 } 1645 1646 AU.addPreserved<BasicAAWrapperPass>(); 1647 AU.addPreserved<GlobalsAAWrapperPass>(); 1648 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1649 } 1650 }; 1651 1652 } // end anonymous namespace 1653 1654 //===----------------------------------------------------------------------===// 1655 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1656 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1657 //===----------------------------------------------------------------------===// 1658 1659 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1660 // We need to place the broadcast of invariant variables outside the loop, 1661 // but only if it's proven safe to do so. Else, broadcast will be inside 1662 // vector loop body. 1663 Instruction *Instr = dyn_cast<Instruction>(V); 1664 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1665 (!Instr || 1666 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1667 // Place the code for broadcasting invariant variables in the new preheader. 1668 IRBuilder<>::InsertPointGuard Guard(Builder); 1669 if (SafeToHoist) 1670 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1671 1672 // Broadcast the scalar into all locations in the vector. 1673 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1674 1675 return Shuf; 1676 } 1677 1678 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1679 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1680 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1681 "Expected either an induction phi-node or a truncate of it!"); 1682 Value *Start = II.getStartValue(); 1683 1684 // Construct the initial value of the vector IV in the vector loop preheader 1685 auto CurrIP = Builder.saveIP(); 1686 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1687 if (isa<TruncInst>(EntryVal)) { 1688 assert(Start->getType()->isIntegerTy() && 1689 "Truncation requires an integer type"); 1690 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1691 Step = Builder.CreateTrunc(Step, TruncType); 1692 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1693 } 1694 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1695 Value *SteppedStart = 1696 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1697 1698 // We create vector phi nodes for both integer and floating-point induction 1699 // variables. Here, we determine the kind of arithmetic we will perform. 1700 Instruction::BinaryOps AddOp; 1701 Instruction::BinaryOps MulOp; 1702 if (Step->getType()->isIntegerTy()) { 1703 AddOp = Instruction::Add; 1704 MulOp = Instruction::Mul; 1705 } else { 1706 AddOp = II.getInductionOpcode(); 1707 MulOp = Instruction::FMul; 1708 } 1709 1710 // Multiply the vectorization factor by the step using integer or 1711 // floating-point arithmetic as appropriate. 1712 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1713 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1714 1715 // Create a vector splat to use in the induction update. 1716 // 1717 // FIXME: If the step is non-constant, we create the vector splat with 1718 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1719 // handle a constant vector splat. 1720 Value *SplatVF = 1721 isa<Constant>(Mul) 1722 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1723 : Builder.CreateVectorSplat(VF, Mul); 1724 Builder.restoreIP(CurrIP); 1725 1726 // We may need to add the step a number of times, depending on the unroll 1727 // factor. The last of those goes into the PHI. 1728 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1729 &*LoopVectorBody->getFirstInsertionPt()); 1730 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1731 Instruction *LastInduction = VecInd; 1732 for (unsigned Part = 0; Part < UF; ++Part) { 1733 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1734 1735 if (isa<TruncInst>(EntryVal)) 1736 addMetadata(LastInduction, EntryVal); 1737 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1738 1739 LastInduction = cast<Instruction>(addFastMathFlag( 1740 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1741 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1742 } 1743 1744 // Move the last step to the end of the latch block. This ensures consistent 1745 // placement of all induction updates. 1746 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1747 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1748 auto *ICmp = cast<Instruction>(Br->getCondition()); 1749 LastInduction->moveBefore(ICmp); 1750 LastInduction->setName("vec.ind.next"); 1751 1752 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1753 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1754 } 1755 1756 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1757 return Cost->isScalarAfterVectorization(I, VF) || 1758 Cost->isProfitableToScalarize(I, VF); 1759 } 1760 1761 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1762 if (shouldScalarizeInstruction(IV)) 1763 return true; 1764 auto isScalarInst = [&](User *U) -> bool { 1765 auto *I = cast<Instruction>(U); 1766 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1767 }; 1768 return llvm::any_of(IV->users(), isScalarInst); 1769 } 1770 1771 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1772 const InductionDescriptor &ID, const Instruction *EntryVal, 1773 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1774 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1775 "Expected either an induction phi-node or a truncate of it!"); 1776 1777 // This induction variable is not the phi from the original loop but the 1778 // newly-created IV based on the proof that casted Phi is equal to the 1779 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1780 // re-uses the same InductionDescriptor that original IV uses but we don't 1781 // have to do any recording in this case - that is done when original IV is 1782 // processed. 1783 if (isa<TruncInst>(EntryVal)) 1784 return; 1785 1786 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1787 if (Casts.empty()) 1788 return; 1789 // Only the first Cast instruction in the Casts vector is of interest. 1790 // The rest of the Casts (if exist) have no uses outside the 1791 // induction update chain itself. 
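// E.g. (illustrative): for an induction phi %iv whose descriptor records the
// cast
//   %iv.cast = zext i32 %iv to i64
// the value created here for the widened %iv is recorded for %iv.cast as
// well, so users of the cast are served directly from VectorLoopValueMap.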
1792 Instruction *CastInst = *Casts.begin(); 1793 if (Lane < UINT_MAX) 1794 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1795 else 1796 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1797 } 1798 1799 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1800 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1801 "Primary induction variable must have an integer type"); 1802 1803 auto II = Legal->getInductionVars().find(IV); 1804 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1805 1806 auto ID = II->second; 1807 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1808 1809 // The scalar value to broadcast. This will be derived from the canonical 1810 // induction variable. 1811 Value *ScalarIV = nullptr; 1812 1813 // The value from the original loop to which we are mapping the new induction 1814 // variable. 1815 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1816 1817 // True if we have vectorized the induction variable. 1818 auto VectorizedIV = false; 1819 1820 // Determine if we want a scalar version of the induction variable. This is 1821 // true if the induction variable itself is not widened, or if it has at 1822 // least one user in the loop that is not widened. 1823 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); 1824 1825 // Generate code for the induction step. Note that induction steps are 1826 // required to be loop-invariant 1827 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && 1828 "Induction step should be loop invariant"); 1829 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1830 Value *Step = nullptr; 1831 if (PSE.getSE()->isSCEVable(IV->getType())) { 1832 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1833 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), 1834 LoopVectorPreHeader->getTerminator()); 1835 } else { 1836 Step = cast<SCEVUnknown>(ID.getStep())->getValue(); 1837 } 1838 1839 // Try to create a new independent vector induction variable. If we can't 1840 // create the phi node, we will splat the scalar induction variable in each 1841 // loop iteration. 1842 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { 1843 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1844 VectorizedIV = true; 1845 } 1846 1847 // If we haven't yet vectorized the induction variable, or if we will create 1848 // a scalar one, we need to define the scalar induction variable and step 1849 // values. If we were given a truncation type, truncate the canonical 1850 // induction variable and step. Otherwise, derive these values from the 1851 // induction descriptor. 1852 if (!VectorizedIV || NeedsScalarIV) { 1853 ScalarIV = Induction; 1854 if (IV != OldInduction) { 1855 ScalarIV = IV->getType()->isIntegerTy() 1856 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1857 : Builder.CreateCast(Instruction::SIToFP, Induction, 1858 IV->getType()); 1859 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1860 ScalarIV->setName("offset.idx"); 1861 } 1862 if (Trunc) { 1863 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1864 assert(Step->getType()->isIntegerTy() && 1865 "Truncation requires an integer step"); 1866 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1867 Step = Builder.CreateTrunc(Step, TruncType); 1868 } 1869 } 1870 1871 // If we haven't yet vectorized the induction variable, splat the scalar 1872 // induction variable, and build the necessary step vectors. 1873 // TODO: Don't do it unless the vectorized IV is really required. 1874 if (!VectorizedIV) { 1875 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1876 for (unsigned Part = 0; Part < UF; ++Part) { 1877 Value *EntryPart = 1878 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1879 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1880 if (Trunc) 1881 addMetadata(EntryPart, Trunc); 1882 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1883 } 1884 } 1885 1886 // If an induction variable is only used for counting loop iterations or 1887 // calculating addresses, it doesn't need to be widened. Create scalar steps 1888 // that can be used by instructions we will later scalarize. Note that the 1889 // addition of the scalar steps will not increase the number of instructions 1890 // in the loop in the common case prior to InstCombine. We will be trading 1891 // one vector extract for each scalar step. 1892 if (NeedsScalarIV) 1893 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1894 } 1895 1896 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1897 Instruction::BinaryOps BinOp) { 1898 // Create and check the types. 1899 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1900 int VLen = Val->getType()->getVectorNumElements(); 1901 1902 Type *STy = Val->getType()->getScalarType(); 1903 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1904 "Induction Step must be an integer or FP"); 1905 assert(Step->getType() == STy && "Step has wrong type"); 1906 1907 SmallVector<Constant *, 8> Indices; 1908 1909 if (STy->isIntegerTy()) { 1910 // Create a vector of consecutive numbers from zero to VF. 1911 for (int i = 0; i < VLen; ++i) 1912 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1913 1914 // Add the consecutive indices to the vector value. 1915 Constant *Cv = ConstantVector::get(Indices); 1916 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1917 Step = Builder.CreateVectorSplat(VLen, Step); 1918 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1919 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1920 // which can be found from the original scalar operations. 1921 Step = Builder.CreateMul(Cv, Step); 1922 return Builder.CreateAdd(Val, Step, "induction"); 1923 } 1924 1925 // Floating point induction. 1926 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1927 "Binary Opcode should be specified for FP induction"); 1928 // Create a vector of consecutive numbers from zero to VF. 1929 for (int i = 0; i < VLen; ++i) 1930 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1931 1932 // Add the consecutive indices to the vector value. 
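// E.g. (illustrative) for VF = 4, StartIdx = 0, Step = 2.0 and BinOp = FAdd,
// the result below is Val + <0.0, 1.0, 2.0, 3.0> * <2.0, 2.0, 2.0, 2.0>,
// i.e. the lanes of Val offset by <0.0, 2.0, 4.0, 6.0>.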
1933 Constant *Cv = ConstantVector::get(Indices); 1934 1935 Step = Builder.CreateVectorSplat(VLen, Step); 1936 1937 // Floating point operations had to be 'fast' to enable the induction. 1938 FastMathFlags Flags; 1939 Flags.setFast(); 1940 1941 Value *MulOp = Builder.CreateFMul(Cv, Step); 1942 if (isa<Instruction>(MulOp)) 1943 // Have to check, MulOp may be a constant 1944 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1945 1946 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1947 if (isa<Instruction>(BOp)) 1948 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1949 return BOp; 1950 } 1951 1952 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1953 Instruction *EntryVal, 1954 const InductionDescriptor &ID) { 1955 // We shouldn't have to build scalar steps if we aren't vectorizing. 1956 assert(VF > 1 && "VF should be greater than one"); 1957 1958 // Get the value type and ensure it and the step have the same integer type. 1959 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1960 assert(ScalarIVTy == Step->getType() && 1961 "Val and Step should have the same type"); 1962 1963 // We build scalar steps for both integer and floating-point induction 1964 // variables. Here, we determine the kind of arithmetic we will perform. 1965 Instruction::BinaryOps AddOp; 1966 Instruction::BinaryOps MulOp; 1967 if (ScalarIVTy->isIntegerTy()) { 1968 AddOp = Instruction::Add; 1969 MulOp = Instruction::Mul; 1970 } else { 1971 AddOp = ID.getInductionOpcode(); 1972 MulOp = Instruction::FMul; 1973 } 1974 1975 // Determine the number of scalars we need to generate for each unroll 1976 // iteration. If EntryVal is uniform, we only need to generate the first 1977 // lane. Otherwise, we generate all VF values. 1978 unsigned Lanes = 1979 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1980 : VF; 1981 // Compute the scalar steps and save the results in VectorLoopValueMap. 1982 for (unsigned Part = 0; Part < UF; ++Part) { 1983 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1984 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1985 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1986 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1987 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1988 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1989 } 1990 } 1991 } 1992 1993 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 1994 assert(V != Induction && "The new induction variable should not be used."); 1995 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 1996 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 1997 1998 // If we have a stride that is replaced by one, do it here. Defer this for 1999 // the VPlan-native path until we start running Legal checks in that path. 2000 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2001 V = ConstantInt::get(V->getType(), 1); 2002 2003 // If we have a vector mapped to this value, return it. 2004 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2005 return VectorLoopValueMap.getVectorValue(V, Part); 2006 2007 // If the value has not been vectorized, check if it has been scalarized 2008 // instead. If it has been scalarized, and we actually need the value in 2009 // vector form, we will construct the vector values on demand. 
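// E.g. (illustrative): if V was scalarized into %v0..%v3 for VF = 4 and is
// not uniform, the packing below builds the vector one lane at a time,
//   %p0 = insertelement <4 x i32> undef, i32 %v0, i32 0
//   ...
//   %p3 = insertelement <4 x i32> %p2, i32 %v3, i32 3
// and caches the result so the insertelement sequence is emitted only once.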
2010 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2011 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2012 2013 // If we've scalarized a value, that value should be an instruction. 2014 auto *I = cast<Instruction>(V); 2015 2016 // If we aren't vectorizing, we can just copy the scalar map values over to 2017 // the vector map. 2018 if (VF == 1) { 2019 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2020 return ScalarValue; 2021 } 2022 2023 // Get the last scalar instruction we generated for V and Part. If the value 2024 // is known to be uniform after vectorization, this corresponds to lane zero 2025 // of the Part unroll iteration. Otherwise, the last instruction is the one 2026 // we created for the last vector lane of the Part unroll iteration. 2027 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2028 auto *LastInst = cast<Instruction>( 2029 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2030 2031 // Set the insert point after the last scalarized instruction. This ensures 2032 // the insertelement sequence will directly follow the scalar definitions. 2033 auto OldIP = Builder.saveIP(); 2034 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2035 Builder.SetInsertPoint(&*NewIP); 2036 2037 // However, if we are vectorizing, we need to construct the vector values. 2038 // If the value is known to be uniform after vectorization, we can just 2039 // broadcast the scalar value corresponding to lane zero for each unroll 2040 // iteration. Otherwise, we construct the vector values using insertelement 2041 // instructions. Since the resulting vectors are stored in 2042 // VectorLoopValueMap, we will only generate the insertelements once. 2043 Value *VectorValue = nullptr; 2044 if (Cost->isUniformAfterVectorization(I, VF)) { 2045 VectorValue = getBroadcastInstrs(ScalarValue); 2046 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2047 } else { 2048 // Initialize packing with insertelements to start from undef. 2049 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2050 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2051 for (unsigned Lane = 0; Lane < VF; ++Lane) 2052 packScalarIntoVectorValue(V, {Part, Lane}); 2053 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2054 } 2055 Builder.restoreIP(OldIP); 2056 return VectorValue; 2057 } 2058 2059 // If this scalar is unknown, assume that it is a constant or that it is 2060 // loop invariant. Broadcast V and save the value for future uses. 2061 Value *B = getBroadcastInstrs(V); 2062 VectorLoopValueMap.setVectorValue(V, Part, B); 2063 return B; 2064 } 2065 2066 Value * 2067 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2068 const VPIteration &Instance) { 2069 // If the value is not an instruction contained in the loop, it should 2070 // already be scalar. 2071 if (OrigLoop->isLoopInvariant(V)) 2072 return V; 2073 2074 assert(Instance.Lane > 0 2075 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2076 : true && "Uniform values only have lane zero"); 2077 2078 // If the value from the original loop has not been vectorized, it is 2079 // represented by UF x VF scalar values in the new loop. Return the requested 2080 // scalar value. 2081 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2082 return VectorLoopValueMap.getScalarValue(V, Instance); 2083 2084 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2085 // for the given unroll part. 
If this entry is not a vector type (i.e., the 2086 // vectorization factor is one), there is no need to generate an 2087 // extractelement instruction. 2088 auto *U = getOrCreateVectorValue(V, Instance.Part); 2089 if (!U->getType()->isVectorTy()) { 2090 assert(VF == 1 && "Value not scalarized has non-vector type"); 2091 return U; 2092 } 2093 2094 // Otherwise, the value from the original loop has been vectorized and is 2095 // represented by UF vector values. Extract and return the requested scalar 2096 // value from the appropriate vector lane. 2097 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2098 } 2099 2100 void InnerLoopVectorizer::packScalarIntoVectorValue( 2101 Value *V, const VPIteration &Instance) { 2102 assert(V != Induction && "The new induction variable should not be used."); 2103 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2104 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2105 2106 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2107 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2108 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2109 Builder.getInt32(Instance.Lane)); 2110 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2111 } 2112 2113 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2114 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2115 SmallVector<Constant *, 8> ShuffleMask; 2116 for (unsigned i = 0; i < VF; ++i) 2117 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2118 2119 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2120 ConstantVector::get(ShuffleMask), 2121 "reverse"); 2122 } 2123 2124 // Return whether we allow using masked interleave-groups (for dealing with 2125 // strided loads/stores that reside in predicated blocks, or for dealing 2126 // with gaps). 2127 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2128 // If an override option has been passed in for interleaved accesses, use it. 2129 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2130 return EnableMaskedInterleavedMemAccesses; 2131 2132 return TTI.enableMaskedInterleavedAccessVectorization(); 2133 } 2134 2135 // Try to vectorize the interleave group that \p Instr belongs to. 2136 // 2137 // E.g. Translate following interleaved load group (factor = 3): 2138 // for (i = 0; i < N; i+=3) { 2139 // R = Pic[i]; // Member of index 0 2140 // G = Pic[i+1]; // Member of index 1 2141 // B = Pic[i+2]; // Member of index 2 2142 // ... // do something to R, G, B 2143 // } 2144 // To: 2145 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2146 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2147 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2148 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2149 // 2150 // Or translate following interleaved store group (factor = 3): 2151 // for (i = 0; i < N; i+=3) { 2152 // ... 
do something to R, G, B 2153 // Pic[i] = R; // Member of index 0 2154 // Pic[i+1] = G; // Member of index 1 2155 // Pic[i+2] = B; // Member of index 2 2156 // } 2157 // To: 2158 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2159 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2160 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2161 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2162 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2163 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2164 VPTransformState &State, 2165 VPValue *Addr, 2166 VPValue *BlockInMask) { 2167 const InterleaveGroup<Instruction> *Group = 2168 Cost->getInterleavedAccessGroup(Instr); 2169 assert(Group && "Fail to get an interleaved access group."); 2170 2171 // Skip if current instruction is not the insert position. 2172 if (Instr != Group->getInsertPos()) 2173 return; 2174 2175 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2176 2177 // Prepare for the vector type of the interleaved load/store. 2178 Type *ScalarTy = getMemInstValueType(Instr); 2179 unsigned InterleaveFactor = Group->getFactor(); 2180 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2181 2182 // Prepare for the new pointers. 2183 SmallVector<Value *, 2> AddrParts; 2184 unsigned Index = Group->getIndex(Instr); 2185 2186 // TODO: extend the masked interleaved-group support to reversed access. 2187 assert((!BlockInMask || !Group->isReverse()) && 2188 "Reversed masked interleave-group not supported."); 2189 2190 // If the group is reverse, adjust the index to refer to the last vector lane 2191 // instead of the first. We adjust the index from the first vector lane, 2192 // rather than directly getting the pointer for lane VF - 1, because the 2193 // pointer operand of the interleaved access is supposed to be uniform. For 2194 // uniform instructions, we're only required to generate a value for the 2195 // first vector lane in each unroll iteration. 2196 if (Group->isReverse()) 2197 Index += (VF - 1) * Group->getFactor(); 2198 2199 for (unsigned Part = 0; Part < UF; Part++) { 2200 Value *AddrPart = State.get(Addr, {Part, 0}); 2201 setDebugLocFromInst(Builder, AddrPart); 2202 2203 // Notice current instruction could be any index. Need to adjust the address 2204 // to the member of index 0. 2205 // 2206 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2207 // b = A[i]; // Member of index 0 2208 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2209 // 2210 // E.g. A[i+1] = a; // Member of index 1 2211 // A[i] = b; // Member of index 0 2212 // A[i+2] = c; // Member of index 2 (Current instruction) 2213 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2214 2215 bool InBounds = false; 2216 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2217 InBounds = gep->isInBounds(); 2218 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2219 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2220 2221 // Cast to the vector pointer type. 
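// E.g. (illustrative) for VF = 4, factor = 3 and i32 members, this emits
//   %addr.vec = bitcast i32* %addr.part to <12 x i32>*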
2222 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2223 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2224 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2225 } 2226 2227 setDebugLocFromInst(Builder, Instr); 2228 Value *UndefVec = UndefValue::get(VecTy); 2229 2230 Value *MaskForGaps = nullptr; 2231 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2232 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2233 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2234 } 2235 2236 // Vectorize the interleaved load group. 2237 if (isa<LoadInst>(Instr)) { 2238 // For each unroll part, create a wide load for the group. 2239 SmallVector<Value *, 2> NewLoads; 2240 for (unsigned Part = 0; Part < UF; Part++) { 2241 Instruction *NewLoad; 2242 if (BlockInMask || MaskForGaps) { 2243 assert(useMaskedInterleavedAccesses(*TTI) && 2244 "masked interleaved groups are not allowed."); 2245 Value *GroupMask = MaskForGaps; 2246 if (BlockInMask) { 2247 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2248 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2249 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2250 Value *ShuffledMask = Builder.CreateShuffleVector( 2251 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2252 GroupMask = MaskForGaps 2253 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2254 MaskForGaps) 2255 : ShuffledMask; 2256 } 2257 NewLoad = 2258 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2259 GroupMask, UndefVec, "wide.masked.vec"); 2260 } 2261 else 2262 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2263 Group->getAlign(), "wide.vec"); 2264 Group->addMetadata(NewLoad); 2265 NewLoads.push_back(NewLoad); 2266 } 2267 2268 // For each member in the group, shuffle out the appropriate data from the 2269 // wide loads. 2270 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2271 Instruction *Member = Group->getMember(I); 2272 2273 // Skip the gaps in the group. 2274 if (!Member) 2275 continue; 2276 2277 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2278 for (unsigned Part = 0; Part < UF; Part++) { 2279 Value *StridedVec = Builder.CreateShuffleVector( 2280 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2281 2282 // If this member has different type, cast the result type. 2283 if (Member->getType() != ScalarTy) { 2284 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2285 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2286 } 2287 2288 if (Group->isReverse()) 2289 StridedVec = reverseVector(StridedVec); 2290 2291 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2292 } 2293 } 2294 return; 2295 } 2296 2297 // The sub vector type for current instruction. 2298 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2299 2300 // Vectorize the interleaved store group. 2301 for (unsigned Part = 0; Part < UF; Part++) { 2302 // Collect the stored vector from each member. 
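// E.g. (illustrative) for VF = 4 and factor = 2: the two collected member
// vectors are concatenated into an <8 x i32> wide vector and interleaved
// with the mask <0, 4, 1, 5, 2, 6, 3, 7> before the single wide store below.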
2303 SmallVector<Value *, 4> StoredVecs; 2304 for (unsigned i = 0; i < InterleaveFactor; i++) { 2305 // Interleaved store group doesn't allow a gap, so each index has a member 2306 Instruction *Member = Group->getMember(i); 2307 assert(Member && "Fail to get a member from an interleaved store group"); 2308 2309 Value *StoredVec = getOrCreateVectorValue( 2310 cast<StoreInst>(Member)->getValueOperand(), Part); 2311 if (Group->isReverse()) 2312 StoredVec = reverseVector(StoredVec); 2313 2314 // If this member has different type, cast it to a unified type. 2315 2316 if (StoredVec->getType() != SubVT) 2317 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2318 2319 StoredVecs.push_back(StoredVec); 2320 } 2321 2322 // Concatenate all vectors into a wide vector. 2323 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2324 2325 // Interleave the elements in the wide vector. 2326 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2327 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2328 "interleaved.vec"); 2329 2330 Instruction *NewStoreInstr; 2331 if (BlockInMask) { 2332 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2333 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2334 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2335 Value *ShuffledMask = Builder.CreateShuffleVector( 2336 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2337 NewStoreInstr = Builder.CreateMaskedStore( 2338 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2339 } 2340 else 2341 NewStoreInstr = 2342 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2343 2344 Group->addMetadata(NewStoreInstr); 2345 } 2346 } 2347 2348 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2349 VPTransformState &State, 2350 VPValue *Addr, 2351 VPValue *BlockInMask) { 2352 // Attempt to issue a wide load. 2353 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2354 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2355 2356 assert((LI || SI) && "Invalid Load/Store instruction"); 2357 2358 LoopVectorizationCostModel::InstWidening Decision = 2359 Cost->getWideningDecision(Instr, VF); 2360 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2361 "CM decision should be taken at this point"); 2362 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2363 return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); 2364 2365 Type *ScalarDataTy = getMemInstValueType(Instr); 2366 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2367 // An alignment of 0 means target abi alignment. We need to use the scalar's 2368 // target abi alignment in such a case. 2369 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2370 const Align Alignment = 2371 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2372 2373 // Determine if the pointer operand of the access is either consecutive or 2374 // reverse consecutive. 2375 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2376 bool ConsecutiveStride = 2377 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2378 bool CreateGatherScatter = 2379 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2380 2381 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2382 // gather/scatter. Otherwise Decision should have been to Scalarize. 
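// Illustrative shapes of the access emitted below for a load with VF = 4:
//   CM_Widen         -> %wide.load = load <4 x i32>, <4 x i32>* %vec.ptr
//   CM_Widen_Reverse -> the same wide load followed by a reversing
//                       shufflevector
//   CM_GatherScatter -> a call to the llvm.masked.gather intrinsic on a
//                       vector of pointers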
2383 assert((ConsecutiveStride || CreateGatherScatter) && 2384 "The instruction should be scalarized"); 2385 (void)ConsecutiveStride; 2386 2387 VectorParts BlockInMaskParts(UF); 2388 bool isMaskRequired = BlockInMask; 2389 if (isMaskRequired) 2390 for (unsigned Part = 0; Part < UF; ++Part) 2391 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2392 2393 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2394 // Calculate the pointer for the specific unroll-part. 2395 GetElementPtrInst *PartPtr = nullptr; 2396 2397 bool InBounds = false; 2398 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2399 InBounds = gep->isInBounds(); 2400 2401 if (Reverse) { 2402 // If the address is consecutive but reversed, then the 2403 // wide store needs to start at the last vector element. 2404 PartPtr = cast<GetElementPtrInst>( 2405 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2406 PartPtr->setIsInBounds(InBounds); 2407 PartPtr = cast<GetElementPtrInst>( 2408 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2409 PartPtr->setIsInBounds(InBounds); 2410 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2411 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2412 } else { 2413 PartPtr = cast<GetElementPtrInst>( 2414 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2415 PartPtr->setIsInBounds(InBounds); 2416 } 2417 2418 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2419 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2420 }; 2421 2422 // Handle Stores: 2423 if (SI) { 2424 setDebugLocFromInst(Builder, SI); 2425 2426 for (unsigned Part = 0; Part < UF; ++Part) { 2427 Instruction *NewSI = nullptr; 2428 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); 2429 if (CreateGatherScatter) { 2430 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2431 Value *VectorGep = State.get(Addr, Part); 2432 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2433 MaskPart); 2434 } else { 2435 if (Reverse) { 2436 // If we store to reverse consecutive memory locations, then we need 2437 // to reverse the order of elements in the stored value. 2438 StoredVal = reverseVector(StoredVal); 2439 // We don't want to update the value in the map as it might be used in 2440 // another expression. So don't call resetVectorValue(StoredVal). 2441 } 2442 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2443 if (isMaskRequired) 2444 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2445 BlockInMaskParts[Part]); 2446 else 2447 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2448 } 2449 addMetadata(NewSI, SI); 2450 } 2451 return; 2452 } 2453 2454 // Handle loads. 2455 assert(LI && "Must have a load instruction"); 2456 setDebugLocFromInst(Builder, LI); 2457 for (unsigned Part = 0; Part < UF; ++Part) { 2458 Value *NewLI; 2459 if (CreateGatherScatter) { 2460 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2461 Value *VectorGep = State.get(Addr, Part); 2462 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2463 nullptr, "wide.masked.gather"); 2464 addMetadata(NewLI, LI); 2465 } else { 2466 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2467 if (isMaskRequired) 2468 NewLI = Builder.CreateMaskedLoad( 2469 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2470 "wide.masked.load"); 2471 else 2472 NewLI = 2473 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2474 2475 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2476 addMetadata(NewLI, LI); 2477 if (Reverse) 2478 NewLI = reverseVector(NewLI); 2479 } 2480 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2481 } 2482 } 2483 2484 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2485 const VPIteration &Instance, 2486 bool IfPredicateInstr) { 2487 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2488 2489 setDebugLocFromInst(Builder, Instr); 2490 2491 // Does this instruction return a value ? 2492 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2493 2494 Instruction *Cloned = Instr->clone(); 2495 if (!IsVoidRetTy) 2496 Cloned->setName(Instr->getName() + ".cloned"); 2497 2498 // Replace the operands of the cloned instructions with their scalar 2499 // equivalents in the new loop. 2500 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2501 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2502 Cloned->setOperand(op, NewOp); 2503 } 2504 addNewMetadata(Cloned, Instr); 2505 2506 // Place the cloned scalar in the new loop. 2507 Builder.Insert(Cloned); 2508 2509 // Add the cloned scalar to the scalar map entry. 2510 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2511 2512 // If we just cloned a new assumption, add it the assumption cache. 2513 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2514 if (II->getIntrinsicID() == Intrinsic::assume) 2515 AC->registerAssumption(II); 2516 2517 // End if-block. 2518 if (IfPredicateInstr) 2519 PredicatedInstructions.push_back(Cloned); 2520 } 2521 2522 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2523 Value *End, Value *Step, 2524 Instruction *DL) { 2525 BasicBlock *Header = L->getHeader(); 2526 BasicBlock *Latch = L->getLoopLatch(); 2527 // As we're just creating this loop, it's possible no latch exists 2528 // yet. If so, use the header as this will be a single block loop. 2529 if (!Latch) 2530 Latch = Header; 2531 2532 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2533 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2534 setDebugLocFromInst(Builder, OldInst); 2535 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2536 2537 Builder.SetInsertPoint(Latch->getTerminator()); 2538 setDebugLocFromInst(Builder, OldInst); 2539 2540 // Create i+1 and fill the PHINode. 2541 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2542 Induction->addIncoming(Start, L->getLoopPreheader()); 2543 Induction->addIncoming(Next, Latch); 2544 // Create the compare. 2545 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2546 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2547 2548 // Now we have two terminators. Remove the old one from the block. 
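// Once the old terminator is removed below, the generated skeleton is
// roughly (illustrative):
//   %index      = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
//   ...
//   %index.next = add i64 %index, %step
//   %cmp        = icmp eq i64 %index.next, %end
//   br i1 %cmp, label %exit, label %header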
2549 Latch->getTerminator()->eraseFromParent(); 2550 2551 return Induction; 2552 } 2553 2554 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2555 if (TripCount) 2556 return TripCount; 2557 2558 assert(L && "Create Trip Count for null loop."); 2559 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2560 // Find the loop boundaries. 2561 ScalarEvolution *SE = PSE.getSE(); 2562 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2563 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2564 "Invalid loop count"); 2565 2566 Type *IdxTy = Legal->getWidestInductionType(); 2567 assert(IdxTy && "No type for induction"); 2568 2569 // The exit count might have the type of i64 while the phi is i32. This can 2570 // happen if we have an induction variable that is sign extended before the 2571 // compare. The only way that we get a backedge taken count is that the 2572 // induction variable was signed and as such will not overflow. In such a case 2573 // truncation is legal. 2574 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2575 IdxTy->getPrimitiveSizeInBits()) 2576 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2577 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2578 2579 // Get the total trip count from the count by adding 1. 2580 const SCEV *ExitCount = SE->getAddExpr( 2581 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2582 2583 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2584 2585 // Expand the trip count and place the new instructions in the preheader. 2586 // Notice that the pre-header does not change, only the loop body. 2587 SCEVExpander Exp(*SE, DL, "induction"); 2588 2589 // Count holds the overall loop count (N). 2590 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2591 L->getLoopPreheader()->getTerminator()); 2592 2593 if (TripCount->getType()->isPointerTy()) 2594 TripCount = 2595 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2596 L->getLoopPreheader()->getTerminator()); 2597 2598 return TripCount; 2599 } 2600 2601 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2602 if (VectorTripCount) 2603 return VectorTripCount; 2604 2605 Value *TC = getOrCreateTripCount(L); 2606 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2607 2608 Type *Ty = TC->getType(); 2609 Constant *Step = ConstantInt::get(Ty, VF * UF); 2610 2611 // If the tail is to be folded by masking, round the number of iterations N 2612 // up to a multiple of Step instead of rounding down. This is done by first 2613 // adding Step-1 and then rounding down. Note that it's ok if this addition 2614 // overflows: the vector induction variable will eventually wrap to zero given 2615 // that it starts at zero and its Step is a power of two; the loop will then 2616 // exit, with the last early-exit vector comparison also producing all-true. 2617 if (Cost->foldTailByMasking()) { 2618 assert(isPowerOf2_32(VF * UF) && 2619 "VF*UF must be a power of 2 when folding tail by masking"); 2620 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2621 } 2622 2623 // Now we need to generate the expression for the part of the loop that the 2624 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2625 // iterations are not required for correctness, or N - Step, otherwise. 
Step 2626 // is equal to the vectorization factor (number of SIMD elements) times the 2627 // unroll factor (number of SIMD instructions). 2628 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2629 2630 // If there is a non-reversed interleaved group that may speculatively access 2631 // memory out-of-bounds, we need to ensure that there will be at least one 2632 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2633 // the trip count, we set the remainder to be equal to the step. If the step 2634 // does not evenly divide the trip count, no adjustment is necessary since 2635 // there will already be scalar iterations. Note that the minimum iterations 2636 // check ensures that N >= Step. 2637 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2638 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2639 R = Builder.CreateSelect(IsZero, Step, R); 2640 } 2641 2642 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2643 2644 return VectorTripCount; 2645 } 2646 2647 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2648 const DataLayout &DL) { 2649 // Verify that V is a vector type with same number of elements as DstVTy. 2650 unsigned VF = DstVTy->getNumElements(); 2651 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2652 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2653 Type *SrcElemTy = SrcVecTy->getElementType(); 2654 Type *DstElemTy = DstVTy->getElementType(); 2655 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2656 "Vector elements must have same size"); 2657 2658 // Do a direct cast if element types are castable. 2659 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2660 return Builder.CreateBitOrPointerCast(V, DstVTy); 2661 } 2662 // V cannot be directly casted to desired vector type. 2663 // May happen when V is a floating point vector but DstVTy is a vector of 2664 // pointers or vice-versa. Handle this using a two-step bitcast using an 2665 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2666 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2667 "Only one type should be a pointer type"); 2668 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2669 "Only one type should be a floating point type"); 2670 Type *IntTy = 2671 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2672 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2673 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2674 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2675 } 2676 2677 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2678 BasicBlock *Bypass) { 2679 Value *Count = getOrCreateTripCount(L); 2680 // Reuse existing vector loop preheader for TC checks. 2681 // Note that new preheader block is generated for vector loop. 2682 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2683 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2684 2685 // Generate code to check if the loop's trip count is less than VF * UF, or 2686 // equal to it in case a scalar epilogue is required; this implies that the 2687 // vector trip count is zero. This check also covers the case where adding one 2688 // to the backedge-taken count overflowed leading to an incorrect trip count 2689 // of zero. In this case we will also jump to the scalar loop. 2690 auto P = Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE 2691 : ICmpInst::ICMP_ULT; 2692 2693 // If tail is to be folded, vector loop takes care of all iterations. 2694 Value *CheckMinIters = Builder.getFalse(); 2695 if (!Cost->foldTailByMasking()) 2696 CheckMinIters = Builder.CreateICmp( 2697 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2698 "min.iters.check"); 2699 2700 // Create new preheader for vector loop. 2701 LoopVectorPreHeader = 2702 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2703 "vector.ph"); 2704 2705 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2706 DT->getNode(Bypass)->getIDom()) && 2707 "TC check is expected to dominate Bypass"); 2708 2709 // Update dominator for Bypass & LoopExit. 2710 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2711 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2712 2713 ReplaceInstWithInst( 2714 TCCheckBlock->getTerminator(), 2715 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2716 LoopBypassBlocks.push_back(TCCheckBlock); 2717 } 2718 2719 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2720 // Reuse existing vector loop preheader for SCEV checks. 2721 // Note that new preheader block is generated for vector loop. 2722 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2723 2724 // Generate the code to check that the SCEV assumptions that we made. 2725 // We want the new basic block to start at the first instruction in a 2726 // sequence of instructions that form a check. 2727 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2728 "scev.check"); 2729 Value *SCEVCheck = Exp.expandCodeForPredicate( 2730 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2731 2732 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2733 if (C->isZero()) 2734 return; 2735 2736 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2737 "Cannot SCEV check stride or overflow when optimizing for size"); 2738 2739 SCEVCheckBlock->setName("vector.scevcheck"); 2740 // Create new preheader for vector loop. 2741 LoopVectorPreHeader = 2742 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2743 nullptr, "vector.ph"); 2744 2745 // Update dominator only if this is first RT check. 2746 if (LoopBypassBlocks.empty()) { 2747 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2748 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2749 } 2750 2751 ReplaceInstWithInst( 2752 SCEVCheckBlock->getTerminator(), 2753 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2754 LoopBypassBlocks.push_back(SCEVCheckBlock); 2755 AddedSafetyChecks = true; 2756 } 2757 2758 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2759 // VPlan-native path does not do any analysis for runtime checks currently. 2760 if (EnableVPlanNativePath) 2761 return; 2762 2763 // Reuse existing vector loop preheader for runtime memory checks. 2764 // Note that new preheader block is generated for vector loop. 2765 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2766 2767 // Generate the code that checks in runtime if arrays overlap. We put the 2768 // checks into a separate block to make the more common case of few elements 2769 // faster. 
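  // As an illustrative sketch only (names invented here): for two pointer
  // ranges [A, A+NA) and [B, B+NB) that could not be proven disjoint, the
  // emitted check is conceptually
  //   %conflict = and (icmp ult %A, %B.end), (icmp ult %B, %A.end)
  // and the branch created below jumps to the scalar loop (Bypass) when the
  // condition is true, i.e. when the ranges may overlap.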
2770 Instruction *FirstCheckInst; 2771 Instruction *MemRuntimeCheck; 2772 std::tie(FirstCheckInst, MemRuntimeCheck) = 2773 Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); 2774 if (!MemRuntimeCheck) 2775 return; 2776 2777 if (MemCheckBlock->getParent()->hasOptSize()) { 2778 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2779 "Cannot emit memory checks when optimizing for size, unless forced " 2780 "to vectorize."); 2781 ORE->emit([&]() { 2782 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2783 L->getStartLoc(), L->getHeader()) 2784 << "Code-size may be reduced by not forcing " 2785 "vectorization, or by source-code modifications " 2786 "eliminating the need for runtime checks " 2787 "(e.g., adding 'restrict')."; 2788 }); 2789 } 2790 2791 MemCheckBlock->setName("vector.memcheck"); 2792 // Create new preheader for vector loop. 2793 LoopVectorPreHeader = 2794 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2795 "vector.ph"); 2796 2797 // Update dominator only if this is first RT check. 2798 if (LoopBypassBlocks.empty()) { 2799 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2800 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2801 } 2802 2803 ReplaceInstWithInst( 2804 MemCheckBlock->getTerminator(), 2805 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2806 LoopBypassBlocks.push_back(MemCheckBlock); 2807 AddedSafetyChecks = true; 2808 2809 // We currently don't use LoopVersioning for the actual loop cloning but we 2810 // still use it to add the noalias metadata. 2811 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2812 PSE.getSE()); 2813 LVer->prepareNoAliasMetadata(); 2814 } 2815 2816 Value *InnerLoopVectorizer::emitTransformedIndex( 2817 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2818 const InductionDescriptor &ID) const { 2819 2820 SCEVExpander Exp(*SE, DL, "induction"); 2821 auto Step = ID.getStep(); 2822 auto StartValue = ID.getStartValue(); 2823 assert(Index->getType() == Step->getType() && 2824 "Index type does not match StepValue type"); 2825 2826 // Note: the IR at this point is broken. We cannot use SE to create any new 2827 // SCEV and then expand it, hoping that SCEV's simplification will give us 2828 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2829 // lead to various SCEV crashes. So all we can do is to use builder and rely 2830 // on InstCombine for future simplifications. Here we handle some trivial 2831 // cases only. 
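  // For example, the helper lambdas below fold the trivial identities
  // directly: CreateAdd(0, Y) returns Y and CreateMul(1, Y) returns Y, so for
  // a canonical induction (StartValue = %start, Step = 1) the transformed
  // index is emitted as a single "add %start, %index" rather than
  // "add %start, (mul %index, 1)".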
2832 auto CreateAdd = [&B](Value *X, Value *Y) { 2833 assert(X->getType() == Y->getType() && "Types don't match!"); 2834 if (auto *CX = dyn_cast<ConstantInt>(X)) 2835 if (CX->isZero()) 2836 return Y; 2837 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2838 if (CY->isZero()) 2839 return X; 2840 return B.CreateAdd(X, Y); 2841 }; 2842 2843 auto CreateMul = [&B](Value *X, Value *Y) { 2844 assert(X->getType() == Y->getType() && "Types don't match!"); 2845 if (auto *CX = dyn_cast<ConstantInt>(X)) 2846 if (CX->isOne()) 2847 return Y; 2848 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2849 if (CY->isOne()) 2850 return X; 2851 return B.CreateMul(X, Y); 2852 }; 2853 2854 switch (ID.getKind()) { 2855 case InductionDescriptor::IK_IntInduction: { 2856 assert(Index->getType() == StartValue->getType() && 2857 "Index type does not match StartValue type"); 2858 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2859 return B.CreateSub(StartValue, Index); 2860 auto *Offset = CreateMul( 2861 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2862 return CreateAdd(StartValue, Offset); 2863 } 2864 case InductionDescriptor::IK_PtrInduction: { 2865 assert(isa<SCEVConstant>(Step) && 2866 "Expected constant step for pointer induction"); 2867 return B.CreateGEP( 2868 StartValue->getType()->getPointerElementType(), StartValue, 2869 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2870 &*B.GetInsertPoint()))); 2871 } 2872 case InductionDescriptor::IK_FpInduction: { 2873 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2874 auto InductionBinOp = ID.getInductionBinOp(); 2875 assert(InductionBinOp && 2876 (InductionBinOp->getOpcode() == Instruction::FAdd || 2877 InductionBinOp->getOpcode() == Instruction::FSub) && 2878 "Original bin op should be defined for FP induction"); 2879 2880 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2881 2882 // Floating point operations had to be 'fast' to enable the induction. 2883 FastMathFlags Flags; 2884 Flags.setFast(); 2885 2886 Value *MulExp = B.CreateFMul(StepValue, Index); 2887 if (isa<Instruction>(MulExp)) 2888 // We have to check, the MulExp may be a constant. 2889 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2890 2891 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2892 "induction"); 2893 if (isa<Instruction>(BOp)) 2894 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2895 2896 return BOp; 2897 } 2898 case InductionDescriptor::IK_NoInduction: 2899 return nullptr; 2900 } 2901 llvm_unreachable("invalid enum"); 2902 } 2903 2904 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2905 /* 2906 In this function we generate a new loop. The new loop will contain 2907 the vectorized instructions while the old loop will continue to run the 2908 scalar remainder. 2909 2910 [ ] <-- loop iteration number check. 2911 / | 2912 / v 2913 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2914 | / | 2915 | / v 2916 || [ ] <-- vector pre header. 2917 |/ | 2918 | v 2919 | [ ] \ 2920 | [ ]_| <-- vector loop. 2921 | | 2922 | v 2923 | -[ ] <--- middle-block. 2924 | / | 2925 | / v 2926 -|- >[ ] <--- new preheader. 2927 | | 2928 | v 2929 | [ ] \ 2930 | [ ]_| <-- old scalar loop to handle remainder. 2931 \ | 2932 \ v 2933 >[ ] <-- exit block. 2934 ... 2935 */ 2936 2937 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2938 2939 // Some loops have a single integer induction variable, while other loops 2940 // don't. 
One example is c++ iterators that often have multiple pointer 2941 // induction variables. In the code below we also support a case where we 2942 // don't have a single induction variable. 2943 // 2944 // We try to obtain an induction variable from the original loop as hard 2945 // as possible. However if we don't find one that: 2946 // - is an integer 2947 // - counts from zero, stepping by one 2948 // - is the size of the widest induction variable type 2949 // then we create a new one. 2950 OldInduction = Legal->getPrimaryInduction(); 2951 Type *IdxTy = Legal->getWidestInductionType(); 2952 2953 // Split the single block loop into the two loop structure described above. 2954 LoopScalarBody = OrigLoop->getHeader(); 2955 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2956 LoopExitBlock = OrigLoop->getExitBlock(); 2957 assert(LoopExitBlock && "Must have an exit block"); 2958 assert(LoopVectorPreHeader && "Invalid loop structure"); 2959 2960 LoopMiddleBlock = 2961 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2962 LI, nullptr, "middle.block"); 2963 LoopScalarPreHeader = 2964 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2965 nullptr, "scalar.ph"); 2966 // We intentionally don't let SplitBlock to update LoopInfo since 2967 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2968 // LoopVectorBody is explicitly added to the correct place few lines later. 2969 LoopVectorBody = 2970 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2971 nullptr, nullptr, "vector.body"); 2972 2973 // Update dominator for loop exit. 2974 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2975 2976 // Create and register the new vector loop. 2977 Loop *Lp = LI->AllocateLoop(); 2978 Loop *ParentLoop = OrigLoop->getParentLoop(); 2979 2980 // Insert the new loop into the loop nest and register the new basic blocks 2981 // before calling any utilities such as SCEV that require valid LoopInfo. 2982 if (ParentLoop) { 2983 ParentLoop->addChildLoop(Lp); 2984 } else { 2985 LI->addTopLevelLoop(Lp); 2986 } 2987 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 2988 2989 // Find the loop boundaries. 2990 Value *Count = getOrCreateTripCount(Lp); 2991 2992 Value *StartIdx = ConstantInt::get(IdxTy, 0); 2993 2994 // Now, compare the new count to zero. If it is zero skip the vector loop and 2995 // jump to the scalar loop. This check also covers the case where the 2996 // backedge-taken count is uint##_max: adding one to it will overflow leading 2997 // to an incorrect trip count of zero. In this (rare) case we will also jump 2998 // to the scalar loop. 2999 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3000 3001 // Generate the code to check any assumptions that we've made for SCEV 3002 // expressions. 3003 emitSCEVChecks(Lp, LoopScalarPreHeader); 3004 3005 // Generate the code that checks in runtime if arrays overlap. We put the 3006 // checks into a separate block to make the more common case of few elements 3007 // faster. 3008 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3009 3010 // Generate the induction variable. 3011 // The loop step is equal to the vectorization factor (num of SIMD elements) 3012 // times the unroll factor (num of SIMD instructions). 
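  // For example, with VF = 4 and UF = 2 the induction created below counts
  // 0, 8, 16, ...; in shorthand IR (illustrative names):
  //   %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //   %index.next = add i64 %index, 8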
3013 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3014 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3015 Induction = 3016 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3017 getDebugLocFromInstOrOperands(OldInduction)); 3018 3019 // We are going to resume the execution of the scalar loop. 3020 // Go over all of the induction variables that we found and fix the 3021 // PHIs that are left in the scalar version of the loop. 3022 // The starting values of PHI nodes depend on the counter of the last 3023 // iteration in the vectorized loop. 3024 // If we come from a bypass edge then we need to start from the original 3025 // start value. 3026 3027 // This variable saves the new starting index for the scalar loop. It is used 3028 // to test if there are any tail iterations left once the vector loop has 3029 // completed. 3030 for (auto &InductionEntry : Legal->getInductionVars()) { 3031 PHINode *OrigPhi = InductionEntry.first; 3032 InductionDescriptor II = InductionEntry.second; 3033 3034 // Create phi nodes to merge from the backedge-taken check block. 3035 PHINode *BCResumeVal = 3036 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3037 LoopScalarPreHeader->getTerminator()); 3038 // Copy original phi DL over to the new one. 3039 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3040 Value *&EndValue = IVEndValues[OrigPhi]; 3041 if (OrigPhi == OldInduction) { 3042 // We know what the end value is. 3043 EndValue = CountRoundDown; 3044 } else { 3045 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3046 Type *StepType = II.getStep()->getType(); 3047 Instruction::CastOps CastOp = 3048 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3049 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3050 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3051 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3052 EndValue->setName("ind.end"); 3053 } 3054 3055 // The new PHI merges the original incoming value, in case of a bypass, 3056 // or the value at the end of the vectorized loop. 3057 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3058 3059 // Fix the scalar body counter (PHI node). 3060 // The old induction's phi node in the scalar body needs the truncated 3061 // value. 3062 for (BasicBlock *BB : LoopBypassBlocks) 3063 BCResumeVal->addIncoming(II.getStartValue(), BB); 3064 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3065 } 3066 3067 // We need the OrigLoop (scalar loop part) latch terminator to help 3068 // produce correct debug info for the middle block BB instructions. 3069 // The legality check stage guarantees that the loop will have a single 3070 // latch. 3071 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3072 "Scalar loop latch terminator isn't a branch"); 3073 BranchInst *ScalarLatchBr = 3074 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3075 3076 // Add a check in the middle block to see if we have completed 3077 // all of the iterations in the first vector loop. 3078 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3079 // If tail is to be folded, we know we don't need to run the remainder. 
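  // In shorthand (illustrative names), for trip count %N and vector trip
  // count %n.vec the middle-block check emitted below is:
  //   %cmp.n = icmp eq i64 %N, %n.vec
  //   br i1 %cmp.n, label %exit.block, label %scalar.ph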
3080 Value *CmpN = Builder.getTrue(); 3081 if (!Cost->foldTailByMasking()) { 3082 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3083 CountRoundDown, "cmp.n", 3084 LoopMiddleBlock->getTerminator()); 3085 3086 // Here we use the same DebugLoc as the scalar loop latch branch instead 3087 // of the corresponding compare because they may have ended up with 3088 // different line numbers and we want to avoid awkward line stepping while 3089 // debugging. Eg. if the compare has got a line number inside the loop. 3090 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3091 } 3092 3093 BranchInst *BrInst = 3094 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3095 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3096 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3097 3098 // Get ready to start creating new instructions into the vectorized body. 3099 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3100 "Inconsistent vector loop preheader"); 3101 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3102 3103 Optional<MDNode *> VectorizedLoopID = 3104 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3105 LLVMLoopVectorizeFollowupVectorized}); 3106 if (VectorizedLoopID.hasValue()) { 3107 Lp->setLoopID(VectorizedLoopID.getValue()); 3108 3109 // Do not setAlreadyVectorized if loop attributes have been defined 3110 // explicitly. 3111 return LoopVectorPreHeader; 3112 } 3113 3114 // Keep all loop hints from the original loop on the vector loop (we'll 3115 // replace the vectorizer-specific hints below). 3116 if (MDNode *LID = OrigLoop->getLoopID()) 3117 Lp->setLoopID(LID); 3118 3119 LoopVectorizeHints Hints(Lp, true, *ORE); 3120 Hints.setAlreadyVectorized(); 3121 3122 #ifdef EXPENSIVE_CHECKS 3123 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3124 LI->verify(*DT); 3125 #endif 3126 3127 return LoopVectorPreHeader; 3128 } 3129 3130 // Fix up external users of the induction variable. At this point, we are 3131 // in LCSSA form, with all external PHIs that use the IV having one input value, 3132 // coming from the remainder loop. We need those PHIs to also have a correct 3133 // value for the IV when arriving directly from the middle block. 3134 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3135 const InductionDescriptor &II, 3136 Value *CountRoundDown, Value *EndValue, 3137 BasicBlock *MiddleBlock) { 3138 // There are two kinds of external IV usages - those that use the value 3139 // computed in the last iteration (the PHI) and those that use the penultimate 3140 // value (the value that feeds into the phi from the loop latch). 3141 // We allow both, but they, obviously, have different values. 3142 3143 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3144 3145 DenseMap<Value *, Value *> MissingVals; 3146 3147 // An external user of the last iteration's value should see the value that 3148 // the remainder loop uses to initialize its own IV. 3149 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3150 for (User *U : PostInc->users()) { 3151 Instruction *UI = cast<Instruction>(U); 3152 if (!OrigLoop->contains(UI)) { 3153 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3154 MissingVals[UI] = EndValue; 3155 } 3156 } 3157 3158 // An external user of the penultimate value need to see EndValue - Step. 3159 // The simplest way to get this is to recompute it from the constituent SCEVs, 3160 // that is Start + (Step * (CRD - 1)). 
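  // As a small worked example: for an IV with Start = 3, Step = 2 and a
  // vector trip count CRD = 8, the escaping penultimate value computed below
  // is 3 + 2 * (8 - 1) = 17, whereas EndValue (the last value) is
  // 3 + 2 * 8 = 19.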
3161 for (User *U : OrigPhi->users()) { 3162 auto *UI = cast<Instruction>(U); 3163 if (!OrigLoop->contains(UI)) { 3164 const DataLayout &DL = 3165 OrigLoop->getHeader()->getModule()->getDataLayout(); 3166 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3167 3168 IRBuilder<> B(MiddleBlock->getTerminator()); 3169 Value *CountMinusOne = B.CreateSub( 3170 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3171 Value *CMO = 3172 !II.getStep()->getType()->isIntegerTy() 3173 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3174 II.getStep()->getType()) 3175 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3176 CMO->setName("cast.cmo"); 3177 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3178 Escape->setName("ind.escape"); 3179 MissingVals[UI] = Escape; 3180 } 3181 } 3182 3183 for (auto &I : MissingVals) { 3184 PHINode *PHI = cast<PHINode>(I.first); 3185 // One corner case we have to handle is two IVs "chasing" each-other, 3186 // that is %IV2 = phi [...], [ %IV1, %latch ] 3187 // In this case, if IV1 has an external use, we need to avoid adding both 3188 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3189 // don't already have an incoming value for the middle block. 3190 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3191 PHI->addIncoming(I.second, MiddleBlock); 3192 } 3193 } 3194 3195 namespace { 3196 3197 struct CSEDenseMapInfo { 3198 static bool canHandle(const Instruction *I) { 3199 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3200 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3201 } 3202 3203 static inline Instruction *getEmptyKey() { 3204 return DenseMapInfo<Instruction *>::getEmptyKey(); 3205 } 3206 3207 static inline Instruction *getTombstoneKey() { 3208 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3209 } 3210 3211 static unsigned getHashValue(const Instruction *I) { 3212 assert(canHandle(I) && "Unknown instruction!"); 3213 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3214 I->value_op_end())); 3215 } 3216 3217 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3218 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3219 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3220 return LHS == RHS; 3221 return LHS->isIdenticalTo(RHS); 3222 } 3223 }; 3224 3225 } // end anonymous namespace 3226 3227 ///Perform cse of induction variable instructions. 3228 static void cse(BasicBlock *BB) { 3229 // Perform simple cse. 3230 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3231 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3232 Instruction *In = &*I++; 3233 3234 if (!CSEDenseMapInfo::canHandle(In)) 3235 continue; 3236 3237 // Check if we can replace this instruction with any of the 3238 // visited instructions. 3239 if (Instruction *V = CSEMap.lookup(In)) { 3240 In->replaceAllUsesWith(V); 3241 In->eraseFromParent(); 3242 continue; 3243 } 3244 3245 CSEMap[In] = In; 3246 } 3247 } 3248 3249 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3250 unsigned VF, 3251 bool &NeedToScalarize) { 3252 Function *F = CI->getCalledFunction(); 3253 Type *ScalarRetTy = CI->getType(); 3254 SmallVector<Type *, 4> Tys, ScalarTys; 3255 for (auto &ArgOp : CI->arg_operands()) 3256 ScalarTys.push_back(ArgOp->getType()); 3257 3258 // Estimate cost of scalarized vector call. 
The source operands are assumed 3259 // to be vectors, so we need to extract individual elements from there, 3260 // execute VF scalar calls, and then gather the result into the vector return 3261 // value. 3262 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3263 if (VF == 1) 3264 return ScalarCallCost; 3265 3266 // Compute corresponding vector type for return value and arguments. 3267 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3268 for (Type *ScalarTy : ScalarTys) 3269 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3270 3271 // Compute costs of unpacking argument values for the scalar calls and 3272 // packing the return values to a vector. 3273 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3274 3275 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3276 3277 // If we can't emit a vector call for this function, then the currently found 3278 // cost is the cost we need to return. 3279 NeedToScalarize = true; 3280 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3281 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3282 3283 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3284 return Cost; 3285 3286 // If the corresponding vector cost is cheaper, return its cost. 3287 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3288 if (VectorCallCost < Cost) { 3289 NeedToScalarize = false; 3290 return VectorCallCost; 3291 } 3292 return Cost; 3293 } 3294 3295 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3296 unsigned VF) { 3297 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3298 assert(ID && "Expected intrinsic call!"); 3299 3300 FastMathFlags FMF; 3301 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3302 FMF = FPMO->getFastMathFlags(); 3303 3304 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3305 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI); 3306 } 3307 3308 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3309 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3310 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3311 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3312 } 3313 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3314 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3315 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3316 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3317 } 3318 3319 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3320 // For every instruction `I` in MinBWs, truncate the operands, create a 3321 // truncated version of `I` and reextend its result. InstCombine runs 3322 // later and will remove any ext/trunc pairs. 3323 SmallPtrSet<Value *, 4> Erased; 3324 for (const auto &KV : Cost->getMinimalBitwidths()) { 3325 // If the value wasn't vectorized, we must maintain the original scalar 3326 // type. The absence of the value from VectorLoopValueMap indicates that it 3327 // wasn't vectorized. 
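    // For the values that were vectorized, the rewrite below is, in shorthand
    // (assuming VF = 4 and a minimal bit width of 8 for an i32 add):
    //   %a.tr = trunc <4 x i32> %a to <4 x i8>
    //   %b.tr = trunc <4 x i32> %b to <4 x i8>
    //   %add  = add <4 x i8> %a.tr, %b.tr
    //   %res  = zext <4 x i8> %add to <4 x i32>
    // with redundant ext/trunc pairs left for InstCombine to clean up.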
3328 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3329 continue; 3330 for (unsigned Part = 0; Part < UF; ++Part) { 3331 Value *I = getOrCreateVectorValue(KV.first, Part); 3332 if (Erased.find(I) != Erased.end() || I->use_empty() || 3333 !isa<Instruction>(I)) 3334 continue; 3335 Type *OriginalTy = I->getType(); 3336 Type *ScalarTruncatedTy = 3337 IntegerType::get(OriginalTy->getContext(), KV.second); 3338 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3339 OriginalTy->getVectorNumElements()); 3340 if (TruncatedTy == OriginalTy) 3341 continue; 3342 3343 IRBuilder<> B(cast<Instruction>(I)); 3344 auto ShrinkOperand = [&](Value *V) -> Value * { 3345 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3346 if (ZI->getSrcTy() == TruncatedTy) 3347 return ZI->getOperand(0); 3348 return B.CreateZExtOrTrunc(V, TruncatedTy); 3349 }; 3350 3351 // The actual instruction modification depends on the instruction type, 3352 // unfortunately. 3353 Value *NewI = nullptr; 3354 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3355 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3356 ShrinkOperand(BO->getOperand(1))); 3357 3358 // Any wrapping introduced by shrinking this operation shouldn't be 3359 // considered undefined behavior. So, we can't unconditionally copy 3360 // arithmetic wrapping flags to NewI. 3361 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3362 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3363 NewI = 3364 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3365 ShrinkOperand(CI->getOperand(1))); 3366 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3367 NewI = B.CreateSelect(SI->getCondition(), 3368 ShrinkOperand(SI->getTrueValue()), 3369 ShrinkOperand(SI->getFalseValue())); 3370 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3371 switch (CI->getOpcode()) { 3372 default: 3373 llvm_unreachable("Unhandled cast!"); 3374 case Instruction::Trunc: 3375 NewI = ShrinkOperand(CI->getOperand(0)); 3376 break; 3377 case Instruction::SExt: 3378 NewI = B.CreateSExtOrTrunc( 3379 CI->getOperand(0), 3380 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3381 break; 3382 case Instruction::ZExt: 3383 NewI = B.CreateZExtOrTrunc( 3384 CI->getOperand(0), 3385 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3386 break; 3387 } 3388 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3389 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3390 auto *O0 = B.CreateZExtOrTrunc( 3391 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3392 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3393 auto *O1 = B.CreateZExtOrTrunc( 3394 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3395 3396 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3397 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3398 // Don't do anything with the operands, just extend the result. 
3399 continue; 3400 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3401 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3402 auto *O0 = B.CreateZExtOrTrunc( 3403 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3404 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3405 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3406 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3407 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3408 auto *O0 = B.CreateZExtOrTrunc( 3409 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3410 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3411 } else { 3412 // If we don't know what to do, be conservative and don't do anything. 3413 continue; 3414 } 3415 3416 // Lastly, extend the result. 3417 NewI->takeName(cast<Instruction>(I)); 3418 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3419 I->replaceAllUsesWith(Res); 3420 cast<Instruction>(I)->eraseFromParent(); 3421 Erased.insert(I); 3422 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3423 } 3424 } 3425 3426 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3427 for (const auto &KV : Cost->getMinimalBitwidths()) { 3428 // If the value wasn't vectorized, we must maintain the original scalar 3429 // type. The absence of the value from VectorLoopValueMap indicates that it 3430 // wasn't vectorized. 3431 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3432 continue; 3433 for (unsigned Part = 0; Part < UF; ++Part) { 3434 Value *I = getOrCreateVectorValue(KV.first, Part); 3435 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3436 if (Inst && Inst->use_empty()) { 3437 Value *NewI = Inst->getOperand(0); 3438 Inst->eraseFromParent(); 3439 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3440 } 3441 } 3442 } 3443 } 3444 3445 void InnerLoopVectorizer::fixVectorizedLoop() { 3446 // Insert truncates and extends for any truncated instructions as hints to 3447 // InstCombine. 3448 if (VF > 1) 3449 truncateToMinimalBitwidths(); 3450 3451 // Fix widened non-induction PHIs by setting up the PHI operands. 3452 if (OrigPHIsToFix.size()) { 3453 assert(EnableVPlanNativePath && 3454 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3455 fixNonInductionPHIs(); 3456 } 3457 3458 // At this point every instruction in the original loop is widened to a 3459 // vector form. Now we need to fix the recurrences in the loop. These PHI 3460 // nodes are currently empty because we did not want to introduce cycles. 3461 // This is the second stage of vectorizing recurrences. 3462 fixCrossIterationPHIs(); 3463 3464 // Forget the original basic block. 3465 PSE.getSE()->forgetLoop(OrigLoop); 3466 3467 // Fix-up external users of the induction variables. 3468 for (auto &Entry : Legal->getInductionVars()) 3469 fixupIVUsers(Entry.first, Entry.second, 3470 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3471 IVEndValues[Entry.first], LoopMiddleBlock); 3472 3473 fixLCSSAPHIs(); 3474 for (Instruction *PI : PredicatedInstructions) 3475 sinkScalarOperands(&*PI); 3476 3477 // Remove redundant induction instructions. 3478 cse(LoopVectorBody); 3479 3480 // Set/update profile weights for the vector and remainder loops as original 3481 // loop iterations are now distributed among them. Note that original loop 3482 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3483 //
3484 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3485 // end up getting a slightly roughened result but that should be OK since
3486 // profile is not inherently precise anyway. Note also that a possible bypass
3487 // of vector code caused by legality checks is ignored, assigning all the
3488 // weight to the vector loop, optimistically.
3489 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3490 LI->getLoopFor(LoopVectorBody),
3491 LI->getLoopFor(LoopScalarBody), VF * UF);
3492 }
3493 
3494 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3495 // In order to support recurrences we need to be able to vectorize Phi nodes.
3496 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3497 // stage #2: We now need to fix the recurrences by adding incoming edges to
3498 // the currently empty PHI nodes. At this point every instruction in the
3499 // original loop is widened to a vector form so we can use them to construct
3500 // the incoming edges.
3501 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3502 // Handle first-order recurrences and reductions that need to be fixed.
3503 if (Legal->isFirstOrderRecurrence(&Phi))
3504 fixFirstOrderRecurrence(&Phi);
3505 else if (Legal->isReductionVariable(&Phi))
3506 fixReduction(&Phi);
3507 }
3508 }
3509 
3510 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3511 // This is the second phase of vectorizing first-order recurrences. An
3512 // overview of the transformation is described below. Suppose we have the
3513 // following loop.
3514 //
3515 // for (int i = 0; i < n; ++i)
3516 // b[i] = a[i] - a[i - 1];
3517 //
3518 // There is a first-order recurrence on "a". For this loop, the shorthand
3519 // scalar IR looks like:
3520 //
3521 // scalar.ph:
3522 // s_init = a[-1]
3523 // br scalar.body
3524 //
3525 // scalar.body:
3526 // i = phi [0, scalar.ph], [i+1, scalar.body]
3527 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3528 // s2 = a[i]
3529 // b[i] = s2 - s1
3530 // br cond, scalar.body, ...
3531 //
3532 // In this example, s1 is a recurrence because its value depends on the
3533 // previous iteration. In the first phase of vectorization, we created a
3534 // temporary value for s1. We now complete the vectorization and produce the
3535 // shorthand vector IR shown below (for VF = 4, UF = 1).
3536 //
3537 // vector.ph:
3538 // v_init = vector(..., ..., ..., a[-1])
3539 // br vector.body
3540 //
3541 // vector.body
3542 // i = phi [0, vector.ph], [i+4, vector.body]
3543 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3544 // v2 = a[i, i+1, i+2, i+3];
3545 // v3 = vector(v1(3), v2(0, 1, 2))
3546 // b[i, i+1, i+2, i+3] = v2 - v3
3547 // br cond, vector.body, middle.block
3548 //
3549 // middle.block:
3550 // x = v2(3)
3551 // br scalar.ph
3552 //
3553 // scalar.ph:
3554 // s_init = phi [x, middle.block], [a[-1], otherwise]
3555 // br scalar.body
3556 //
3557 // After the vector loop finishes executing, we extract the next value of
3558 // the recurrence (x) to use as the initial value in the scalar loop.
3559 
3560 // Get the original loop preheader and single loop latch.
3561 auto *Preheader = OrigLoop->getLoopPreheader();
3562 auto *Latch = OrigLoop->getLoopLatch();
3563 
3564 // Get the initial and previous values of the scalar recurrence.
3565 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3566 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3567 
3568 // Create a vector from the initial value.
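  // For example, with VF = 4 the code below emits, in shorthand:
  //   %vector.recur.init = insertelement <4 x T> undef, T %s_init, i32 3
  // i.e. the scalar initial value is placed in the last vector lane.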
3569 auto *VectorInit = ScalarInit; 3570 if (VF > 1) { 3571 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3572 VectorInit = Builder.CreateInsertElement( 3573 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3574 Builder.getInt32(VF - 1), "vector.recur.init"); 3575 } 3576 3577 // We constructed a temporary phi node in the first phase of vectorization. 3578 // This phi node will eventually be deleted. 3579 Builder.SetInsertPoint( 3580 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3581 3582 // Create a phi node for the new recurrence. The current value will either be 3583 // the initial value inserted into a vector or loop-varying vector value. 3584 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3585 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3586 3587 // Get the vectorized previous value of the last part UF - 1. It appears last 3588 // among all unrolled iterations, due to the order of their construction. 3589 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3590 3591 // Find and set the insertion point after the previous value if it is an 3592 // instruction. 3593 BasicBlock::iterator InsertPt; 3594 // Note that the previous value may have been constant-folded so it is not 3595 // guaranteed to be an instruction in the vector loop. 3596 // FIXME: Loop invariant values do not form recurrences. We should deal with 3597 // them earlier. 3598 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3599 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3600 else { 3601 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3602 if (isa<PHINode>(PreviousLastPart)) 3603 // If the previous value is a phi node, we should insert after all the phi 3604 // nodes in the block containing the PHI to avoid breaking basic block 3605 // verification. Note that the basic block may be different to 3606 // LoopVectorBody, in case we predicate the loop. 3607 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3608 else 3609 InsertPt = ++PreviousInst->getIterator(); 3610 } 3611 Builder.SetInsertPoint(&*InsertPt); 3612 3613 // We will construct a vector for the recurrence by combining the values for 3614 // the current and previous iterations. This is the required shuffle mask. 3615 SmallVector<Constant *, 8> ShuffleMask(VF); 3616 ShuffleMask[0] = Builder.getInt32(VF - 1); 3617 for (unsigned I = 1; I < VF; ++I) 3618 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3619 3620 // The vector from which to take the initial value for the current iteration 3621 // (actual or unrolled). Initially, this is the vector phi node. 3622 Value *Incoming = VecPhi; 3623 3624 // Shuffle the current and previous vector and update the vector parts. 3625 for (unsigned Part = 0; Part < UF; ++Part) { 3626 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3627 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3628 auto *Shuffle = 3629 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3630 ConstantVector::get(ShuffleMask)) 3631 : Incoming; 3632 PhiPart->replaceAllUsesWith(Shuffle); 3633 cast<Instruction>(PhiPart)->eraseFromParent(); 3634 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3635 Incoming = PreviousPart; 3636 } 3637 3638 // Fix the latch value of the new recurrence in the vector loop. 3639 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3640 3641 // Extract the last vector element in the middle block. 
This will be the
3642 // initial value for the recurrence when jumping to the scalar loop.
3643 auto *ExtractForScalar = Incoming;
3644 if (VF > 1) {
3645 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3646 ExtractForScalar = Builder.CreateExtractElement(
3647 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3648 }
3649 // Extract the second last element in the middle block if the
3650 // Phi is used outside the loop. We need to extract the phi itself
3651 // and not the last element (the phi update in the current iteration). This
3652 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3653 // when the scalar loop is not run at all.
3654 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3655 if (VF > 1)
3656 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3657 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3658 // When the loop is unrolled without vectorizing, initialize
3659 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3660 // value of `Incoming`. This is analogous to the vectorized case above:
3661 // extracting the second last element when VF > 1.
3662 else if (UF > 1)
3663 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3664 
3665 // Fix the initial value of the original recurrence in the scalar loop.
3666 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3667 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3668 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3669 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3670 Start->addIncoming(Incoming, BB);
3671 }
3672 
3673 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3674 Phi->setName("scalar.recur");
3675 
3676 // Finally, fix users of the recurrence outside the loop. The users will need
3677 // either the last value of the scalar recurrence or the last value of the
3678 // vector recurrence we extracted in the middle block. Since the loop is in
3679 // LCSSA form, we just need to find all the phi nodes for the original scalar
3680 // recurrence in the exit block, and then add an edge for the middle block.
3681 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3682 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3683 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3684 }
3685 }
3686 }
3687 
3688 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3689 Constant *Zero = Builder.getInt32(0);
3690 
3691 // Get its reduction variable descriptor.
3692 assert(Legal->isReductionVariable(Phi) &&
3693 "Unable to find the reduction variable");
3694 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3695 
3696 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3697 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3698 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3699 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3700 RdxDesc.getMinMaxRecurrenceKind();
3701 setDebugLocFromInst(Builder, ReductionStartValue);
3702 
3703 // We need to generate a reduction vector from the incoming scalar.
3704 // To do so, we need to generate the 'identity' vector and override
3705 // one of the elements with the incoming scalar reduction. We need
3706 // to do it in the vector-loop preheader.
3707 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3708 
3709 // This is the vector-clone of the value that leaves the loop.
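  // As an illustration of the code below: for an integer add reduction with
  // VF = 4 and start value %init, the identity vector is <0, 0, 0, 0> and the
  // vector start value is <%init, 0, 0, 0>; for a min/max reduction both are
  // simply splats of %init.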
3710 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3711 
3712 // Find the reduction identity value: zero for addition, or and xor;
3713 // one for multiplication; -1 for and.
3714 Value *Identity;
3715 Value *VectorStart;
3716 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3717 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3718 // MinMax reductions have the start value as their identity.
3719 if (VF == 1) {
3720 VectorStart = Identity = ReductionStartValue;
3721 } else {
3722 VectorStart = Identity =
3723 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3724 }
3725 } else {
3726 // Handle other reduction kinds:
3727 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3728 RK, VecTy->getScalarType());
3729 if (VF == 1) {
3730 Identity = Iden;
3731 // This vector is the Identity vector where the first element is the
3732 // incoming scalar reduction.
3733 VectorStart = ReductionStartValue;
3734 } else {
3735 Identity = ConstantVector::getSplat({VF, false}, Iden);
3736 
3737 // This vector is the Identity vector where the first element is the
3738 // incoming scalar reduction.
3739 VectorStart =
3740 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3741 }
3742 }
3743 
3744 // Wrap flags are in general invalid after vectorization, clear them.
3745 clearReductionWrapFlags(RdxDesc);
3746 
3747 // Fix the vector-loop phi.
3748 
3749 // Reductions do not have to start at zero. They can start with
3750 // any loop invariant values.
3751 BasicBlock *Latch = OrigLoop->getLoopLatch();
3752 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3753 
3754 for (unsigned Part = 0; Part < UF; ++Part) {
3755 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3756 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3757 // Make sure to add the reduction start value only to the
3758 // first unroll part.
3759 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3760 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3761 cast<PHINode>(VecRdxPhi)
3762 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3763 }
3764 
3765 // Before each round, move the insertion point right between
3766 // the PHIs and the values we are going to write.
3767 // This allows us to write both PHINodes and the extractelement
3768 // instructions.
3769 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3770 
3771 setDebugLocFromInst(Builder, LoopExitInst);
3772 
3773 // If the tail is folded by masking, the vector value leaving the loop should
3774 // be a Select choosing between the vectorized LoopExitInst and the vectorized
3775 // Phi, rather than the LoopExitInst itself.
3776 if (Cost->foldTailByMasking()) {
3777 for (unsigned Part = 0; Part < UF; ++Part) {
3778 Value *VecLoopExitInst =
3779 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3780 Value *Sel = nullptr;
3781 for (User *U : VecLoopExitInst->users()) {
3782 if (isa<SelectInst>(U)) {
3783 assert(!Sel && "Reduction exit feeding two selects");
3784 Sel = U;
3785 } else
3786 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3787 }
3788 assert(Sel && "Reduction exit feeds no select");
3789 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3790 }
3791 }
3792 
3793 // If the vector reduction can be performed in a smaller type, we truncate
3794 // then extend the loop exit value to enable InstCombine to evaluate the
3795 // entire expression in the smaller type.
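  // For example, if an i32 add reduction is known to need only 8 bits and
  // VF = 4, the loop-exit value is rewritten roughly as:
  //   %trunc = trunc <4 x i32> %rdx to <4 x i8>
  //   %extnd = sext or zext <4 x i8> %trunc to <4 x i32>
  // and all users of %rdx other than %trunc are redirected to %extnd.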
3796 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3797 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3798 Builder.SetInsertPoint( 3799 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3800 VectorParts RdxParts(UF); 3801 for (unsigned Part = 0; Part < UF; ++Part) { 3802 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3803 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3804 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3805 : Builder.CreateZExt(Trunc, VecTy); 3806 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3807 UI != RdxParts[Part]->user_end();) 3808 if (*UI != Trunc) { 3809 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3810 RdxParts[Part] = Extnd; 3811 } else { 3812 ++UI; 3813 } 3814 } 3815 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3816 for (unsigned Part = 0; Part < UF; ++Part) { 3817 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3818 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3819 } 3820 } 3821 3822 // Reduce all of the unrolled parts into a single vector. 3823 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3824 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3825 3826 // The middle block terminator has already been assigned a DebugLoc here (the 3827 // OrigLoop's single latch terminator). We want the whole middle block to 3828 // appear to execute on this line because: (a) it is all compiler generated, 3829 // (b) these instructions are always executed after evaluating the latch 3830 // conditional branch, and (c) other passes may add new predecessors which 3831 // terminate on this line. This is the easiest way to ensure we don't 3832 // accidentally cause an extra step back into the loop while debugging. 3833 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3834 for (unsigned Part = 1; Part < UF; ++Part) { 3835 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3836 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3837 // Floating point operations had to be 'fast' to enable the reduction. 3838 ReducedPartRdx = addFastMathFlag( 3839 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3840 ReducedPartRdx, "bin.rdx"), 3841 RdxDesc.getFastMathFlags()); 3842 else 3843 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3844 RdxPart); 3845 } 3846 3847 if (VF > 1) { 3848 bool NoNaN = Legal->hasFunNoNaNAttr(); 3849 ReducedPartRdx = 3850 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3851 // If the reduction can be performed in a smaller type, we need to extend 3852 // the reduction to the wider type before we branch to the original loop. 3853 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3854 ReducedPartRdx = 3855 RdxDesc.isSigned() 3856 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3857 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3858 } 3859 3860 // Create a phi node that merges control-flow from the backedge-taken check 3861 // block and the middle block. 
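  // In shorthand (illustrative block names), the merge phi created below is:
  //   %bc.merge.rdx = phi T [ %init, %tc.check ], [ %init, %vector.memcheck ],
  //                         [ %rdx, %middle.block ]
  // with the start value incoming from every bypass block.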
3862 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3863 LoopScalarPreHeader->getTerminator()); 3864 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3865 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3866 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3867 3868 // Now, we need to fix the users of the reduction variable 3869 // inside and outside of the scalar remainder loop. 3870 // We know that the loop is in LCSSA form. We need to update the 3871 // PHI nodes in the exit blocks. 3872 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3873 // All PHINodes need to have a single entry edge, or two if 3874 // we already fixed them. 3875 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3876 3877 // We found a reduction value exit-PHI. Update it with the 3878 // incoming bypass edge. 3879 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3880 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3881 } // end of the LCSSA phi scan. 3882 3883 // Fix the scalar loop reduction variable with the incoming reduction sum 3884 // from the vector body and from the backedge value. 3885 int IncomingEdgeBlockIdx = 3886 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3887 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3888 // Pick the other block. 3889 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3890 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3891 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3892 } 3893 3894 void InnerLoopVectorizer::clearReductionWrapFlags( 3895 RecurrenceDescriptor &RdxDesc) { 3896 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3897 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3898 RK != RecurrenceDescriptor::RK_IntegerMult) 3899 return; 3900 3901 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3902 assert(LoopExitInstr && "null loop exit instruction"); 3903 SmallVector<Instruction *, 8> Worklist; 3904 SmallPtrSet<Instruction *, 8> Visited; 3905 Worklist.push_back(LoopExitInstr); 3906 Visited.insert(LoopExitInstr); 3907 3908 while (!Worklist.empty()) { 3909 Instruction *Cur = Worklist.pop_back_val(); 3910 if (isa<OverflowingBinaryOperator>(Cur)) 3911 for (unsigned Part = 0; Part < UF; ++Part) { 3912 Value *V = getOrCreateVectorValue(Cur, Part); 3913 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3914 } 3915 3916 for (User *U : Cur->users()) { 3917 Instruction *UI = cast<Instruction>(U); 3918 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3919 Visited.insert(UI).second) 3920 Worklist.push_back(UI); 3921 } 3922 } 3923 } 3924 3925 void InnerLoopVectorizer::fixLCSSAPHIs() { 3926 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3927 if (LCSSAPhi.getNumIncomingValues() == 1) { 3928 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3929 // Non-instruction incoming values will have only one value. 3930 unsigned LastLane = 0; 3931 if (isa<Instruction>(IncomingValue)) 3932 LastLane = Cost->isUniformAfterVectorization( 3933 cast<Instruction>(IncomingValue), VF) 3934 ? 0 3935 : VF - 1; 3936 // Can be a loop invariant incoming value or the last scalar value to be 3937 // extracted from the vectorized loop. 
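      // For example, with VF = 4 and UF = 2 a non-uniform incoming value is
      // taken from lane 3 of unroll part 1 (the last scalar copy produced),
      // while a uniform or loop-invariant value is taken from lane 0 of part 1.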
3938 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3939 Value *lastIncomingValue =
3940 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3941 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3942 }
3943 }
3944 }
3945 
3946 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3947 // The basic block and loop containing the predicated instruction.
3948 auto *PredBB = PredInst->getParent();
3949 auto *VectorLoop = LI->getLoopFor(PredBB);
3950 
3951 // Initialize a worklist with the operands of the predicated instruction.
3952 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3953 
3954 // Holds instructions that we need to analyze again. An instruction may be
3955 // reanalyzed if we don't yet know if we can sink it or not.
3956 SmallVector<Instruction *, 8> InstsToReanalyze;
3957 
3958 // Returns true if a given use occurs in the predicated block. Phi nodes use
3959 // their operands in their corresponding predecessor blocks.
3960 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3961 auto *I = cast<Instruction>(U.getUser());
3962 BasicBlock *BB = I->getParent();
3963 if (auto *Phi = dyn_cast<PHINode>(I))
3964 BB = Phi->getIncomingBlock(
3965 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3966 return BB == PredBB;
3967 };
3968 
3969 // Iteratively sink the scalarized operands of the predicated instruction
3970 // into the block we created for it. When an instruction is sunk, its
3971 // operands are then added to the worklist. The algorithm ends when one pass
3972 // through the worklist fails to sink a single instruction.
3973 bool Changed;
3974 do {
3975 // Add the instructions that need to be reanalyzed to the worklist, and
3976 // reset the changed indicator.
3977 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3978 InstsToReanalyze.clear();
3979 Changed = false;
3980 
3981 while (!Worklist.empty()) {
3982 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3983 
3984 // We can't sink an instruction if it is a phi node, is already in the
3985 // predicated block, is not in the loop, or may have side effects.
3986 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3987 !VectorLoop->contains(I) || I->mayHaveSideEffects())
3988 continue;
3989 
3990 // It's legal to sink the instruction if all its uses occur in the
3991 // predicated block. Otherwise, there's nothing to do yet, and we may
3992 // need to reanalyze the instruction.
3993 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3994 InstsToReanalyze.push_back(I);
3995 continue;
3996 }
3997 
3998 // Move the instruction to the beginning of the predicated block, and add
3999 // its operands to the worklist.
4000 I->moveBefore(&*PredBB->getFirstInsertionPt());
4001 Worklist.insert(I->op_begin(), I->op_end());
4002 
4003 // The sinking may have enabled other instructions to be sunk, so we will
4004 // need to iterate.
4005 Changed = true; 4006 } 4007 } while (Changed); 4008 } 4009 4010 void InnerLoopVectorizer::fixNonInductionPHIs() { 4011 for (PHINode *OrigPhi : OrigPHIsToFix) { 4012 PHINode *NewPhi = 4013 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4014 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4015 4016 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4017 predecessors(OrigPhi->getParent())); 4018 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4019 predecessors(NewPhi->getParent())); 4020 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4021 "Scalar and Vector BB should have the same number of predecessors"); 4022 4023 // The insertion point in Builder may be invalidated by the time we get 4024 // here. Force the Builder insertion point to something valid so that we do 4025 // not run into issues during insertion point restore in 4026 // getOrCreateVectorValue calls below. 4027 Builder.SetInsertPoint(NewPhi); 4028 4029 // The predecessor order is preserved and we can rely on mapping between 4030 // scalar and vector block predecessors. 4031 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4032 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4033 4034 // When looking up the new scalar/vector values to fix up, use incoming 4035 // values from original phi. 4036 Value *ScIncV = 4037 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4038 4039 // Scalar incoming value may need a broadcast 4040 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4041 NewPhi->addIncoming(NewIncV, NewPredBB); 4042 } 4043 } 4044 } 4045 4046 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4047 unsigned VF, bool IsPtrLoopInvariant, 4048 SmallBitVector &IsIndexLoopInvariant) { 4049 // Construct a vector GEP by widening the operands of the scalar GEP as 4050 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4051 // results in a vector of pointers when at least one operand of the GEP 4052 // is vector-typed. Thus, to keep the representation compact, we only use 4053 // vector-typed operands for loop-varying values. 4054 4055 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4056 // If we are vectorizing, but the GEP has only loop-invariant operands, 4057 // the GEP we build (by only using vector-typed operands for 4058 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4059 // produce a vector of pointers, we need to either arbitrarily pick an 4060 // operand to broadcast, or broadcast a clone of the original GEP. 4061 // Here, we broadcast a clone of the original. 4062 // 4063 // TODO: If at some point we decide to scalarize instructions having 4064 // loop-invariant operands, this special case will no longer be 4065 // required. We would add the scalarization decision to 4066 // collectLoopScalars() and teach getVectorValue() to broadcast 4067 // the lane-zero scalar value. 4068 auto *Clone = Builder.Insert(GEP->clone()); 4069 for (unsigned Part = 0; Part < UF; ++Part) { 4070 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4071 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4072 addMetadata(EntryPart, GEP); 4073 } 4074 } else { 4075 // If the GEP has at least one loop-varying operand, we are sure to 4076 // produce a vector of pointers. But if we are only unrolling, we want 4077 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4078 // produce with the code below will be scalar (if VF == 1) or vector 4079 // (otherwise). 
Note that for the unroll-only case, we still maintain 4080 // values in the vector mapping with initVector, as we do for other 4081 // instructions. 4082 for (unsigned Part = 0; Part < UF; ++Part) { 4083 // The pointer operand of the new GEP. If it's loop-invariant, we 4084 // won't broadcast it. 4085 auto *Ptr = IsPtrLoopInvariant 4086 ? GEP->getPointerOperand() 4087 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4088 4089 // Collect all the indices for the new GEP. If any index is 4090 // loop-invariant, we won't broadcast it. 4091 SmallVector<Value *, 4> Indices; 4092 for (auto Index : enumerate(GEP->indices())) { 4093 Value *User = Index.value().get(); 4094 if (IsIndexLoopInvariant[Index.index()]) 4095 Indices.push_back(User); 4096 else 4097 Indices.push_back(getOrCreateVectorValue(User, Part)); 4098 } 4099 4100 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4101 // but it should be a vector, otherwise. 4102 auto *NewGEP = 4103 GEP->isInBounds() 4104 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4105 Indices) 4106 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4107 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4108 "NewGEP is not a pointer vector"); 4109 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4110 addMetadata(NewGEP, GEP); 4111 } 4112 } 4113 } 4114 4115 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4116 unsigned VF) { 4117 PHINode *P = cast<PHINode>(PN); 4118 if (EnableVPlanNativePath) { 4119 // Currently we enter here in the VPlan-native path for non-induction 4120 // PHIs where all control flow is uniform. We simply widen these PHIs. 4121 // Create a vector phi with no operands - the vector phi operands will be 4122 // set at the end of vector code generation. 4123 Type *VecTy = 4124 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4125 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4126 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4127 OrigPHIsToFix.push_back(P); 4128 4129 return; 4130 } 4131 4132 assert(PN->getParent() == OrigLoop->getHeader() && 4133 "Non-header phis should have been handled elsewhere"); 4134 4135 // In order to support recurrences we need to be able to vectorize Phi nodes. 4136 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4137 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4138 // this value when we vectorize all of the instructions that use the PHI. 4139 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4140 for (unsigned Part = 0; Part < UF; ++Part) { 4141 // This is phase one of vectorizing PHIs. 4142 Type *VecTy = 4143 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4144 Value *EntryPart = PHINode::Create( 4145 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4146 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4147 } 4148 return; 4149 } 4150 4151 setDebugLocFromInst(Builder, P); 4152 4153 // This PHINode must be an induction variable. 4154 // Make sure that we know about it. 4155 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4156 4157 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4158 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4159 4160 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4161 // which can be found from the original scalar operations. 
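  // Illustrative note (assumed factors): with UF = 2 and VF = 4, the
  // IK_PtrInduction case below produces UF * Lanes scalar "next.gep"
  // addresses, one per (Part, Lane) pair, each derived from
  // PtrInd + (Lane + Part * VF) through emitTransformedIndex().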
4162 switch (II.getKind()) { 4163 case InductionDescriptor::IK_NoInduction: 4164 llvm_unreachable("Unknown induction"); 4165 case InductionDescriptor::IK_IntInduction: 4166 case InductionDescriptor::IK_FpInduction: 4167 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4168 case InductionDescriptor::IK_PtrInduction: { 4169 // Handle the pointer induction variable case. 4170 assert(P->getType()->isPointerTy() && "Unexpected type."); 4171 // This is the normalized GEP that starts counting at zero. 4172 Value *PtrInd = Induction; 4173 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4174 // Determine the number of scalars we need to generate for each unroll 4175 // iteration. If the instruction is uniform, we only need to generate the 4176 // first lane. Otherwise, we generate all VF values. 4177 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4178 // These are the scalar results. Notice that we don't generate vector GEPs 4179 // because scalar GEPs result in better code. 4180 for (unsigned Part = 0; Part < UF; ++Part) { 4181 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4182 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4183 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4184 Value *SclrGep = 4185 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4186 SclrGep->setName("next.gep"); 4187 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4188 } 4189 } 4190 return; 4191 } 4192 } 4193 } 4194 4195 /// A helper function for checking whether an integer division-related 4196 /// instruction may divide by zero (in which case it must be predicated if 4197 /// executed conditionally in the scalar code). 4198 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4199 /// Non-zero divisors that are non compile-time constants will not be 4200 /// converted into multiplication, so we will still end up scalarizing 4201 /// the division, but can do so w/o predication. 4202 static bool mayDivideByZero(Instruction &I) { 4203 assert((I.getOpcode() == Instruction::UDiv || 4204 I.getOpcode() == Instruction::SDiv || 4205 I.getOpcode() == Instruction::URem || 4206 I.getOpcode() == Instruction::SRem) && 4207 "Unexpected instruction"); 4208 Value *Divisor = I.getOperand(1); 4209 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4210 return !CInt || CInt->isZero(); 4211 } 4212 4213 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4214 switch (I.getOpcode()) { 4215 case Instruction::Br: 4216 case Instruction::PHI: 4217 case Instruction::GetElementPtr: 4218 llvm_unreachable("This instruction is handled by a different recipe."); 4219 case Instruction::UDiv: 4220 case Instruction::SDiv: 4221 case Instruction::SRem: 4222 case Instruction::URem: 4223 case Instruction::Add: 4224 case Instruction::FAdd: 4225 case Instruction::Sub: 4226 case Instruction::FSub: 4227 case Instruction::FNeg: 4228 case Instruction::Mul: 4229 case Instruction::FMul: 4230 case Instruction::FDiv: 4231 case Instruction::FRem: 4232 case Instruction::Shl: 4233 case Instruction::LShr: 4234 case Instruction::AShr: 4235 case Instruction::And: 4236 case Instruction::Or: 4237 case Instruction::Xor: { 4238 // Just widen unops and binops. 
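  // For example (illustrative IR only), with VF = 4 a scalar
  //   %r = add nsw i32 %a, %b
  // is widened, for each unroll part, into
  //   %r.vec = add nsw <4 x i32> %a.vec, %b.vec
  // with the IR flags copied from the original instruction.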
4239 setDebugLocFromInst(Builder, &I); 4240 4241 for (unsigned Part = 0; Part < UF; ++Part) { 4242 SmallVector<Value *, 2> Ops; 4243 for (Value *Op : I.operands()) 4244 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4245 4246 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4247 4248 if (auto *VecOp = dyn_cast<Instruction>(V)) 4249 VecOp->copyIRFlags(&I); 4250 4251 // Use this vector value for all users of the original instruction. 4252 VectorLoopValueMap.setVectorValue(&I, Part, V); 4253 addMetadata(V, &I); 4254 } 4255 4256 break; 4257 } 4258 case Instruction::Select: { 4259 // Widen selects. 4260 // If the selector is loop invariant we can create a select 4261 // instruction with a scalar condition. Otherwise, use vector-select. 4262 auto *SE = PSE.getSE(); 4263 bool InvariantCond = 4264 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4265 setDebugLocFromInst(Builder, &I); 4266 4267 // The condition can be loop invariant but still defined inside the 4268 // loop. This means that we can't just use the original 'cond' value. 4269 // We have to take the 'vectorized' value and pick the first lane. 4270 // Instcombine will make this a no-op. 4271 4272 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4273 4274 for (unsigned Part = 0; Part < UF; ++Part) { 4275 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4276 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4277 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4278 Value *Sel = 4279 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4280 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4281 addMetadata(Sel, &I); 4282 } 4283 4284 break; 4285 } 4286 4287 case Instruction::ICmp: 4288 case Instruction::FCmp: { 4289 // Widen compares. Generate vector compares. 4290 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4291 auto *Cmp = cast<CmpInst>(&I); 4292 setDebugLocFromInst(Builder, Cmp); 4293 for (unsigned Part = 0; Part < UF; ++Part) { 4294 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4295 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4296 Value *C = nullptr; 4297 if (FCmp) { 4298 // Propagate fast math flags. 4299 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4300 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4301 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4302 } else { 4303 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4304 } 4305 VectorLoopValueMap.setVectorValue(&I, Part, C); 4306 addMetadata(C, &I); 4307 } 4308 4309 break; 4310 } 4311 4312 case Instruction::ZExt: 4313 case Instruction::SExt: 4314 case Instruction::FPToUI: 4315 case Instruction::FPToSI: 4316 case Instruction::FPExt: 4317 case Instruction::PtrToInt: 4318 case Instruction::IntToPtr: 4319 case Instruction::SIToFP: 4320 case Instruction::UIToFP: 4321 case Instruction::Trunc: 4322 case Instruction::FPTrunc: 4323 case Instruction::BitCast: { 4324 auto *CI = cast<CastInst>(&I); 4325 setDebugLocFromInst(Builder, CI); 4326 4327 /// Vectorize casts. 4328 Type *DestTy = 4329 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4330 4331 for (unsigned Part = 0; Part < UF; ++Part) { 4332 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4333 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4334 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4335 addMetadata(Cast, &I); 4336 } 4337 break; 4338 } 4339 4340 case Instruction::Call: { 4341 // Ignore dbg intrinsics. 
4342 if (isa<DbgInfoIntrinsic>(I)) 4343 break; 4344 setDebugLocFromInst(Builder, &I); 4345 4346 Module *M = I.getParent()->getParent()->getParent(); 4347 auto *CI = cast<CallInst>(&I); 4348 4349 SmallVector<Type *, 4> Tys; 4350 for (Value *ArgOperand : CI->arg_operands()) 4351 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4352 4353 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4354 4355 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4356 // version of the instruction. 4357 // Is it beneficial to perform intrinsic call compared to lib call? 4358 bool NeedToScalarize = false; 4359 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4360 bool UseVectorIntrinsic = 4361 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4362 assert((UseVectorIntrinsic || !NeedToScalarize) && 4363 "Instruction should be scalarized elsewhere."); 4364 4365 for (unsigned Part = 0; Part < UF; ++Part) { 4366 SmallVector<Value *, 4> Args; 4367 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4368 Value *Arg = CI->getArgOperand(i); 4369 // Some intrinsics have a scalar argument - don't replace it with a 4370 // vector. 4371 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4372 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4373 Args.push_back(Arg); 4374 } 4375 4376 Function *VectorF; 4377 if (UseVectorIntrinsic) { 4378 // Use vector version of the intrinsic. 4379 Type *TysForDecl[] = {CI->getType()}; 4380 if (VF > 1) 4381 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4382 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4383 } else { 4384 // Use vector version of the function call. 4385 const VFShape Shape = 4386 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4387 #ifndef NDEBUG 4388 const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI); 4389 assert(std::find_if(Infos.begin(), Infos.end(), 4390 [&Shape](const VFInfo &Info) { 4391 return Info.Shape == Shape; 4392 }) != Infos.end() && 4393 "Vector function shape is missing from the database."); 4394 #endif 4395 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4396 } 4397 assert(VectorF && "Can't create vector function."); 4398 4399 SmallVector<OperandBundleDef, 1> OpBundles; 4400 CI->getOperandBundlesAsDefs(OpBundles); 4401 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4402 4403 if (isa<FPMathOperator>(V)) 4404 V->copyFastMathFlags(CI); 4405 4406 VectorLoopValueMap.setVectorValue(&I, Part, V); 4407 addMetadata(V, &I); 4408 } 4409 4410 break; 4411 } 4412 4413 default: 4414 // This instruction is not vectorized by simple widening. 4415 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4416 llvm_unreachable("Unhandled instruction!"); 4417 } // end of switch. 4418 } 4419 4420 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4421 // We should not collect Scalars more than once per VF. Right now, this 4422 // function is called from collectUniformsAndScalars(), which already does 4423 // this check. Collecting Scalars for VF=1 does not make any sense. 4424 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4425 "This function should not be visited twice for the same VF"); 4426 4427 SmallSetVector<Instruction *, 8> Worklist; 4428 4429 // These sets are used to seed the analysis with pointers used by memory 4430 // accesses that will remain scalar. 
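  // For example (illustrative), a getelementptr feeding only a scalarized
  // conditional store would end up in the scalar-pointer set, whereas one that
  // is also used by a gather or scatter access would be recorded as possibly
  // non-scalar.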
4431 SmallSetVector<Instruction *, 8> ScalarPtrs; 4432 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4433 4434 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4435 // The pointer operands of loads and stores will be scalar as long as the 4436 // memory access is not a gather or scatter operation. The value operand of a 4437 // store will remain scalar if the store is scalarized. 4438 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4439 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4440 assert(WideningDecision != CM_Unknown && 4441 "Widening decision should be ready at this moment"); 4442 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4443 if (Ptr == Store->getValueOperand()) 4444 return WideningDecision == CM_Scalarize; 4445 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4446 "Ptr is neither a value nor a pointer operand"); 4447 return WideningDecision != CM_GatherScatter; 4448 }; 4449 4450 // A helper that returns true if the given value is a bitcast or 4451 // getelementptr instruction contained in the loop. 4452 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4453 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4454 isa<GetElementPtrInst>(V)) && 4455 !TheLoop->isLoopInvariant(V); 4456 }; 4457 4458 // A helper that evaluates a memory access's use of a pointer. If the use 4459 // will be a scalar use, and the pointer is only used by memory accesses, we 4460 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4461 // PossibleNonScalarPtrs. 4462 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4463 // We only care about bitcast and getelementptr instructions contained in 4464 // the loop. 4465 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4466 return; 4467 4468 // If the pointer has already been identified as scalar (e.g., if it was 4469 // also identified as uniform), there's nothing to do. 4470 auto *I = cast<Instruction>(Ptr); 4471 if (Worklist.count(I)) 4472 return; 4473 4474 // If the use of the pointer will be a scalar use, and all users of the 4475 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4476 // place the pointer in PossibleNonScalarPtrs. 4477 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4478 return isa<LoadInst>(U) || isa<StoreInst>(U); 4479 })) 4480 ScalarPtrs.insert(I); 4481 else 4482 PossibleNonScalarPtrs.insert(I); 4483 }; 4484 4485 // We seed the scalars analysis with three classes of instructions: (1) 4486 // instructions marked uniform-after-vectorization, (2) bitcast and 4487 // getelementptr instructions used by memory accesses requiring a scalar use, 4488 // and (3) pointer induction variables and their update instructions (we 4489 // currently only scalarize these). 4490 // 4491 // (1) Add to the worklist all instructions that have been identified as 4492 // uniform-after-vectorization. 4493 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4494 4495 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4496 // memory accesses requiring a scalar use. The pointer operands of loads and 4497 // stores will be scalar as long as the memory access is not a gather or 4498 // scatter operation. The value operand of a store will remain scalar if the 4499 // store is scalarized.
4500 for (auto *BB : TheLoop->blocks()) 4501 for (auto &I : *BB) { 4502 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4503 evaluatePtrUse(Load, Load->getPointerOperand()); 4504 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4505 evaluatePtrUse(Store, Store->getPointerOperand()); 4506 evaluatePtrUse(Store, Store->getValueOperand()); 4507 } 4508 } 4509 for (auto *I : ScalarPtrs) 4510 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4511 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4512 Worklist.insert(I); 4513 } 4514 4515 // (3) Add to the worklist all pointer induction variables and their update 4516 // instructions. 4517 // 4518 // TODO: Once we are able to vectorize pointer induction variables we should 4519 // no longer insert them into the worklist here. 4520 auto *Latch = TheLoop->getLoopLatch(); 4521 for (auto &Induction : Legal->getInductionVars()) { 4522 auto *Ind = Induction.first; 4523 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4524 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4525 continue; 4526 Worklist.insert(Ind); 4527 Worklist.insert(IndUpdate); 4528 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4529 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4530 << "\n"); 4531 } 4532 4533 // Insert the forced scalars. 4534 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4535 // induction variable when the PHI user is scalarized. 4536 auto ForcedScalar = ForcedScalars.find(VF); 4537 if (ForcedScalar != ForcedScalars.end()) 4538 for (auto *I : ForcedScalar->second) 4539 Worklist.insert(I); 4540 4541 // Expand the worklist by looking through any bitcasts and getelementptr 4542 // instructions we've already identified as scalar. This is similar to the 4543 // expansion step in collectLoopUniforms(); however, here we're only 4544 // expanding to include additional bitcasts and getelementptr instructions. 4545 unsigned Idx = 0; 4546 while (Idx != Worklist.size()) { 4547 Instruction *Dst = Worklist[Idx++]; 4548 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4549 continue; 4550 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4551 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4552 auto *J = cast<Instruction>(U); 4553 return !TheLoop->contains(J) || Worklist.count(J) || 4554 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4555 isScalarUse(J, Src)); 4556 })) { 4557 Worklist.insert(Src); 4558 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4559 } 4560 } 4561 4562 // An induction variable will remain scalar if all users of the induction 4563 // variable and induction variable update remain scalar. 4564 for (auto &Induction : Legal->getInductionVars()) { 4565 auto *Ind = Induction.first; 4566 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4567 4568 // We already considered pointer induction variables, so there's no reason 4569 // to look at their users again. 4570 // 4571 // TODO: Once we are able to vectorize pointer induction variables we 4572 // should no longer skip over them here. 4573 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4574 continue; 4575 4576 // Determine if all users of the induction variable are scalar after 4577 // vectorization. 
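  // (Illustrative example: for an induction "i = phi [0, ph], [i.next, latch]"
  // with "i.next = add i, 1", Ind is the phi and IndUpdate is the add; both
  // remain scalar only if every other in-loop user of each is already in the
  // worklist.)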
4578 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4579 auto *I = cast<Instruction>(U); 4580 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4581 }); 4582 if (!ScalarInd) 4583 continue; 4584 4585 // Determine if all users of the induction variable update instruction are 4586 // scalar after vectorization. 4587 auto ScalarIndUpdate = 4588 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4589 auto *I = cast<Instruction>(U); 4590 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4591 }); 4592 if (!ScalarIndUpdate) 4593 continue; 4594 4595 // The induction variable and its update instruction will remain scalar. 4596 Worklist.insert(Ind); 4597 Worklist.insert(IndUpdate); 4598 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4599 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4600 << "\n"); 4601 } 4602 4603 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4604 } 4605 4606 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4607 if (!blockNeedsPredication(I->getParent())) 4608 return false; 4609 switch(I->getOpcode()) { 4610 default: 4611 break; 4612 case Instruction::Load: 4613 case Instruction::Store: { 4614 if (!Legal->isMaskRequired(I)) 4615 return false; 4616 auto *Ptr = getLoadStorePointerOperand(I); 4617 auto *Ty = getMemInstValueType(I); 4618 // We have already decided how to vectorize this instruction, get that 4619 // result. 4620 if (VF > 1) { 4621 InstWidening WideningDecision = getWideningDecision(I, VF); 4622 assert(WideningDecision != CM_Unknown && 4623 "Widening decision should be ready at this moment"); 4624 return WideningDecision == CM_Scalarize; 4625 } 4626 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4627 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4628 isLegalMaskedGather(Ty, Alignment)) 4629 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4630 isLegalMaskedScatter(Ty, Alignment)); 4631 } 4632 case Instruction::UDiv: 4633 case Instruction::SDiv: 4634 case Instruction::SRem: 4635 case Instruction::URem: 4636 return mayDivideByZero(*I); 4637 } 4638 return false; 4639 } 4640 4641 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4642 unsigned VF) { 4643 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4644 assert(getWideningDecision(I, VF) == CM_Unknown && 4645 "Decision should not be set yet."); 4646 auto *Group = getInterleavedAccessGroup(I); 4647 assert(Group && "Must have a group."); 4648 4649 // If the instruction's allocated size doesn't equal it's type size, it 4650 // requires padding and will be scalarized. 4651 auto &DL = I->getModule()->getDataLayout(); 4652 auto *ScalarTy = getMemInstValueType(I); 4653 if (hasIrregularType(ScalarTy, DL, VF)) 4654 return false; 4655 4656 // Check if masking is required. 4657 // A Group may need masking for one of two reasons: it resides in a block that 4658 // needs predication, or it was decided to use masking to deal with gaps. 
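  // (Illustrative note: a group with gaps would normally be completed by a
  // scalar epilogue; when that epilogue is not allowed, the gap lanes must be
  // masked instead, which is only possible if the target supports the masked
  // loads/stores checked below.)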
4659 bool PredicatedAccessRequiresMasking = 4660 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4661 bool AccessWithGapsRequiresMasking = 4662 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4663 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4664 return true; 4665 4666 // If masked interleaving is required, we expect that the user/target had 4667 // enabled it, because otherwise it either wouldn't have been created or 4668 // it should have been invalidated by the CostModel. 4669 assert(useMaskedInterleavedAccesses(TTI) && 4670 "Masked interleave-groups for predicated accesses are not enabled."); 4671 4672 auto *Ty = getMemInstValueType(I); 4673 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4674 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4675 : TTI.isLegalMaskedStore(Ty, Alignment); 4676 } 4677 4678 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4679 unsigned VF) { 4680 // Get and ensure we have a valid memory instruction. 4681 LoadInst *LI = dyn_cast<LoadInst>(I); 4682 StoreInst *SI = dyn_cast<StoreInst>(I); 4683 assert((LI || SI) && "Invalid memory instruction"); 4684 4685 auto *Ptr = getLoadStorePointerOperand(I); 4686 4687 // In order to be widened, the pointer should be consecutive, first of all. 4688 if (!Legal->isConsecutivePtr(Ptr)) 4689 return false; 4690 4691 // If the instruction is a store located in a predicated block, it will be 4692 // scalarized. 4693 if (isScalarWithPredication(I)) 4694 return false; 4695 4696 // If the instruction's allocated size doesn't equal it's type size, it 4697 // requires padding and will be scalarized. 4698 auto &DL = I->getModule()->getDataLayout(); 4699 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4700 if (hasIrregularType(ScalarTy, DL, VF)) 4701 return false; 4702 4703 return true; 4704 } 4705 4706 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4707 // We should not collect Uniforms more than once per VF. Right now, 4708 // this function is called from collectUniformsAndScalars(), which 4709 // already does this check. Collecting Uniforms for VF=1 does not make any 4710 // sense. 4711 4712 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4713 "This function should not be visited twice for the same VF"); 4714 4715 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4716 // not analyze again. Uniforms.count(VF) will return 1. 4717 Uniforms[VF].clear(); 4718 4719 // We now know that the loop is vectorizable! 4720 // Collect instructions inside the loop that will remain uniform after 4721 // vectorization. 4722 4723 // Global values, params and instructions outside of current loop are out of 4724 // scope. 4725 auto isOutOfScope = [&](Value *V) -> bool { 4726 Instruction *I = dyn_cast<Instruction>(V); 4727 return (!I || !TheLoop->contains(I)); 4728 }; 4729 4730 SetVector<Instruction *> Worklist; 4731 BasicBlock *Latch = TheLoop->getLoopLatch(); 4732 4733 // Instructions that are scalar with predication must not be considered 4734 // uniform after vectorization, because that would create an erroneous 4735 // replicating region where only a single instance out of VF should be formed. 4736 // TODO: optimize such seldom cases if found important, see PR40816. 
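  // (Illustrative example: a conditional division that is scalarized with
  // predication executes in only some lanes, so treating it as uniform and
  // emitting a single copy on behalf of all lanes would be incorrect.)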
4737 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4738 if (isScalarWithPredication(I, VF)) { 4739 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4740 << *I << "\n"); 4741 return; 4742 } 4743 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4744 Worklist.insert(I); 4745 }; 4746 4747 // Start with the conditional branch. If the branch condition is an 4748 // instruction contained in the loop that is only used by the branch, it is 4749 // uniform. 4750 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4751 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4752 addToWorklistIfAllowed(Cmp); 4753 4754 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4755 // are pointers that are treated like consecutive pointers during 4756 // vectorization. The pointer operands of interleaved accesses are an 4757 // example. 4758 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4759 4760 // Holds pointer operands of instructions that are possibly non-uniform. 4761 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4762 4763 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4764 InstWidening WideningDecision = getWideningDecision(I, VF); 4765 assert(WideningDecision != CM_Unknown && 4766 "Widening decision should be ready at this moment"); 4767 4768 return (WideningDecision == CM_Widen || 4769 WideningDecision == CM_Widen_Reverse || 4770 WideningDecision == CM_Interleave); 4771 }; 4772 // Iterate over the instructions in the loop, and collect all 4773 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4774 // that a consecutive-like pointer operand will be scalarized, we collect it 4775 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4776 // getelementptr instruction can be used by both vectorized and scalarized 4777 // memory instructions. For example, if a loop loads and stores from the same 4778 // location, but the store is conditional, the store will be scalarized, and 4779 // the getelementptr won't remain uniform. 4780 for (auto *BB : TheLoop->blocks()) 4781 for (auto &I : *BB) { 4782 // If there's no pointer operand, there's nothing to do. 4783 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4784 if (!Ptr) 4785 continue; 4786 4787 // True if all users of Ptr are memory accesses that have Ptr as their 4788 // pointer operand. 4789 auto UsersAreMemAccesses = 4790 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4791 return getLoadStorePointerOperand(U) == Ptr; 4792 }); 4793 4794 // Ensure the memory instruction will not be scalarized or used by 4795 // gather/scatter, making its pointer operand non-uniform. If the pointer 4796 // operand is used by any instruction other than a memory access, we 4797 // conservatively assume the pointer operand may be non-uniform. 4798 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4799 PossibleNonUniformPtrs.insert(Ptr); 4800 4801 // If the memory instruction will be vectorized and its pointer operand 4802 // is consecutive-like, or interleaving - the pointer operand should 4803 // remain uniform. 4804 else 4805 ConsecutiveLikePtrs.insert(Ptr); 4806 } 4807 4808 // Add to the Worklist all consecutive and consecutive-like pointers that 4809 // aren't also identified as possibly non-uniform. 
4810 for (auto *V : ConsecutiveLikePtrs) 4811 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4812 addToWorklistIfAllowed(V); 4813 4814 // Expand Worklist in topological order: whenever a new instruction 4815 // is added , its users should be already inside Worklist. It ensures 4816 // a uniform instruction will only be used by uniform instructions. 4817 unsigned idx = 0; 4818 while (idx != Worklist.size()) { 4819 Instruction *I = Worklist[idx++]; 4820 4821 for (auto OV : I->operand_values()) { 4822 // isOutOfScope operands cannot be uniform instructions. 4823 if (isOutOfScope(OV)) 4824 continue; 4825 // First order recurrence Phi's should typically be considered 4826 // non-uniform. 4827 auto *OP = dyn_cast<PHINode>(OV); 4828 if (OP && Legal->isFirstOrderRecurrence(OP)) 4829 continue; 4830 // If all the users of the operand are uniform, then add the 4831 // operand into the uniform worklist. 4832 auto *OI = cast<Instruction>(OV); 4833 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4834 auto *J = cast<Instruction>(U); 4835 return Worklist.count(J) || 4836 (OI == getLoadStorePointerOperand(J) && 4837 isUniformDecision(J, VF)); 4838 })) 4839 addToWorklistIfAllowed(OI); 4840 } 4841 } 4842 4843 // Returns true if Ptr is the pointer operand of a memory access instruction 4844 // I, and I is known to not require scalarization. 4845 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4846 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4847 }; 4848 4849 // For an instruction to be added into Worklist above, all its users inside 4850 // the loop should also be in Worklist. However, this condition cannot be 4851 // true for phi nodes that form a cyclic dependence. We must process phi 4852 // nodes separately. An induction variable will remain uniform if all users 4853 // of the induction variable and induction variable update remain uniform. 4854 // The code below handles both pointer and non-pointer induction variables. 4855 for (auto &Induction : Legal->getInductionVars()) { 4856 auto *Ind = Induction.first; 4857 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4858 4859 // Determine if all users of the induction variable are uniform after 4860 // vectorization. 4861 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4862 auto *I = cast<Instruction>(U); 4863 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4864 isVectorizedMemAccessUse(I, Ind); 4865 }); 4866 if (!UniformInd) 4867 continue; 4868 4869 // Determine if all users of the induction variable update instruction are 4870 // uniform after vectorization. 4871 auto UniformIndUpdate = 4872 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4873 auto *I = cast<Instruction>(U); 4874 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4875 isVectorizedMemAccessUse(I, IndUpdate); 4876 }); 4877 if (!UniformIndUpdate) 4878 continue; 4879 4880 // The induction variable and its update instruction will remain uniform. 4881 addToWorklistIfAllowed(Ind); 4882 addToWorklistIfAllowed(IndUpdate); 4883 } 4884 4885 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4886 } 4887 4888 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4889 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4890 4891 if (Legal->getRuntimePointerChecking()->Need) { 4892 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4893 "runtime pointer checks needed. 
Enable vectorization of this " 4894 "loop with '#pragma clang loop vectorize(enable)' when " 4895 "compiling with -Os/-Oz", 4896 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4897 return true; 4898 } 4899 4900 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4901 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4902 "runtime SCEV checks needed. Enable vectorization of this " 4903 "loop with '#pragma clang loop vectorize(enable)' when " 4904 "compiling with -Os/-Oz", 4905 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4906 return true; 4907 } 4908 4909 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4910 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4911 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4912 "runtime stride == 1 checks needed. Enable vectorization of " 4913 "this loop with '#pragma clang loop vectorize(enable)' when " 4914 "compiling with -Os/-Oz", 4915 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4916 return true; 4917 } 4918 4919 return false; 4920 } 4921 4922 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4923 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4924 // TODO: It may be useful to insert the runtime check anyway, since it is 4925 // still likely to be dynamically uniform if the target can skip it. 4926 reportVectorizationFailure( 4927 "Not inserting runtime ptr check for divergent target", 4928 "runtime pointer checks needed. Not enabled for divergent target", 4929 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4930 return None; 4931 } 4932 4933 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4934 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4935 if (TC == 1) { 4936 reportVectorizationFailure("Single iteration (non) loop", 4937 "loop trip count is one, irrelevant for vectorization", 4938 "SingleIterationLoop", ORE, TheLoop); 4939 return None; 4940 } 4941 4942 switch (ScalarEpilogueStatus) { 4943 case CM_ScalarEpilogueAllowed: 4944 return computeFeasibleMaxVF(TC); 4945 case CM_ScalarEpilogueNotNeededUsePredicate: 4946 LLVM_DEBUG( 4947 dbgs() << "LV: vector predicate hint/switch found.\n" 4948 << "LV: Not allowing scalar epilogue, creating predicated " 4949 << "vector loop.\n"); 4950 break; 4951 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4952 // fallthrough as a special case of OptForSize 4953 case CM_ScalarEpilogueNotAllowedOptSize: 4954 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4955 LLVM_DEBUG( 4956 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4957 else 4958 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4959 << "count.\n"); 4960 4961 // Bail if runtime checks are required, which are not good when optimizing 4962 // for size. 4963 if (runtimeChecksRequired()) 4964 return None; 4965 break; 4966 } 4967 4968 // Now try to fold the tail by masking. 4969 4970 // Invalidate interleave groups that require an epilogue if we can't mask 4971 // the interleave-group. 4972 if (!useMaskedInterleavedAccesses(TTI)) 4973 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4974 4975 unsigned MaxVF = computeFeasibleMaxVF(TC); 4976 if (TC > 0 && TC % MaxVF == 0) { 4977 // Accept MaxVF if we do not have a tail.
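    // (E.g., a known trip count of 128 with MaxVF == 8 leaves no remainder,
    // so neither a scalar tail nor tail folding is needed.)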
4978 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4979 return MaxVF; 4980 } 4981 4982 // If we don't know the precise trip count, or if the trip count that we 4983 // found modulo the vectorization factor is not zero, try to fold the tail 4984 // by masking. 4985 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4986 if (Legal->prepareToFoldTailByMasking()) { 4987 FoldTailByMasking = true; 4988 return MaxVF; 4989 } 4990 4991 if (TC == 0) { 4992 reportVectorizationFailure( 4993 "Unable to calculate the loop count due to complex control flow", 4994 "unable to calculate the loop count due to complex control flow", 4995 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4996 return None; 4997 } 4998 4999 reportVectorizationFailure( 5000 "Cannot optimize for size and vectorize at the same time.", 5001 "cannot optimize for size and vectorize at the same time. " 5002 "Enable vectorization of this loop with '#pragma clang loop " 5003 "vectorize(enable)' when compiling with -Os/-Oz", 5004 "NoTailLoopWithOptForSize", ORE, TheLoop); 5005 return None; 5006 } 5007 5008 unsigned 5009 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5010 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5011 unsigned SmallestType, WidestType; 5012 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5013 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5014 5015 // Get the maximum safe dependence distance in bits computed by LAA. 5016 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5017 // the memory accesses that is most restrictive (involved in the smallest 5018 // dependence distance). 5019 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5020 5021 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5022 5023 unsigned MaxVectorSize = WidestRegister / WidestType; 5024 5025 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5026 << " / " << WidestType << " bits.\n"); 5027 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5028 << WidestRegister << " bits.\n"); 5029 5030 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5031 " into one vector!"); 5032 if (MaxVectorSize == 0) { 5033 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5034 MaxVectorSize = 1; 5035 return MaxVectorSize; 5036 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5037 isPowerOf2_32(ConstTripCount)) { 5038 // We need to clamp the VF to be the ConstTripCount. There is no point in 5039 // choosing a higher viable VF as done in the loop below. 5040 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5041 << ConstTripCount << "\n"); 5042 MaxVectorSize = ConstTripCount; 5043 return MaxVectorSize; 5044 } 5045 5046 unsigned MaxVF = MaxVectorSize; 5047 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5048 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5049 // Collect all viable vectorization factors larger than the default MaxVF 5050 // (i.e. MaxVectorSize). 5051 SmallVector<unsigned, 8> VFs; 5052 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5053 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5054 VFs.push_back(VS); 5055 5056 // For each VF calculate its register usage. 5057 auto RUs = calculateRegisterUsage(VFs); 5058 5059 // Select the largest VF which doesn't require more registers than existing 5060 // ones. 
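  // (Illustrative example, assumed numbers: with VFs = {8, 16} and a target
  // providing 32 registers of the relevant class, VF = 16 is skipped if its
  // estimated maximum local usage exceeds 32, and the search below settles on
  // VF = 8.)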
5061 for (int i = RUs.size() - 1; i >= 0; --i) { 5062 bool Selected = true; 5063 for (auto& pair : RUs[i].MaxLocalUsers) { 5064 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5065 if (pair.second > TargetNumRegisters) 5066 Selected = false; 5067 } 5068 if (Selected) { 5069 MaxVF = VFs[i]; 5070 break; 5071 } 5072 } 5073 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5074 if (MaxVF < MinVF) { 5075 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5076 << ") with target's minimum: " << MinVF << '\n'); 5077 MaxVF = MinVF; 5078 } 5079 } 5080 } 5081 return MaxVF; 5082 } 5083 5084 VectorizationFactor 5085 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5086 float Cost = expectedCost(1).first; 5087 const float ScalarCost = Cost; 5088 unsigned Width = 1; 5089 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5090 5091 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5092 if (ForceVectorization && MaxVF > 1) { 5093 // Ignore scalar width, because the user explicitly wants vectorization. 5094 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5095 // evaluation. 5096 Cost = std::numeric_limits<float>::max(); 5097 } 5098 5099 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5100 // Notice that the vector loop needs to be executed less times, so 5101 // we need to divide the cost of the vector loops by the width of 5102 // the vector elements. 5103 VectorizationCostTy C = expectedCost(i); 5104 float VectorCost = C.first / (float)i; 5105 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5106 << " costs: " << (int)VectorCost << ".\n"); 5107 if (!C.second && !ForceVectorization) { 5108 LLVM_DEBUG( 5109 dbgs() << "LV: Not considering vector loop of width " << i 5110 << " because it will not generate any vector instructions.\n"); 5111 continue; 5112 } 5113 if (VectorCost < Cost) { 5114 Cost = VectorCost; 5115 Width = i; 5116 } 5117 } 5118 5119 if (!EnableCondStoresVectorization && NumPredStores) { 5120 reportVectorizationFailure("There are conditional stores.", 5121 "store that is conditionally executed prevents vectorization", 5122 "ConditionalStore", ORE, TheLoop); 5123 Width = 1; 5124 Cost = ScalarCost; 5125 } 5126 5127 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5128 << "LV: Vectorization seems to be not beneficial, " 5129 << "but was forced by a user.\n"); 5130 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5131 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5132 return Factor; 5133 } 5134 5135 std::pair<unsigned, unsigned> 5136 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5137 unsigned MinWidth = -1U; 5138 unsigned MaxWidth = 8; 5139 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5140 5141 // For each block. 5142 for (BasicBlock *BB : TheLoop->blocks()) { 5143 // For each instruction in the loop. 5144 for (Instruction &I : BB->instructionsWithoutDebug()) { 5145 Type *T = I.getType(); 5146 5147 // Skip ignored values. 5148 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5149 continue; 5150 5151 // Only examine Loads, Stores and PHINodes. 5152 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5153 continue; 5154 5155 // Examine PHI nodes that are reduction variables. Update the type to 5156 // account for the recurrence type. 
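      // (Illustrative note: the recurrence type may be narrower than the phi
      // type, e.g. a reduction that is effectively computed on i8 values, and
      // it is that narrower type which feeds the width computation below.)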
5157 if (auto *PN = dyn_cast<PHINode>(&I)) { 5158 if (!Legal->isReductionVariable(PN)) 5159 continue; 5160 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5161 T = RdxDesc.getRecurrenceType(); 5162 } 5163 5164 // Examine the stored values. 5165 if (auto *ST = dyn_cast<StoreInst>(&I)) 5166 T = ST->getValueOperand()->getType(); 5167 5168 // Ignore loaded pointer types and stored pointer types that are not 5169 // vectorizable. 5170 // 5171 // FIXME: The check here attempts to predict whether a load or store will 5172 // be vectorized. We only know this for certain after a VF has 5173 // been selected. Here, we assume that if an access can be 5174 // vectorized, it will be. We should also look at extending this 5175 // optimization to non-pointer types. 5176 // 5177 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5178 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5179 continue; 5180 5181 MinWidth = std::min(MinWidth, 5182 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5183 MaxWidth = std::max(MaxWidth, 5184 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5185 } 5186 } 5187 5188 return {MinWidth, MaxWidth}; 5189 } 5190 5191 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5192 unsigned LoopCost) { 5193 // -- The interleave heuristics -- 5194 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5195 // There are many micro-architectural considerations that we can't predict 5196 // at this level. For example, frontend pressure (on decode or fetch) due to 5197 // code size, or the number and capabilities of the execution ports. 5198 // 5199 // We use the following heuristics to select the interleave count: 5200 // 1. If the code has reductions, then we interleave to break the cross 5201 // iteration dependency. 5202 // 2. If the loop is really small, then we interleave to reduce the loop 5203 // overhead. 5204 // 3. We don't interleave if we think that we will spill registers to memory 5205 // due to the increased register pressure. 5206 5207 if (!isScalarEpilogueAllowed()) 5208 return 1; 5209 5210 // We used the distance for the interleave count. 5211 if (Legal->getMaxSafeDepDistBytes() != -1U) 5212 return 1; 5213 5214 // Do not interleave loops with a relatively small known or estimated trip 5215 // count. 5216 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5217 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5218 return 1; 5219 5220 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5221 // We divide by these constants so assume that we have at least one 5222 // instruction that uses at least one register. 5223 for (auto& pair : R.MaxLocalUsers) { 5224 pair.second = std::max(pair.second, 1U); 5225 } 5226 5227 // We calculate the interleave count using the following formula. 5228 // Subtract the number of loop invariants from the number of available 5229 // registers. These registers are used by all of the interleaved instances. 5230 // Next, divide the remaining registers by the number of registers that is 5231 // required by the loop, in order to estimate how many parallel instances 5232 // fit without causing spills. All of this is rounded down if necessary to be 5233 // a power of two. We want power of two interleave count to simplify any 5234 // addressing operations or alignment considerations. 
5235 // We also want power of two interleave counts to ensure that the induction 5236 // variable of the vector loop wraps to zero, when tail is folded by masking; 5237 // this currently happens when OptForSize, in which case IC is set to 1 above. 5238 unsigned IC = UINT_MAX; 5239 5240 for (auto& pair : R.MaxLocalUsers) { 5241 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5242 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5243 << " registers of " 5244 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5245 if (VF == 1) { 5246 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5247 TargetNumRegisters = ForceTargetNumScalarRegs; 5248 } else { 5249 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5250 TargetNumRegisters = ForceTargetNumVectorRegs; 5251 } 5252 unsigned MaxLocalUsers = pair.second; 5253 unsigned LoopInvariantRegs = 0; 5254 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5255 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5256 5257 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5258 // Don't count the induction variable as interleaved. 5259 if (EnableIndVarRegisterHeur) { 5260 TmpIC = 5261 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5262 std::max(1U, (MaxLocalUsers - 1))); 5263 } 5264 5265 IC = std::min(IC, TmpIC); 5266 } 5267 5268 // Clamp the interleave ranges to reasonable counts. 5269 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5270 5271 // Check if the user has overridden the max. 5272 if (VF == 1) { 5273 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5274 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5275 } else { 5276 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5277 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5278 } 5279 5280 // If trip count is known or estimated compile time constant, limit the 5281 // interleave count to be less than the trip count divided by VF. 5282 if (BestKnownTC) { 5283 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5284 } 5285 5286 // If we did not calculate the cost for VF (because the user selected the VF) 5287 // then we calculate the cost of VF here. 5288 if (LoopCost == 0) 5289 LoopCost = expectedCost(VF).first; 5290 5291 assert(LoopCost && "Non-zero loop cost expected"); 5292 5293 // Clamp the calculated IC to be between the 1 and the max interleave count 5294 // that the target and trip count allows. 5295 if (IC > MaxInterleaveCount) 5296 IC = MaxInterleaveCount; 5297 else if (IC < 1) 5298 IC = 1; 5299 5300 // Interleave if we vectorized this loop and there is a reduction that could 5301 // benefit from interleaving. 5302 if (VF > 1 && !Legal->getReductionVars().empty()) { 5303 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5304 return IC; 5305 } 5306 5307 // Note that if we've already vectorized the loop we will have done the 5308 // runtime check and so interleaving won't require further checks. 5309 bool InterleavingRequiresRuntimePointerCheck = 5310 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5311 5312 // We want to interleave small loops in order to reduce the loop overhead and 5313 // potentially expose ILP opportunities. 
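  // (Worked example, assumed numbers: with SmallLoopCost = 20 and an
  // estimated LoopCost of 6, the small-loop path below interleaves by
  // min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2).)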
5314 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5315 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5316 // We assume that the cost overhead is 1 and we use the cost model 5317 // to estimate the cost of the loop and interleave until the cost of the 5318 // loop overhead is about 5% of the cost of the loop. 5319 unsigned SmallIC = 5320 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5321 5322 // Interleave until store/load ports (estimated by max interleave count) are 5323 // saturated. 5324 unsigned NumStores = Legal->getNumStores(); 5325 unsigned NumLoads = Legal->getNumLoads(); 5326 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5327 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5328 5329 // If we have a scalar reduction (vector reductions are already dealt with 5330 // by this point), we can increase the critical path length if the loop 5331 // we're interleaving is inside another loop. Limit it, by default, to 2 so 5332 // the critical path only gets increased by one reduction operation. 5333 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5334 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5335 SmallIC = std::min(SmallIC, F); 5336 StoresIC = std::min(StoresIC, F); 5337 LoadsIC = std::min(LoadsIC, F); 5338 } 5339 5340 if (EnableLoadStoreRuntimeInterleave && 5341 std::max(StoresIC, LoadsIC) > SmallIC) { 5342 LLVM_DEBUG( 5343 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5344 return std::max(StoresIC, LoadsIC); 5345 } 5346 5347 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5348 return SmallIC; 5349 } 5350 5351 // Interleave if this is a large loop (small loops are already dealt with by 5352 // this point) that could benefit from interleaving. 5353 bool HasReductions = !Legal->getReductionVars().empty(); 5354 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5355 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5356 return IC; 5357 } 5358 5359 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5360 return 1; 5361 } 5362 5363 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5364 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5365 // This function calculates the register usage by measuring the highest number 5366 // of values that are alive at a single location. Obviously, this is a very 5367 // rough estimation. We scan the loop in topological order and 5368 // assign a number to each instruction. We use RPO to ensure that defs are 5369 // met before their users. We assume that each instruction that has in-loop 5370 // users starts an interval. We record every time that an in-loop value is 5371 // used, so we have a list of the first and last occurrences of each 5372 // instruction. Next, we transpose this data structure into a multi map that 5373 // holds the list of intervals that *end* at a specific location. This multi 5374 // map allows us to perform a linear search. We scan the instructions linearly 5375 // and record each time that a new interval starts, by placing it in a set. 5376 // If we find this value in the multi-map then we remove it from the set. 5377 // The max register usage is the maximum size of the set. 5378 // We also search for instructions that are defined outside the loop, but are 5379 // used inside the loop.
We need this number separately from the max-interval 5380 // usage number because when we unroll, loop-invariant values do not take 5381 // more register. 5382 LoopBlocksDFS DFS(TheLoop); 5383 DFS.perform(LI); 5384 5385 RegisterUsage RU; 5386 5387 // Each 'key' in the map opens a new interval. The values 5388 // of the map are the index of the 'last seen' usage of the 5389 // instruction that is the key. 5390 using IntervalMap = DenseMap<Instruction *, unsigned>; 5391 5392 // Maps instruction to its index. 5393 SmallVector<Instruction *, 64> IdxToInstr; 5394 // Marks the end of each interval. 5395 IntervalMap EndPoint; 5396 // Saves the list of instruction indices that are used in the loop. 5397 SmallPtrSet<Instruction *, 8> Ends; 5398 // Saves the list of values that are used in the loop but are 5399 // defined outside the loop, such as arguments and constants. 5400 SmallPtrSet<Value *, 8> LoopInvariants; 5401 5402 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5403 for (Instruction &I : BB->instructionsWithoutDebug()) { 5404 IdxToInstr.push_back(&I); 5405 5406 // Save the end location of each USE. 5407 for (Value *U : I.operands()) { 5408 auto *Instr = dyn_cast<Instruction>(U); 5409 5410 // Ignore non-instruction values such as arguments, constants, etc. 5411 if (!Instr) 5412 continue; 5413 5414 // If this instruction is outside the loop then record it and continue. 5415 if (!TheLoop->contains(Instr)) { 5416 LoopInvariants.insert(Instr); 5417 continue; 5418 } 5419 5420 // Overwrite previous end points. 5421 EndPoint[Instr] = IdxToInstr.size(); 5422 Ends.insert(Instr); 5423 } 5424 } 5425 } 5426 5427 // Saves the list of intervals that end with the index in 'key'. 5428 using InstrList = SmallVector<Instruction *, 2>; 5429 DenseMap<unsigned, InstrList> TransposeEnds; 5430 5431 // Transpose the EndPoints to a list of values that end at each index. 5432 for (auto &Interval : EndPoint) 5433 TransposeEnds[Interval.second].push_back(Interval.first); 5434 5435 SmallPtrSet<Instruction *, 8> OpenIntervals; 5436 5437 // Get the size of the widest register. 5438 unsigned MaxSafeDepDist = -1U; 5439 if (Legal->getMaxSafeDepDistBytes() != -1U) 5440 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5441 unsigned WidestRegister = 5442 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5443 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5444 5445 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5446 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5447 5448 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5449 5450 // A lambda that gets the register usage for the given type and VF. 5451 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5452 if (Ty->isTokenTy()) 5453 return 0U; 5454 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5455 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5456 }; 5457 5458 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5459 Instruction *I = IdxToInstr[i]; 5460 5461 // Remove all of the instructions that end at this location. 5462 InstrList &List = TransposeEnds[i]; 5463 for (Instruction *ToRemove : List) 5464 OpenIntervals.erase(ToRemove); 5465 5466 // Ignore instructions that are never used within the loop. 5467 if (Ends.find(I) == Ends.end()) 5468 continue; 5469 5470 // Skip ignored values. 5471 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5472 continue; 5473 5474 // For each VF find the maximum usage of registers. 
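    // (Illustrative example: with a 256-bit widest register, an i32 value at
    // VF = 8 needs GetRegUsage(i32, 8) = max(1, 8 * 32 / 256) = 1 register,
    // while VF = 16 would need 2.)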
5475 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5476 // Count the number of live intervals. 5477 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5478 5479 if (VFs[j] == 1) { 5480 for (auto Inst : OpenIntervals) { 5481 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5482 if (RegUsage.find(ClassID) == RegUsage.end()) 5483 RegUsage[ClassID] = 1; 5484 else 5485 RegUsage[ClassID] += 1; 5486 } 5487 } else { 5488 collectUniformsAndScalars(VFs[j]); 5489 for (auto Inst : OpenIntervals) { 5490 // Skip ignored values for VF > 1. 5491 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5492 continue; 5493 if (isScalarAfterVectorization(Inst, VFs[j])) { 5494 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5495 if (RegUsage.find(ClassID) == RegUsage.end()) 5496 RegUsage[ClassID] = 1; 5497 else 5498 RegUsage[ClassID] += 1; 5499 } else { 5500 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5501 if (RegUsage.find(ClassID) == RegUsage.end()) 5502 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5503 else 5504 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5505 } 5506 } 5507 } 5508 5509 for (auto& pair : RegUsage) { 5510 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5511 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5512 else 5513 MaxUsages[j][pair.first] = pair.second; 5514 } 5515 } 5516 5517 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5518 << OpenIntervals.size() << '\n'); 5519 5520 // Add the current instruction to the list of open intervals. 5521 OpenIntervals.insert(I); 5522 } 5523 5524 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5525 SmallMapVector<unsigned, unsigned, 4> Invariant; 5526 5527 for (auto Inst : LoopInvariants) { 5528 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5529 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5530 if (Invariant.find(ClassID) == Invariant.end()) 5531 Invariant[ClassID] = Usage; 5532 else 5533 Invariant[ClassID] += Usage; 5534 } 5535 5536 LLVM_DEBUG({ 5537 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5538 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5539 << " item\n"; 5540 for (const auto &pair : MaxUsages[i]) { 5541 dbgs() << "LV(REG): RegisterClass: " 5542 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5543 << " registers\n"; 5544 } 5545 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5546 << " item\n"; 5547 for (const auto &pair : Invariant) { 5548 dbgs() << "LV(REG): RegisterClass: " 5549 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5550 << " registers\n"; 5551 } 5552 }); 5553 5554 RU.LoopInvariantRegs = Invariant; 5555 RU.MaxLocalUsers = MaxUsages[i]; 5556 RUs[i] = RU; 5557 } 5558 5559 return RUs; 5560 } 5561 5562 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5563 // TODO: Cost model for emulated masked load/store is completely 5564 // broken. This hack guides the cost model to use an artificially 5565 // high enough value to practically disable vectorization with such 5566 // operations, except where previously deployed legality hack allowed 5567 // using very low cost values. This is to avoid regressions coming simply 5568 // from moving "masked load/store" check from legality to cost model. 5569 // Masked Load/Gather emulation was previously never allowed. 5570 // Limited number of Masked Store/Scatter emulation was allowed. 
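// Concretely, the check below applies the hack to every emulated masked
// load, and to emulated masked stores only once more than
// NumberOfStoresToPredicate of them have been encountered.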
5571 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5572 return isa<LoadInst>(I) || 5573 (isa<StoreInst>(I) && 5574 NumPredStores > NumberOfStoresToPredicate); 5575 } 5576 5577 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5578 // If we aren't vectorizing the loop, or if we've already collected the 5579 // instructions to scalarize, there's nothing to do. Collection may already 5580 // have occurred if we have a user-selected VF and are now computing the 5581 // expected cost for interleaving. 5582 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) 5583 return; 5584 5585 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5586 // not profitable to scalarize any instructions, the presence of VF in the 5587 // map will indicate that we've analyzed it already. 5588 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5589 5590 // Find all the instructions that are scalar with predication in the loop and 5591 // determine if it would be better to not if-convert the blocks they are in. 5592 // If so, we also record the instructions to scalarize. 5593 for (BasicBlock *BB : TheLoop->blocks()) { 5594 if (!blockNeedsPredication(BB)) 5595 continue; 5596 for (Instruction &I : *BB) 5597 if (isScalarWithPredication(&I)) { 5598 ScalarCostsTy ScalarCosts; 5599 // Do not apply discount logic if hacked cost is needed 5600 // for emulated masked memrefs. 5601 if (!useEmulatedMaskMemRefHack(&I) && 5602 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5603 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5604 // Remember that BB will remain after vectorization. 5605 PredicatedBBsAfterVectorization.insert(BB); 5606 } 5607 } 5608 } 5609 5610 int LoopVectorizationCostModel::computePredInstDiscount( 5611 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5612 unsigned VF) { 5613 assert(!isUniformAfterVectorization(PredInst, VF) && 5614 "Instruction marked uniform-after-vectorization will be predicated"); 5615 5616 // Initialize the discount to zero, meaning that the scalar version and the 5617 // vector version cost the same. 5618 int Discount = 0; 5619 5620 // Holds instructions to analyze. The instructions we visit are mapped in 5621 // ScalarCosts. Those instructions are the ones that would be scalarized if 5622 // we find that the scalar version costs less. 5623 SmallVector<Instruction *, 8> Worklist; 5624 5625 // Returns true if the given instruction can be scalarized. 5626 auto canBeScalarized = [&](Instruction *I) -> bool { 5627 // We only attempt to scalarize instructions forming a single-use chain 5628 // from the original predicated block that would otherwise be vectorized. 5629 // Although not strictly necessary, we give up on instructions we know will 5630 // already be scalar to avoid traversing chains that are unlikely to be 5631 // beneficial. 5632 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5633 isScalarAfterVectorization(I, VF)) 5634 return false; 5635 5636 // If the instruction is scalar with predication, it will be analyzed 5637 // separately. We ignore it within the context of PredInst. 5638 if (isScalarWithPredication(I)) 5639 return false; 5640 5641 // If any of the instruction's operands are uniform after vectorization, 5642 // the instruction cannot be scalarized. This prevents, for example, a 5643 // masked load from being scalarized. 
5644 // 5645 // We assume we will only emit a value for lane zero of an instruction 5646 // marked uniform after vectorization, rather than VF identical values. 5647 // Thus, if we scalarize an instruction that uses a uniform, we would 5648 // create uses of values corresponding to the lanes we aren't emitting code 5649 // for. This behavior can be changed by allowing getScalarValue to clone 5650 // the lane zero values for uniforms rather than asserting. 5651 for (Use &U : I->operands()) 5652 if (auto *J = dyn_cast<Instruction>(U.get())) 5653 if (isUniformAfterVectorization(J, VF)) 5654 return false; 5655 5656 // Otherwise, we can scalarize the instruction. 5657 return true; 5658 }; 5659 5660 // Compute the expected cost discount from scalarizing the entire expression 5661 // feeding the predicated instruction. We currently only consider expressions 5662 // that are single-use instruction chains. 5663 Worklist.push_back(PredInst); 5664 while (!Worklist.empty()) { 5665 Instruction *I = Worklist.pop_back_val(); 5666 5667 // If we've already analyzed the instruction, there's nothing to do. 5668 if (ScalarCosts.find(I) != ScalarCosts.end()) 5669 continue; 5670 5671 // Compute the cost of the vector instruction. Note that this cost already 5672 // includes the scalarization overhead of the predicated instruction. 5673 unsigned VectorCost = getInstructionCost(I, VF).first; 5674 5675 // Compute the cost of the scalarized instruction. This cost is the cost of 5676 // the instruction as if it wasn't if-converted and instead remained in the 5677 // predicated block. We will scale this cost by block probability after 5678 // computing the scalarization overhead. 5679 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5680 5681 // Compute the scalarization overhead of needed insertelement instructions 5682 // and phi nodes. 5683 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5684 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5685 true, false); 5686 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5687 } 5688 5689 // Compute the scalarization overhead of needed extractelement 5690 // instructions. For each of the instruction's operands, if the operand can 5691 // be scalarized, add it to the worklist; otherwise, account for the 5692 // overhead. 5693 for (Use &U : I->operands()) 5694 if (auto *J = dyn_cast<Instruction>(U.get())) { 5695 assert(VectorType::isValidElementType(J->getType()) && 5696 "Instruction has non-scalar type"); 5697 if (canBeScalarized(J)) 5698 Worklist.push_back(J); 5699 else if (needsExtract(J, VF)) 5700 ScalarCost += TTI.getScalarizationOverhead( 5701 ToVectorTy(J->getType(),VF), false, true); 5702 } 5703 5704 // Scale the total scalar cost by block probability. 5705 ScalarCost /= getReciprocalPredBlockProb(); 5706 5707 // Compute the discount. A non-negative discount means the vector version 5708 // of the instruction costs more, and scalarizing would be beneficial. 5709 Discount += VectorCost - ScalarCost; 5710 ScalarCosts[I] = ScalarCost; 5711 } 5712 5713 return Discount; 5714 } 5715 5716 LoopVectorizationCostModel::VectorizationCostTy 5717 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5718 VectorizationCostTy Cost; 5719 5720 // For each block. 5721 for (BasicBlock *BB : TheLoop->blocks()) { 5722 VectorizationCostTy BlockCost; 5723 5724 // For each instruction in the old loop. 5725 for (Instruction &I : BB->instructionsWithoutDebug()) { 5726 // Skip ignored values. 
5727 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5728 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5729 continue; 5730 5731 VectorizationCostTy C = getInstructionCost(&I, VF); 5732 5733 // Check if we should override the cost. 5734 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5735 C.first = ForceTargetInstructionCost; 5736 5737 BlockCost.first += C.first; 5738 BlockCost.second |= C.second; 5739 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5740 << " for VF " << VF << " For instruction: " << I 5741 << '\n'); 5742 } 5743 5744 // If we are vectorizing a predicated block, it will have been 5745 // if-converted. This means that the block's instructions (aside from 5746 // stores and instructions that may divide by zero) will now be 5747 // unconditionally executed. For the scalar case, we may not always execute 5748 // the predicated block. Thus, scale the block's cost by the probability of 5749 // executing it. 5750 if (VF == 1 && blockNeedsPredication(BB)) 5751 BlockCost.first /= getReciprocalPredBlockProb(); 5752 5753 Cost.first += BlockCost.first; 5754 Cost.second |= BlockCost.second; 5755 } 5756 5757 return Cost; 5758 } 5759 5760 /// Gets Address Access SCEV after verifying that the access pattern 5761 /// is loop invariant except the induction variable dependence. 5762 /// 5763 /// This SCEV can be sent to the Target in order to estimate the address 5764 /// calculation cost. 5765 static const SCEV *getAddressAccessSCEV( 5766 Value *Ptr, 5767 LoopVectorizationLegality *Legal, 5768 PredicatedScalarEvolution &PSE, 5769 const Loop *TheLoop) { 5770 5771 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5772 if (!Gep) 5773 return nullptr; 5774 5775 // We are looking for a gep with all loop invariant indices except for one 5776 // which should be an induction variable. 5777 auto SE = PSE.getSE(); 5778 unsigned NumOperands = Gep->getNumOperands(); 5779 for (unsigned i = 1; i < NumOperands; ++i) { 5780 Value *Opd = Gep->getOperand(i); 5781 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5782 !Legal->isInductionVariable(Opd)) 5783 return nullptr; 5784 } 5785 5786 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5787 return PSE.getSCEV(Ptr); 5788 } 5789 5790 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5791 return Legal->hasStride(I->getOperand(0)) || 5792 Legal->hasStride(I->getOperand(1)); 5793 } 5794 5795 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5796 unsigned VF) { 5797 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5798 Type *ValTy = getMemInstValueType(I); 5799 auto SE = PSE.getSE(); 5800 5801 unsigned AS = getLoadStoreAddressSpace(I); 5802 Value *Ptr = getLoadStorePointerOperand(I); 5803 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5804 5805 // Figure out whether the access is strided and get the stride value 5806 // if it's known in compile time 5807 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5808 5809 // Get the cost of the scalar memory instruction and address computation. 5810 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5811 5812 // Don't pass *I here, since it is scalar but will actually be part of a 5813 // vectorized loop where the user of it is a vectorized instruction. 
5814 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5815 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5816 Alignment, AS); 5817 5818 // Get the overhead of the extractelement and insertelement instructions 5819 // we might create due to scalarization. 5820 Cost += getScalarizationOverhead(I, VF); 5821 5822 // If we have a predicated store, it may not be executed for each vector 5823 // lane. Scale the cost by the probability of executing the predicated 5824 // block. 5825 if (isPredicatedInst(I)) { 5826 Cost /= getReciprocalPredBlockProb(); 5827 5828 if (useEmulatedMaskMemRefHack(I)) 5829 // Artificially setting to a high enough value to practically disable 5830 // vectorization with such operations. 5831 Cost = 3000000; 5832 } 5833 5834 return Cost; 5835 } 5836 5837 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5838 unsigned VF) { 5839 Type *ValTy = getMemInstValueType(I); 5840 Type *VectorTy = ToVectorTy(ValTy, VF); 5841 Value *Ptr = getLoadStorePointerOperand(I); 5842 unsigned AS = getLoadStoreAddressSpace(I); 5843 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5844 5845 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5846 "Stride should be 1 or -1 for consecutive memory access"); 5847 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5848 unsigned Cost = 0; 5849 if (Legal->isMaskRequired(I)) 5850 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5851 Alignment ? Alignment->value() : 0, AS); 5852 else 5853 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5854 5855 bool Reverse = ConsecutiveStride < 0; 5856 if (Reverse) 5857 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5858 return Cost; 5859 } 5860 5861 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5862 unsigned VF) { 5863 Type *ValTy = getMemInstValueType(I); 5864 Type *VectorTy = ToVectorTy(ValTy, VF); 5865 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5866 unsigned AS = getLoadStoreAddressSpace(I); 5867 if (isa<LoadInst>(I)) { 5868 return TTI.getAddressComputationCost(ValTy) + 5869 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5870 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5871 } 5872 StoreInst *SI = cast<StoreInst>(I); 5873 5874 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5875 return TTI.getAddressComputationCost(ValTy) + 5876 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5877 (isLoopInvariantStoreValue 5878 ? 0 5879 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5880 VF - 1)); 5881 } 5882 5883 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5884 unsigned VF) { 5885 Type *ValTy = getMemInstValueType(I); 5886 Type *VectorTy = ToVectorTy(ValTy, VF); 5887 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5888 Value *Ptr = getLoadStorePointerOperand(I); 5889 5890 return TTI.getAddressComputationCost(VectorTy) + 5891 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5892 Legal->isMaskRequired(I), 5893 Alignment ? 
Alignment->value() : 0, I); 5894 } 5895 5896 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5897 unsigned VF) { 5898 Type *ValTy = getMemInstValueType(I); 5899 Type *VectorTy = ToVectorTy(ValTy, VF); 5900 unsigned AS = getLoadStoreAddressSpace(I); 5901 5902 auto Group = getInterleavedAccessGroup(I); 5903 assert(Group && "Fail to get an interleaved access group."); 5904 5905 unsigned InterleaveFactor = Group->getFactor(); 5906 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5907 5908 // Holds the indices of existing members in an interleaved load group. 5909 // An interleaved store group doesn't need this as it doesn't allow gaps. 5910 SmallVector<unsigned, 4> Indices; 5911 if (isa<LoadInst>(I)) { 5912 for (unsigned i = 0; i < InterleaveFactor; i++) 5913 if (Group->getMember(i)) 5914 Indices.push_back(i); 5915 } 5916 5917 // Calculate the cost of the whole interleaved group. 5918 bool UseMaskForGaps = 5919 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5920 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5921 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5922 Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5923 5924 if (Group->isReverse()) { 5925 // TODO: Add support for reversed masked interleaved access. 5926 assert(!Legal->isMaskRequired(I) && 5927 "Reverse masked interleaved access not supported."); 5928 Cost += Group->getNumMembers() * 5929 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5930 } 5931 return Cost; 5932 } 5933 5934 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5935 unsigned VF) { 5936 // Calculate scalar cost only. Vectorization cost should be ready at this 5937 // moment. 5938 if (VF == 1) { 5939 Type *ValTy = getMemInstValueType(I); 5940 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5941 unsigned AS = getLoadStoreAddressSpace(I); 5942 5943 return TTI.getAddressComputationCost(ValTy) + 5944 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5945 } 5946 return getWideningCost(I, VF); 5947 } 5948 5949 LoopVectorizationCostModel::VectorizationCostTy 5950 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5951 // If we know that this instruction will remain uniform, check the cost of 5952 // the scalar version. 5953 if (isUniformAfterVectorization(I, VF)) 5954 VF = 1; 5955 5956 if (VF > 1 && isProfitableToScalarize(I, VF)) 5957 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5958 5959 // Forced scalars do not have any scalarization overhead. 
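// Their cost is simply VF copies of the scalar instruction cost; e.g. an
// instruction with scalar cost 1 contributes 4 at VF == 4.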
5960 auto ForcedScalar = ForcedScalars.find(VF); 5961 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5962 auto InstSet = ForcedScalar->second; 5963 if (InstSet.find(I) != InstSet.end()) 5964 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5965 } 5966 5967 Type *VectorTy; 5968 unsigned C = getInstructionCost(I, VF, VectorTy); 5969 5970 bool TypeNotScalarized = 5971 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5972 return VectorizationCostTy(C, TypeNotScalarized); 5973 } 5974 5975 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5976 unsigned VF) { 5977 5978 if (VF == 1) 5979 return 0; 5980 5981 unsigned Cost = 0; 5982 Type *RetTy = ToVectorTy(I->getType(), VF); 5983 if (!RetTy->isVoidTy() && 5984 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5985 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5986 5987 // Some targets keep addresses scalar. 5988 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5989 return Cost; 5990 5991 // Some targets support efficient element stores. 5992 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5993 return Cost; 5994 5995 // Collect operands to consider. 5996 CallInst *CI = dyn_cast<CallInst>(I); 5997 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 5998 5999 // Skip operands that do not require extraction/scalarization and do not incur 6000 // any overhead. 6001 return Cost + TTI.getOperandsScalarizationOverhead( 6002 filterExtractingOperands(Ops, VF), VF); 6003 } 6004 6005 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6006 if (VF == 1) 6007 return; 6008 NumPredStores = 0; 6009 for (BasicBlock *BB : TheLoop->blocks()) { 6010 // For each instruction in the old loop. 6011 for (Instruction &I : *BB) { 6012 Value *Ptr = getLoadStorePointerOperand(&I); 6013 if (!Ptr) 6014 continue; 6015 6016 // TODO: We should generate better code and update the cost model for 6017 // predicated uniform stores. Today they are treated as any other 6018 // predicated store (see added test cases in 6019 // invariant-store-vectorization.ll). 6020 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6021 NumPredStores++; 6022 6023 if (Legal->isUniform(Ptr) && 6024 // Conditional loads and stores should be scalarized and predicated. 6025 // isScalarWithPredication cannot be used here since masked 6026 // gather/scatters are not considered scalar with predication. 6027 !Legal->blockNeedsPredication(I.getParent())) { 6028 // TODO: Avoid replicating loads and stores instead of 6029 // relying on instcombine to remove them. 6030 // Load: Scalar load + broadcast 6031 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6032 unsigned Cost = getUniformMemOpCost(&I, VF); 6033 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6034 continue; 6035 } 6036 6037 // We assume that widening is the best solution when possible. 6038 if (memoryInstructionCanBeWidened(&I, VF)) { 6039 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6040 int ConsecutiveStride = 6041 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6042 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6043 "Expected consecutive stride."); 6044 InstWidening Decision = 6045 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6046 setWideningDecision(&I, VF, Decision, Cost); 6047 continue; 6048 } 6049 6050 // Choose between Interleaving, Gather/Scatter or Scalarization. 
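// The cheapest of the three alternatives computed below wins: a tie between
// interleaving and gather/scatter goes to interleaving, while a tie with
// scalarization goes to scalarization.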
6051 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6052 unsigned NumAccesses = 1; 6053 if (isAccessInterleaved(&I)) { 6054 auto Group = getInterleavedAccessGroup(&I); 6055 assert(Group && "Fail to get an interleaved access group."); 6056 6057 // Make one decision for the whole group. 6058 if (getWideningDecision(&I, VF) != CM_Unknown) 6059 continue; 6060 6061 NumAccesses = Group->getNumMembers(); 6062 if (interleavedAccessCanBeWidened(&I, VF)) 6063 InterleaveCost = getInterleaveGroupCost(&I, VF); 6064 } 6065 6066 unsigned GatherScatterCost = 6067 isLegalGatherOrScatter(&I) 6068 ? getGatherScatterCost(&I, VF) * NumAccesses 6069 : std::numeric_limits<unsigned>::max(); 6070 6071 unsigned ScalarizationCost = 6072 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6073 6074 // Choose better solution for the current VF, 6075 // write down this decision and use it during vectorization. 6076 unsigned Cost; 6077 InstWidening Decision; 6078 if (InterleaveCost <= GatherScatterCost && 6079 InterleaveCost < ScalarizationCost) { 6080 Decision = CM_Interleave; 6081 Cost = InterleaveCost; 6082 } else if (GatherScatterCost < ScalarizationCost) { 6083 Decision = CM_GatherScatter; 6084 Cost = GatherScatterCost; 6085 } else { 6086 Decision = CM_Scalarize; 6087 Cost = ScalarizationCost; 6088 } 6089 // If the instructions belongs to an interleave group, the whole group 6090 // receives the same decision. The whole group receives the cost, but 6091 // the cost will actually be assigned to one instruction. 6092 if (auto Group = getInterleavedAccessGroup(&I)) 6093 setWideningDecision(Group, VF, Decision, Cost); 6094 else 6095 setWideningDecision(&I, VF, Decision, Cost); 6096 } 6097 } 6098 6099 // Make sure that any load of address and any other address computation 6100 // remains scalar unless there is gather/scatter support. This avoids 6101 // inevitable extracts into address registers, and also has the benefit of 6102 // activating LSR more, since that pass can't optimize vectorized 6103 // addresses. 6104 if (TTI.prefersVectorizedAddressing()) 6105 return; 6106 6107 // Start with all scalar pointer uses. 6108 SmallPtrSet<Instruction *, 8> AddrDefs; 6109 for (BasicBlock *BB : TheLoop->blocks()) 6110 for (Instruction &I : *BB) { 6111 Instruction *PtrDef = 6112 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6113 if (PtrDef && TheLoop->contains(PtrDef) && 6114 getWideningDecision(&I, VF) != CM_GatherScatter) 6115 AddrDefs.insert(PtrDef); 6116 } 6117 6118 // Add all instructions used to generate the addresses. 6119 SmallVector<Instruction *, 4> Worklist; 6120 for (auto *I : AddrDefs) 6121 Worklist.push_back(I); 6122 while (!Worklist.empty()) { 6123 Instruction *I = Worklist.pop_back_val(); 6124 for (auto &Op : I->operands()) 6125 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6126 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6127 AddrDefs.insert(InstOp).second) 6128 Worklist.push_back(InstOp); 6129 } 6130 6131 for (auto *I : AddrDefs) { 6132 if (isa<LoadInst>(I)) { 6133 // Setting the desired widening decision should ideally be handled in 6134 // by cost functions, but since this involves the task of finding out 6135 // if the loaded register is involved in an address computation, it is 6136 // instead changed here when we know this is the case. 6137 InstWidening Decision = getWideningDecision(I, VF); 6138 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6139 // Scalarize a widened load of address. 
6140 setWideningDecision(I, VF, CM_Scalarize, 6141 (VF * getMemoryInstructionCost(I, 1))); 6142 else if (auto Group = getInterleavedAccessGroup(I)) { 6143 // Scalarize an interleave group of address loads. 6144 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6145 if (Instruction *Member = Group->getMember(I)) 6146 setWideningDecision(Member, VF, CM_Scalarize, 6147 (VF * getMemoryInstructionCost(Member, 1))); 6148 } 6149 } 6150 } else 6151 // Make sure I gets scalarized and a cost estimate without 6152 // scalarization overhead. 6153 ForcedScalars[VF].insert(I); 6154 } 6155 } 6156 6157 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6158 unsigned VF, 6159 Type *&VectorTy) { 6160 Type *RetTy = I->getType(); 6161 if (canTruncateToMinimalBitwidth(I, VF)) 6162 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6163 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6164 auto SE = PSE.getSE(); 6165 6166 // TODO: We need to estimate the cost of intrinsic calls. 6167 switch (I->getOpcode()) { 6168 case Instruction::GetElementPtr: 6169 // We mark this instruction as zero-cost because the cost of GEPs in 6170 // vectorized code depends on whether the corresponding memory instruction 6171 // is scalarized or not. Therefore, we handle GEPs with the memory 6172 // instruction cost. 6173 return 0; 6174 case Instruction::Br: { 6175 // In cases of scalarized and predicated instructions, there will be VF 6176 // predicated blocks in the vectorized loop. Each branch around these 6177 // blocks requires also an extract of its vector compare i1 element. 6178 bool ScalarPredicatedBB = false; 6179 BranchInst *BI = cast<BranchInst>(I); 6180 if (VF > 1 && BI->isConditional() && 6181 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6182 PredicatedBBsAfterVectorization.end() || 6183 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6184 PredicatedBBsAfterVectorization.end())) 6185 ScalarPredicatedBB = true; 6186 6187 if (ScalarPredicatedBB) { 6188 // Return cost for branches around scalarized and predicated blocks. 6189 Type *Vec_i1Ty = 6190 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6191 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6192 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6193 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6194 // The back-edge branch will remain, as will all scalar branches. 6195 return TTI.getCFInstrCost(Instruction::Br); 6196 else 6197 // This branch will be eliminated by if-conversion. 6198 return 0; 6199 // Note: We currently assume zero cost for an unconditional branch inside 6200 // a predicated block since it will become a fall-through, although we 6201 // may decide in the future to call TTI for all branches. 6202 } 6203 case Instruction::PHI: { 6204 auto *Phi = cast<PHINode>(I); 6205 6206 // First-order recurrences are replaced by vector shuffles inside the loop. 6207 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6208 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6209 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6210 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6211 6212 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6213 // converted into select instructions. We require N - 1 selects per phi 6214 // node, where N is the number of incoming values. 
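// For example, a phi with three incoming values becomes two vector selects,
// so it is charged twice the cost of a select at this VF.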
6215 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6216 return (Phi->getNumIncomingValues() - 1) * 6217 TTI.getCmpSelInstrCost( 6218 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6219 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6220 6221 return TTI.getCFInstrCost(Instruction::PHI); 6222 } 6223 case Instruction::UDiv: 6224 case Instruction::SDiv: 6225 case Instruction::URem: 6226 case Instruction::SRem: 6227 // If we have a predicated instruction, it may not be executed for each 6228 // vector lane. Get the scalarization cost and scale this amount by the 6229 // probability of executing the predicated block. If the instruction is not 6230 // predicated, we fall through to the next case. 6231 if (VF > 1 && isScalarWithPredication(I)) { 6232 unsigned Cost = 0; 6233 6234 // These instructions have a non-void type, so account for the phi nodes 6235 // that we will create. This cost is likely to be zero. The phi node 6236 // cost, if any, should be scaled by the block probability because it 6237 // models a copy at the end of each predicated block. 6238 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6239 6240 // The cost of the non-predicated instruction. 6241 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6242 6243 // The cost of insertelement and extractelement instructions needed for 6244 // scalarization. 6245 Cost += getScalarizationOverhead(I, VF); 6246 6247 // Scale the cost by the probability of executing the predicated blocks. 6248 // This assumes the predicated block for each vector lane is equally 6249 // likely. 6250 return Cost / getReciprocalPredBlockProb(); 6251 } 6252 LLVM_FALLTHROUGH; 6253 case Instruction::Add: 6254 case Instruction::FAdd: 6255 case Instruction::Sub: 6256 case Instruction::FSub: 6257 case Instruction::Mul: 6258 case Instruction::FMul: 6259 case Instruction::FDiv: 6260 case Instruction::FRem: 6261 case Instruction::Shl: 6262 case Instruction::LShr: 6263 case Instruction::AShr: 6264 case Instruction::And: 6265 case Instruction::Or: 6266 case Instruction::Xor: { 6267 // Since we will replace the stride by 1 the multiplication should go away. 6268 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6269 return 0; 6270 // Certain instructions can be cheaper to vectorize if they have a constant 6271 // second vector operand. One example of this are shifts on x86. 6272 Value *Op2 = I->getOperand(1); 6273 TargetTransformInfo::OperandValueProperties Op2VP; 6274 TargetTransformInfo::OperandValueKind Op2VK = 6275 TTI.getOperandInfo(Op2, Op2VP); 6276 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6277 Op2VK = TargetTransformInfo::OK_UniformValue; 6278 6279 SmallVector<const Value *, 4> Operands(I->operand_values()); 6280 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6281 return N * TTI.getArithmeticInstrCost( 6282 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6283 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6284 } 6285 case Instruction::FNeg: { 6286 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6287 return N * TTI.getArithmeticInstrCost( 6288 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6289 TargetTransformInfo::OK_AnyValue, 6290 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6291 I->getOperand(0), I); 6292 } 6293 case Instruction::Select: { 6294 SelectInst *SI = cast<SelectInst>(I); 6295 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6296 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6297 Type *CondTy = SI->getCondition()->getType(); 6298 if (!ScalarCond) 6299 CondTy = VectorType::get(CondTy, VF); 6300 6301 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6302 } 6303 case Instruction::ICmp: 6304 case Instruction::FCmp: { 6305 Type *ValTy = I->getOperand(0)->getType(); 6306 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6307 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6308 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6309 VectorTy = ToVectorTy(ValTy, VF); 6310 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6311 } 6312 case Instruction::Store: 6313 case Instruction::Load: { 6314 unsigned Width = VF; 6315 if (Width > 1) { 6316 InstWidening Decision = getWideningDecision(I, Width); 6317 assert(Decision != CM_Unknown && 6318 "CM decision should be taken at this point"); 6319 if (Decision == CM_Scalarize) 6320 Width = 1; 6321 } 6322 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6323 return getMemoryInstructionCost(I, VF); 6324 } 6325 case Instruction::ZExt: 6326 case Instruction::SExt: 6327 case Instruction::FPToUI: 6328 case Instruction::FPToSI: 6329 case Instruction::FPExt: 6330 case Instruction::PtrToInt: 6331 case Instruction::IntToPtr: 6332 case Instruction::SIToFP: 6333 case Instruction::UIToFP: 6334 case Instruction::Trunc: 6335 case Instruction::FPTrunc: 6336 case Instruction::BitCast: { 6337 // We optimize the truncation of induction variables having constant 6338 // integer steps. The cost of these truncations is the same as the scalar 6339 // operation. 6340 if (isOptimizableIVTruncate(I, VF)) { 6341 auto *Trunc = cast<TruncInst>(I); 6342 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6343 Trunc->getSrcTy(), Trunc); 6344 } 6345 6346 Type *SrcScalarTy = I->getOperand(0)->getType(); 6347 Type *SrcVecTy = 6348 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6349 if (canTruncateToMinimalBitwidth(I, VF)) { 6350 // This cast is going to be shrunk. This may remove the cast or it might 6351 // turn it into slightly different cast. For example, if MinBW == 16, 6352 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6353 // 6354 // Calculate the modified src and dest types. 6355 Type *MinVecTy = VectorTy; 6356 if (I->getOpcode() == Instruction::Trunc) { 6357 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6358 VectorTy = 6359 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6360 } else if (I->getOpcode() == Instruction::ZExt || 6361 I->getOpcode() == Instruction::SExt) { 6362 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6363 VectorTy = 6364 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6365 } 6366 } 6367 6368 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6369 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6370 } 6371 case Instruction::Call: { 6372 bool NeedToScalarize; 6373 CallInst *CI = cast<CallInst>(I); 6374 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6375 if (getVectorIntrinsicIDForCall(CI, TLI)) 6376 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6377 return CallCost; 6378 } 6379 default: 6380 // The cost of executing VF copies of the scalar instruction. This opcode 6381 // is unknown. Assume that it is the same as 'mul'. 6382 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6383 getScalarizationOverhead(I, VF); 6384 } // end of switch. 6385 } 6386 6387 char LoopVectorize::ID = 0; 6388 6389 static const char lv_name[] = "Loop Vectorization"; 6390 6391 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6392 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6393 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6394 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6395 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6396 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6397 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6398 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6399 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6400 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6401 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6402 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6403 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6404 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6405 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6406 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6407 6408 namespace llvm { 6409 6410 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6411 6412 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6413 bool VectorizeOnlyWhenForced) { 6414 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6415 } 6416 6417 } // end namespace llvm 6418 6419 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6420 // Check if the pointer operand of a load or store instruction is 6421 // consecutive. 6422 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6423 return Legal->isConsecutivePtr(Ptr); 6424 return false; 6425 } 6426 6427 void LoopVectorizationCostModel::collectValuesToIgnore() { 6428 // Ignore ephemeral values. 6429 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6430 6431 // Ignore type-promoting instructions we identified during reduction 6432 // detection. 6433 for (auto &Reduction : Legal->getReductionVars()) { 6434 RecurrenceDescriptor &RedDes = Reduction.second; 6435 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6436 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6437 } 6438 // Ignore type-casting instructions we identified during induction 6439 // detection. 6440 for (auto &Induction : Legal->getInductionVars()) { 6441 InductionDescriptor &IndDes = Induction.second; 6442 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6443 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6444 } 6445 } 6446 6447 // TODO: we could return a pair of values that specify the max VF and 6448 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6449 // `buildVPlans(VF, VF)`. 
We cannot do it because VPLAN at the moment 6450 // doesn't have a cost model that can choose which plan to execute if 6451 // more than one is generated. 6452 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6453 LoopVectorizationCostModel &CM) { 6454 unsigned WidestType; 6455 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6456 return WidestVectorRegBits / WidestType; 6457 } 6458 6459 VectorizationFactor 6460 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6461 unsigned VF = UserVF; 6462 // Outer loop handling: They may require CFG and instruction level 6463 // transformations before even evaluating whether vectorization is profitable. 6464 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6465 // the vectorization pipeline. 6466 if (!OrigLoop->empty()) { 6467 // If the user doesn't provide a vectorization factor, determine a 6468 // reasonable one. 6469 if (!UserVF) { 6470 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6471 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6472 6473 // Make sure we have a VF > 1 for stress testing. 6474 if (VPlanBuildStressTest && VF < 2) { 6475 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6476 << "overriding computed VF.\n"); 6477 VF = 4; 6478 } 6479 } 6480 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6481 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6482 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6483 << " to build VPlans.\n"); 6484 buildVPlans(VF, VF); 6485 6486 // For VPlan build stress testing, we bail out after VPlan construction. 6487 if (VPlanBuildStressTest) 6488 return VectorizationFactor::Disabled(); 6489 6490 return {VF, 0}; 6491 } 6492 6493 LLVM_DEBUG( 6494 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6495 "VPlan-native path.\n"); 6496 return VectorizationFactor::Disabled(); 6497 } 6498 6499 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { 6500 assert(OrigLoop->empty() && "Inner loop expected."); 6501 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); 6502 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6503 return None; 6504 6505 // Invalidate interleave groups if all blocks of loop will be predicated. 6506 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6507 !useMaskedInterleavedAccesses(*TTI)) { 6508 LLVM_DEBUG( 6509 dbgs() 6510 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6511 "which requires masked-interleaved support.\n"); 6512 CM.InterleaveInfo.reset(); 6513 } 6514 6515 if (UserVF) { 6516 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6517 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6518 // Collect the instructions (and their associated costs) that will be more 6519 // profitable to scalarize. 6520 CM.selectUserVectorizationFactor(UserVF); 6521 buildVPlansWithVPRecipes(UserVF, UserVF); 6522 LLVM_DEBUG(printPlans(dbgs())); 6523 return {{UserVF, 0}}; 6524 } 6525 6526 unsigned MaxVF = MaybeMaxVF.getValue(); 6527 assert(MaxVF != 0 && "MaxVF is zero."); 6528 6529 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6530 // Collect Uniform and Scalar instructions after vectorization with VF. 6531 CM.collectUniformsAndScalars(VF); 6532 6533 // Collect the instructions (and their associated costs) that will be more 6534 // profitable to scalarize. 
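// (This analysis is only meaningful for vector VFs, hence the VF > 1 guard
// below. With MaxVF == 8, for instance, the surrounding loop visits
// VF = 1, 2, 4 and 8 before the VPlans covering the whole range are built.)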
6535 if (VF > 1) 6536 CM.collectInstsToScalarize(VF); 6537 } 6538 6539 buildVPlansWithVPRecipes(1, MaxVF); 6540 LLVM_DEBUG(printPlans(dbgs())); 6541 if (MaxVF == 1) 6542 return VectorizationFactor::Disabled(); 6543 6544 // Select the optimal vectorization factor. 6545 return CM.selectVectorizationFactor(MaxVF); 6546 } 6547 6548 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6549 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6550 << '\n'); 6551 BestVF = VF; 6552 BestUF = UF; 6553 6554 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6555 return !Plan->hasVF(VF); 6556 }); 6557 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6558 } 6559 6560 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6561 DominatorTree *DT) { 6562 // Perform the actual loop transformation. 6563 6564 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6565 VPCallbackILV CallbackILV(ILV); 6566 6567 VPTransformState State{BestVF, BestUF, LI, 6568 DT, ILV.Builder, ILV.VectorLoopValueMap, 6569 &ILV, CallbackILV}; 6570 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6571 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6572 6573 //===------------------------------------------------===// 6574 // 6575 // Notice: any optimization or new instruction that go 6576 // into the code below should also be implemented in 6577 // the cost-model. 6578 // 6579 //===------------------------------------------------===// 6580 6581 // 2. Copy and widen instructions from the old loop into the new loop. 6582 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6583 VPlans.front()->execute(&State); 6584 6585 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6586 // predication, updating analyses. 6587 ILV.fixVectorizedLoop(); 6588 } 6589 6590 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6591 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6592 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6593 6594 // We create new control-flow for the vectorized loop, so the original 6595 // condition will be dead after vectorization if it's only used by the 6596 // branch. 6597 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6598 if (Cmp && Cmp->hasOneUse()) 6599 DeadInstructions.insert(Cmp); 6600 6601 // We create new "steps" for induction variable updates to which the original 6602 // induction variables map. An original update instruction will be dead if 6603 // all its users except the induction variable are dead. 6604 for (auto &Induction : Legal->getInductionVars()) { 6605 PHINode *Ind = Induction.first; 6606 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6607 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6608 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6609 DeadInstructions.end(); 6610 })) 6611 DeadInstructions.insert(IndUpdate); 6612 6613 // We record as "Dead" also the type-casting instructions we had identified 6614 // during induction analysis. We don't need any handling for them in the 6615 // vectorized loop because we have proven that, under a proper runtime 6616 // test guarding the vectorized loop, the value of the phi, and the casted 6617 // value of the phi, are the same. The last instruction in this casting chain 6618 // will get its scalar/vector/widened def from the scalar/vector/widened def 6619 // of the respective phi node. 
Any other casts in the induction def-use chain 6620 // have no other uses outside the phi update chain, and will be ignored. 6621 InductionDescriptor &IndDes = Induction.second; 6622 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6623 DeadInstructions.insert(Casts.begin(), Casts.end()); 6624 } 6625 } 6626 6627 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6628 6629 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6630 6631 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6632 Instruction::BinaryOps BinOp) { 6633 // When unrolling and the VF is 1, we only need to add a simple scalar. 6634 Type *Ty = Val->getType(); 6635 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6636 6637 if (Ty->isFloatingPointTy()) { 6638 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6639 6640 // Floating point operations had to be 'fast' to enable the unrolling. 6641 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6642 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6643 } 6644 Constant *C = ConstantInt::get(Ty, StartIdx); 6645 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6646 } 6647 6648 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6649 SmallVector<Metadata *, 4> MDs; 6650 // Reserve first location for self reference to the LoopID metadata node. 6651 MDs.push_back(nullptr); 6652 bool IsUnrollMetadata = false; 6653 MDNode *LoopID = L->getLoopID(); 6654 if (LoopID) { 6655 // First find existing loop unrolling disable metadata. 6656 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6657 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6658 if (MD) { 6659 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6660 IsUnrollMetadata = 6661 S && S->getString().startswith("llvm.loop.unroll.disable"); 6662 } 6663 MDs.push_back(LoopID->getOperand(i)); 6664 } 6665 } 6666 6667 if (!IsUnrollMetadata) { 6668 // Add runtime unroll disable metadata. 6669 LLVMContext &Context = L->getHeader()->getContext(); 6670 SmallVector<Metadata *, 1> DisableOperands; 6671 DisableOperands.push_back( 6672 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6673 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6674 MDs.push_back(DisableNode); 6675 MDNode *NewLoopID = MDNode::get(Context, MDs); 6676 // Set operand 0 to refer to the loop id itself. 6677 NewLoopID->replaceOperandWith(0, NewLoopID); 6678 L->setLoopID(NewLoopID); 6679 } 6680 } 6681 6682 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6683 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6684 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6685 bool PredicateAtRangeStart = Predicate(Range.Start); 6686 6687 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6688 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6689 Range.End = TmpVF; 6690 break; 6691 } 6692 6693 return PredicateAtRangeStart; 6694 } 6695 6696 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6697 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6698 /// of VF's starting at a given VF and extending it as much as possible. Each 6699 /// vectorization decision can potentially shorten this sub-range during 6700 /// buildVPlan(). 
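/// For example (illustrative), with MinVF = 1 and MaxVF = 8 the first sub-range
/// may cover only {1} if some decision changes between VF 1 and VF 2, the next
/// one {2, 4}, and the last one {8}; together the resulting VPlans cover the
/// whole range of feasible VFs.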
6701 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6702 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6703 VFRange SubRange = {VF, MaxVF + 1}; 6704 VPlans.push_back(buildVPlan(SubRange)); 6705 VF = SubRange.End; 6706 } 6707 } 6708 6709 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6710 VPlanPtr &Plan) { 6711 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6712 6713 // Look for cached value. 6714 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6715 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6716 if (ECEntryIt != EdgeMaskCache.end()) 6717 return ECEntryIt->second; 6718 6719 VPValue *SrcMask = createBlockInMask(Src, Plan); 6720 6721 // The terminator has to be a branch inst! 6722 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6723 assert(BI && "Unexpected terminator found"); 6724 6725 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6726 return EdgeMaskCache[Edge] = SrcMask; 6727 6728 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6729 assert(EdgeMask && "No Edge Mask found for condition"); 6730 6731 if (BI->getSuccessor(0) != Dst) 6732 EdgeMask = Builder.createNot(EdgeMask); 6733 6734 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6735 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6736 6737 return EdgeMaskCache[Edge] = EdgeMask; 6738 } 6739 6740 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6741 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6742 6743 // Look for cached value. 6744 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6745 if (BCEntryIt != BlockMaskCache.end()) 6746 return BCEntryIt->second; 6747 6748 // All-one mask is modelled as no-mask following the convention for masked 6749 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6750 VPValue *BlockMask = nullptr; 6751 6752 if (OrigLoop->getHeader() == BB) { 6753 if (!CM.blockNeedsPredication(BB)) 6754 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6755 6756 // Introduce the early-exit compare IV <= BTC to form header block mask. 6757 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6758 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6759 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6760 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6761 return BlockMaskCache[BB] = BlockMask; 6762 } 6763 6764 // This is the block mask. We OR all incoming edges. 6765 for (auto *Predecessor : predecessors(BB)) { 6766 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6767 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6768 return BlockMaskCache[BB] = EdgeMask; 6769 6770 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
6771 BlockMask = EdgeMask; 6772 continue; 6773 } 6774 6775 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6776 } 6777 6778 return BlockMaskCache[BB] = BlockMask; 6779 } 6780 6781 VPWidenMemoryInstructionRecipe * 6782 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6783 VPlanPtr &Plan) { 6784 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6785 return nullptr; 6786 6787 auto willWiden = [&](unsigned VF) -> bool { 6788 if (VF == 1) 6789 return false; 6790 LoopVectorizationCostModel::InstWidening Decision = 6791 CM.getWideningDecision(I, VF); 6792 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6793 "CM decision should be taken at this point."); 6794 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6795 return true; 6796 if (CM.isScalarAfterVectorization(I, VF) || 6797 CM.isProfitableToScalarize(I, VF)) 6798 return false; 6799 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6800 }; 6801 6802 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6803 return nullptr; 6804 6805 VPValue *Mask = nullptr; 6806 if (Legal->isMaskRequired(I)) 6807 Mask = createBlockInMask(I->getParent(), Plan); 6808 6809 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6810 return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask); 6811 } 6812 6813 VPWidenIntOrFpInductionRecipe * 6814 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6815 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6816 // Check if this is an integer or fp induction. If so, build the recipe that 6817 // produces its scalar and vector values. 6818 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6819 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6820 II.getKind() == InductionDescriptor::IK_FpInduction) 6821 return new VPWidenIntOrFpInductionRecipe(Phi); 6822 6823 return nullptr; 6824 } 6825 6826 // Optimize the special case where the source is a constant integer 6827 // induction variable. Notice that we can only optimize the 'trunc' case 6828 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6829 // (c) other casts depend on pointer size. 6830 6831 // Determine whether \p K is a truncation based on an induction variable that 6832 // can be optimized. 6833 auto isOptimizableIVTruncate = 6834 [&](Instruction *K) -> std::function<bool(unsigned)> { 6835 return 6836 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6837 }; 6838 6839 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6840 isOptimizableIVTruncate(I), Range)) 6841 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6842 cast<TruncInst>(I)); 6843 return nullptr; 6844 } 6845 6846 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6847 PHINode *Phi = dyn_cast<PHINode>(I); 6848 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6849 return nullptr; 6850 6851 // We know that all PHIs in non-header blocks are converted into selects, so 6852 // we don't have to worry about the insertion order and we can just use the 6853 // builder. At this point we generate the predication tree. There may be 6854 // duplications since this is a simple recursive scan, but future 6855 // optimizations will clean it up. 
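// For example (illustrative), a phi merging %a from predecessor A and %b from
// predecessor B becomes a blend of %a under A's edge mask and %b under B's
// edge mask. The masks are collected below; a missing (all-one) edge mask is
// only tolerated when the phi has a single incoming value.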
6856 6857 SmallVector<VPValue *, 2> Masks; 6858 unsigned NumIncoming = Phi->getNumIncomingValues(); 6859 for (unsigned In = 0; In < NumIncoming; In++) { 6860 VPValue *EdgeMask = 6861 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6862 assert((EdgeMask || NumIncoming == 1) && 6863 "Multiple predecessors with one having a full mask"); 6864 if (EdgeMask) 6865 Masks.push_back(EdgeMask); 6866 } 6867 return new VPBlendRecipe(Phi, Masks); 6868 } 6869 6870 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6871 VFRange &Range) { 6872 6873 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6874 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6875 6876 if (IsPredicated) 6877 return false; 6878 6879 auto IsVectorizableOpcode = [](unsigned Opcode) { 6880 switch (Opcode) { 6881 case Instruction::Add: 6882 case Instruction::And: 6883 case Instruction::AShr: 6884 case Instruction::BitCast: 6885 case Instruction::Br: 6886 case Instruction::Call: 6887 case Instruction::FAdd: 6888 case Instruction::FCmp: 6889 case Instruction::FDiv: 6890 case Instruction::FMul: 6891 case Instruction::FNeg: 6892 case Instruction::FPExt: 6893 case Instruction::FPToSI: 6894 case Instruction::FPToUI: 6895 case Instruction::FPTrunc: 6896 case Instruction::FRem: 6897 case Instruction::FSub: 6898 case Instruction::ICmp: 6899 case Instruction::IntToPtr: 6900 case Instruction::Load: 6901 case Instruction::LShr: 6902 case Instruction::Mul: 6903 case Instruction::Or: 6904 case Instruction::PHI: 6905 case Instruction::PtrToInt: 6906 case Instruction::SDiv: 6907 case Instruction::Select: 6908 case Instruction::SExt: 6909 case Instruction::Shl: 6910 case Instruction::SIToFP: 6911 case Instruction::SRem: 6912 case Instruction::Store: 6913 case Instruction::Sub: 6914 case Instruction::Trunc: 6915 case Instruction::UDiv: 6916 case Instruction::UIToFP: 6917 case Instruction::URem: 6918 case Instruction::Xor: 6919 case Instruction::ZExt: 6920 return true; 6921 } 6922 return false; 6923 }; 6924 6925 if (!IsVectorizableOpcode(I->getOpcode())) 6926 return false; 6927 6928 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6929 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6930 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6931 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6932 return false; 6933 } 6934 6935 auto willWiden = [&](unsigned VF) -> bool { 6936 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6937 CM.isProfitableToScalarize(I, VF))) 6938 return false; 6939 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6940 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6941 // The following case may be scalarized depending on the VF. 6942 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6943 // version of the instruction. 6944 // Is it beneficial to perform intrinsic call compared to lib call? 
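// UseVectorIntrinsic below is true when a vector intrinsic exists for this
// call and its cost does not exceed the widened call cost computed by
// getVectorCallCost.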
6945 bool NeedToScalarize; 6946 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6947 bool UseVectorIntrinsic = 6948 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6949 return UseVectorIntrinsic || !NeedToScalarize; 6950 } 6951 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6952 assert(CM.getWideningDecision(I, VF) == 6953 LoopVectorizationCostModel::CM_Scalarize && 6954 "Memory widening decisions should have been taken care by now"); 6955 return false; 6956 } 6957 return true; 6958 }; 6959 6960 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6961 return false; 6962 // If this ingredient's recipe is to be recorded, keep its recipe a singleton 6963 // to avoid having to split recipes later. 6964 bool IsSingleton = Ingredient2Recipe.count(I); 6965 6966 // Success: widen this instruction. 6967 6968 // Use the default widening recipe. We optimize the common case where 6969 // consecutive instructions can be represented by a single recipe. 6970 if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && 6971 LastExtensibleRecipe->appendInstruction(I)) 6972 return true; 6973 6974 VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); 6975 if (!IsSingleton) 6976 LastExtensibleRecipe = WidenRecipe; 6977 setRecipe(I, WidenRecipe); 6978 VPBB->appendRecipe(WidenRecipe); 6979 return true; 6980 } 6981 6982 VPBasicBlock *VPRecipeBuilder::handleReplication( 6983 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 6984 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 6985 VPlanPtr &Plan) { 6986 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 6987 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 6988 Range); 6989 6990 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6991 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6992 6993 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 6994 setRecipe(I, Recipe); 6995 6996 // Find if I uses a predicated instruction. If so, it will use its scalar 6997 // value. Avoid hoisting the insert-element which packs the scalar value into 6998 // a vector value, as that happens iff all users use the vector value. 6999 for (auto &Op : I->operands()) 7000 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7001 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7002 PredInst2Recipe[PredInst]->setAlsoPack(false); 7003 7004 // Finalize the recipe for Instr, first if it is not predicated. 7005 if (!IsPredicated) { 7006 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7007 VPBB->appendRecipe(Recipe); 7008 return VPBB; 7009 } 7010 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7011 assert(VPBB->getSuccessors().empty() && 7012 "VPBB has successors when handling predicated replication."); 7013 // Record predicated instructions for above packing optimizations. 7014 PredInst2Recipe[I] = Recipe; 7015 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7016 VPBlockUtils::insertBlockAfter(Region, VPBB); 7017 auto *RegSucc = new VPBasicBlock(); 7018 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7019 return RegSucc; 7020 } 7021 7022 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7023 VPRecipeBase *PredRecipe, 7024 VPlanPtr &Plan) { 7025 // Instructions marked for predication are replicated and placed under an 7026 // if-then construct to prevent side-effects. 7027 7028 // Generate recipes to compute the block mask for this region. 
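  // The resulting single-entry single-exit region has roughly this shape
  // (block names shown for a predicated store; sketch only):
  //
  //   pred.store.entry:    BRANCH-ON-MASK
  //   pred.store.if:       the replicated, predicated instruction
  //   pred.store.continue: PHI merging the predicated value, if one is needed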
7029 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7030 7031 // Build the triangular if-then region. 7032 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7033 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7034 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7035 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7036 auto *PHIRecipe = 7037 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7038 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7039 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7040 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7041 7042 // Note: first set Entry as region entry and then connect successors starting 7043 // from it in order, to propagate the "parent" of each VPBasicBlock. 7044 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7045 VPBlockUtils::connectBlocks(Pred, Exit); 7046 7047 return Region; 7048 } 7049 7050 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 7051 VPlanPtr &Plan, VPBasicBlock *VPBB) { 7052 VPRecipeBase *Recipe = nullptr; 7053 7054 // First, check for specific widening recipes that deal with memory 7055 // operations, inductions and Phi nodes. 7056 if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || 7057 (Recipe = tryToOptimizeInduction(Instr, Range)) || 7058 (Recipe = tryToBlend(Instr, Plan)) || 7059 (isa<PHINode>(Instr) && 7060 (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { 7061 setRecipe(Instr, Recipe); 7062 VPBB->appendRecipe(Recipe); 7063 return true; 7064 } 7065 7066 // Handle GEP widening. 7067 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { 7068 auto Scalarize = [&](unsigned VF) { 7069 return CM.isScalarWithPredication(Instr, VF) || 7070 CM.isScalarAfterVectorization(Instr, VF) || 7071 CM.isProfitableToScalarize(Instr, VF); 7072 }; 7073 if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) 7074 return false; 7075 VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); 7076 setRecipe(Instr, Recipe); 7077 VPBB->appendRecipe(Recipe); 7078 return true; 7079 } 7080 7081 // Check if Instr is to be widened by a general VPWidenRecipe, after 7082 // having first checked for specific widening recipes. 7083 if (tryToWiden(Instr, VPBB, Range)) 7084 return true; 7085 7086 return false; 7087 } 7088 7089 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7090 unsigned MaxVF) { 7091 assert(OrigLoop->empty() && "Inner loop expected."); 7092 7093 // Collect conditions feeding internal conditional branches; they need to be 7094 // represented in VPlan for it to model masking. 7095 SmallPtrSet<Value *, 1> NeedDef; 7096 7097 auto *Latch = OrigLoop->getLoopLatch(); 7098 for (BasicBlock *BB : OrigLoop->blocks()) { 7099 if (BB == Latch) 7100 continue; 7101 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7102 if (Branch && Branch->isConditional()) 7103 NeedDef.insert(Branch->getCondition()); 7104 } 7105 7106 // If the tail is to be folded by masking, the primary induction variable 7107 // needs to be represented in VPlan for it to model early-exit masking. 7108 // Also, both the Phi and the live-out instruction of each reduction are 7109 // required in order to introduce a select between them in VPlan. 
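  // (Sketch: under tail folding, a reduction update such as
  //    %red.next = add i32 %red.phi, %x
  //  is wrapped at the latch in
  //    select <header block-in mask>, %red.next, %red.phi
  //  so masked-off lanes keep the previous value; see the corresponding code
  //  near the end of buildVPlanWithVPRecipes.)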
  if (CM.foldTailByMasking()) {
    NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
                                             DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
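  // Illustration (assuming a factor-2 group of strided loads such as
  //   ... = A[2*i]; ... = A[2*i+1];
  // ): both members are recorded here so that their individual widened-memory
  // recipes can later be replaced by one VPInterleaveRecipe, which emits a
  // single wide load plus shuffles.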
7177 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7178 auto applyIG = [IG, this](unsigned VF) -> bool { 7179 return (VF >= 2 && // Query is illegal for VF == 1 7180 CM.getWideningDecision(IG->getInsertPos(), VF) == 7181 LoopVectorizationCostModel::CM_Interleave); 7182 }; 7183 if (!getDecisionAndClampRange(applyIG, Range)) 7184 continue; 7185 InterleaveGroups.insert(IG); 7186 for (unsigned i = 0; i < IG->getFactor(); i++) 7187 if (Instruction *Member = IG->getMember(i)) 7188 RecipeBuilder.recordRecipeOf(Member); 7189 }; 7190 7191 // --------------------------------------------------------------------------- 7192 // Build initial VPlan: Scan the body of the loop in a topological order to 7193 // visit each basic block after having visited its predecessor basic blocks. 7194 // --------------------------------------------------------------------------- 7195 7196 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7197 auto Plan = std::make_unique<VPlan>(); 7198 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7199 Plan->setEntry(VPBB); 7200 7201 // Represent values that will have defs inside VPlan. 7202 for (Value *V : NeedDef) 7203 Plan->addVPValue(V); 7204 7205 // Scan the body of the loop in a topological order to visit each basic block 7206 // after having visited its predecessor basic blocks. 7207 LoopBlocksDFS DFS(OrigLoop); 7208 DFS.perform(LI); 7209 7210 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7211 // Relevant instructions from basic block BB will be grouped into VPRecipe 7212 // ingredients and fill a new VPBasicBlock. 7213 unsigned VPBBsForBB = 0; 7214 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7215 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7216 VPBB = FirstVPBBForBB; 7217 Builder.setInsertPoint(VPBB); 7218 7219 // Introduce each ingredient into VPlan. 7220 for (Instruction &I : BB->instructionsWithoutDebug()) { 7221 Instruction *Instr = &I; 7222 7223 // First filter out irrelevant instructions, to ensure no recipes are 7224 // built for them. 7225 if (isa<BranchInst>(Instr) || 7226 DeadInstructions.find(Instr) != DeadInstructions.end()) 7227 continue; 7228 7229 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7230 continue; 7231 7232 // Otherwise, if all widening options failed, Instruction is to be 7233 // replicated. This may create a successor for VPBB. 7234 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7235 Instr, Range, VPBB, PredInst2Recipe, Plan); 7236 if (NextVPBB != VPBB) { 7237 VPBB = NextVPBB; 7238 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7239 : ""); 7240 } 7241 } 7242 } 7243 7244 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7245 // may also be empty, such as the last one VPBB, reflecting original 7246 // basic-blocks with no recipes. 7247 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7248 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7249 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7250 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7251 delete PreEntry; 7252 7253 // --------------------------------------------------------------------------- 7254 // Transform initial VPlan: Apply previously taken decisions, in order, to 7255 // bring the VPlan to its final state. 7256 // --------------------------------------------------------------------------- 7257 7258 // Apply Sink-After legal constraints. 
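  // (For example, sink-after entries are recorded for first-order recurrences,
  //  where a user of the recurrence phi has to be moved to after the
  //  instruction producing the recurrence's next value so both can be widened
  //  in the right order; this is only a sketch of one such client.)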
7259 for (auto &Entry : SinkAfter) { 7260 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7261 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7262 Sink->moveAfter(Target); 7263 } 7264 7265 // Interleave memory: for each Interleave Group we marked earlier as relevant 7266 // for this VPlan, replace the Recipes widening its memory instructions with a 7267 // single VPInterleaveRecipe at its insertion point. 7268 for (auto IG : InterleaveGroups) { 7269 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7270 RecipeBuilder.getRecipe(IG->getInsertPos())); 7271 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7272 ->insertBefore(Recipe); 7273 7274 for (unsigned i = 0; i < IG->getFactor(); ++i) 7275 if (Instruction *Member = IG->getMember(i)) { 7276 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7277 } 7278 } 7279 7280 // Finally, if tail is folded by masking, introduce selects between the phi 7281 // and the live-out instruction of each reduction, at the end of the latch. 7282 if (CM.foldTailByMasking()) { 7283 Builder.setInsertPoint(VPBB); 7284 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7285 for (auto &Reduction : Legal->getReductionVars()) { 7286 VPValue *Phi = Plan->getVPValue(Reduction.first); 7287 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7288 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7289 } 7290 } 7291 7292 std::string PlanName; 7293 raw_string_ostream RSO(PlanName); 7294 unsigned VF = Range.Start; 7295 Plan->addVF(VF); 7296 RSO << "Initial VPlan for VF={" << VF; 7297 for (VF *= 2; VF < Range.End; VF *= 2) { 7298 Plan->addVF(VF); 7299 RSO << "," << VF; 7300 } 7301 RSO << "},UF>=1"; 7302 RSO.flush(); 7303 Plan->setName(PlanName); 7304 7305 return Plan; 7306 } 7307 7308 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7309 // Outer loop handling: They may require CFG and instruction level 7310 // transformations before even evaluating whether vectorization is profitable. 7311 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7312 // the vectorization pipeline. 7313 assert(!OrigLoop->empty()); 7314 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7315 7316 // Create new empty VPlan 7317 auto Plan = std::make_unique<VPlan>(); 7318 7319 // Build hierarchical CFG 7320 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7321 HCFGBuilder.buildHierarchicalCFG(); 7322 7323 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7324 Plan->addVF(VF); 7325 7326 if (EnableVPlanPredication) { 7327 VPlanPredicator VPP(*Plan); 7328 VPP.predicate(); 7329 7330 // Avoid running transformation to recipes until masked code generation in 7331 // VPlan-native path is in place. 
7332 return Plan; 7333 } 7334 7335 SmallPtrSet<Instruction *, 1> DeadInstructions; 7336 VPlanTransforms::VPInstructionsToVPRecipes( 7337 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7338 return Plan; 7339 } 7340 7341 Value* LoopVectorizationPlanner::VPCallbackILV:: 7342 getOrCreateVectorValues(Value *V, unsigned Part) { 7343 return ILV.getOrCreateVectorValue(V, Part); 7344 } 7345 7346 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7347 Value *V, const VPIteration &Instance) { 7348 return ILV.getOrCreateScalarValue(V, Instance); 7349 } 7350 7351 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7352 VPSlotTracker &SlotTracker) const { 7353 O << " +\n" 7354 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7355 IG->getInsertPos()->printAsOperand(O, false); 7356 O << ", "; 7357 getAddr()->printAsOperand(O, SlotTracker); 7358 VPValue *Mask = getMask(); 7359 if (Mask) { 7360 O << ", "; 7361 Mask->printAsOperand(O, SlotTracker); 7362 } 7363 O << "\\l\""; 7364 for (unsigned i = 0; i < IG->getFactor(); ++i) 7365 if (Instruction *I = IG->getMember(i)) 7366 O << " +\n" 7367 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7368 } 7369 7370 void VPWidenRecipe::execute(VPTransformState &State) { 7371 for (auto &Instr : make_range(Begin, End)) 7372 State.ILV->widenInstruction(Instr); 7373 } 7374 7375 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7376 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7377 IsIndexLoopInvariant); 7378 } 7379 7380 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7381 assert(!State.Instance && "Int or FP induction being replicated."); 7382 State.ILV->widenIntOrFpInduction(IV, Trunc); 7383 } 7384 7385 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7386 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7387 } 7388 7389 void VPBlendRecipe::execute(VPTransformState &State) { 7390 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7391 // We know that all PHIs in non-header blocks are converted into 7392 // selects, so we don't have to worry about the insertion order and we 7393 // can just use the builder. 7394 // At this point we generate the predication tree. There may be 7395 // duplications since this is a simple recursive scan, but future 7396 // optimizations will clean it up. 7397 7398 unsigned NumIncoming = Phi->getNumIncomingValues(); 7399 7400 assert((User || NumIncoming == 1) && 7401 "Multiple predecessors with predecessors having a full mask"); 7402 // Generate a sequence of selects of the form: 7403 // SELECT(Mask3, In3, 7404 // SELECT(Mask2, In2, 7405 // ( ...))) 7406 InnerLoopVectorizer::VectorParts Entry(State.UF); 7407 for (unsigned In = 0; In < NumIncoming; ++In) { 7408 for (unsigned Part = 0; Part < State.UF; ++Part) { 7409 // We might have single edge PHIs (blocks) - use an identity 7410 // 'select' for the first PHI operand. 7411 Value *In0 = 7412 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7413 if (In == 0) 7414 Entry[Part] = In0; // Initialize with the first incoming value. 7415 else { 7416 // Select between the current value and the previous incoming edge 7417 // based on the incoming mask. 
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
                                      getMask());
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
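  // (Conceptually, once both destinations exist, the predicating block ends in
  //    br i1 %mask.lane, label %pred.<opcode>.if, label %pred.<opcode>.continue
  //  where %mask.lane is the per-lane bit extracted above; the names here are
  //  illustrative.)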
7478 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7479 assert(isa<UnreachableInst>(CurrentTerminator) && 7480 "Expected to replace unreachable terminator with conditional branch."); 7481 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7482 CondBr->setSuccessor(0, nullptr); 7483 ReplaceInstWithInst(CurrentTerminator, CondBr); 7484 } 7485 7486 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7487 assert(State.Instance && "Predicated instruction PHI works per instance."); 7488 Instruction *ScalarPredInst = cast<Instruction>( 7489 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7490 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7491 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7492 assert(PredicatingBB && "Predicated block has no single predecessor."); 7493 7494 // By current pack/unpack logic we need to generate only a single phi node: if 7495 // a vector value for the predicated instruction exists at this point it means 7496 // the instruction has vector users only, and a phi for the vector value is 7497 // needed. In this case the recipe of the predicated instruction is marked to 7498 // also do that packing, thereby "hoisting" the insert-element sequence. 7499 // Otherwise, a phi node for the scalar value is needed. 7500 unsigned Part = State.Instance->Part; 7501 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7502 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7503 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7504 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7505 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7506 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7507 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7508 } else { 7509 Type *PredInstType = PredInst->getType(); 7510 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7511 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7512 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7513 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7514 } 7515 } 7516 7517 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7518 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); 7519 } 7520 7521 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7522 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7523 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7524 // for predication. 7525 static ScalarEpilogueLowering getScalarEpilogueLowering( 7526 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7527 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7528 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7529 LoopVectorizationLegality &LVL) { 7530 bool OptSize = 7531 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7532 PGSOQueryType::IRPass); 7533 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7534 // don't look at hints or options, and don't request a scalar epilogue. 
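  // (Except when vectorization is explicitly forced, e.g. via
  //  '#pragma clang loop vectorize(enable)', which the FK_Enabled check below
  //  accounts for; this note is explanatory only.)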
  if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
    return CM_ScalarEpilogueNotAllowedOptSize;

  bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
                              !PreferPredicateOverEpilog;

  // 2) Next, if disabling predication is requested on the command line, honour
  // this and request a scalar epilogue. Also do this if we don't have a
  // primary induction variable, which is required for predication.
  if (PredicateOptDisabled || !LVL.getPrimaryInduction())
    return CM_ScalarEpilogueAllowed;

  // 3) and 4) Check whether predication is requested on the command line or
  // with a loop hint, or whether the TTI hook indicates that it is profitable;
  // if so, request predication.
  if (PreferPredicateOverEpilog ||
      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                        LVL.getLAI()) &&
       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
7607 Hints.setAlreadyVectorized(); 7608 7609 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7610 return true; 7611 } 7612 7613 bool LoopVectorizePass::processLoop(Loop *L) { 7614 assert((EnableVPlanNativePath || L->empty()) && 7615 "VPlan-native path is not enabled. Only process inner loops."); 7616 7617 #ifndef NDEBUG 7618 const std::string DebugLocStr = getDebugLocString(L); 7619 #endif /* NDEBUG */ 7620 7621 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7622 << L->getHeader()->getParent()->getName() << "\" from " 7623 << DebugLocStr << "\n"); 7624 7625 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7626 7627 LLVM_DEBUG( 7628 dbgs() << "LV: Loop hints:" 7629 << " force=" 7630 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7631 ? "disabled" 7632 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7633 ? "enabled" 7634 : "?")) 7635 << " width=" << Hints.getWidth() 7636 << " unroll=" << Hints.getInterleave() << "\n"); 7637 7638 // Function containing loop 7639 Function *F = L->getHeader()->getParent(); 7640 7641 // Looking at the diagnostic output is the only way to determine if a loop 7642 // was vectorized (other than looking at the IR or machine code), so it 7643 // is important to generate an optimization remark for each loop. Most of 7644 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7645 // generated as OptimizationRemark and OptimizationRemarkMissed are 7646 // less verbose reporting vectorized loops and unvectorized loops that may 7647 // benefit from vectorization, respectively. 7648 7649 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7650 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7651 return false; 7652 } 7653 7654 PredicatedScalarEvolution PSE(*SE, *L); 7655 7656 // Check if it is legal to vectorize the loop. 7657 LoopVectorizationRequirements Requirements(*ORE); 7658 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7659 &Requirements, &Hints, DB, AC); 7660 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7661 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7662 Hints.emitRemarkWithHints(); 7663 return false; 7664 } 7665 7666 // Check the function attributes and profiles to find out if this function 7667 // should be optimized for size. 7668 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7669 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7670 7671 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7672 // here. They may require CFG and instruction level transformations before 7673 // even evaluating whether vectorization is profitable. Since we cannot modify 7674 // the incoming IR, we need to build VPlan upfront in the vectorization 7675 // pipeline. 7676 if (!L->empty()) 7677 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7678 ORE, BFI, PSI, Hints); 7679 7680 assert(L->empty() && "Inner loop expected."); 7681 7682 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7683 // count by optimizing for size, to minimize overheads. 7684 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7685 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7686 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 7687 << "This loop is worth vectorizing only if no scalar " 7688 << "iteration overheads are incurred."); 7689 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7690 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7691 else { 7692 LLVM_DEBUG(dbgs() << "\n"); 7693 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7694 } 7695 } 7696 7697 // Check the function attributes to see if implicit floats are allowed. 7698 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7699 // an integer loop and the vector instructions selected are purely integer 7700 // vector instructions? 7701 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7702 reportVectorizationFailure( 7703 "Can't vectorize when the NoImplicitFloat attribute is used", 7704 "loop not vectorized due to NoImplicitFloat attribute", 7705 "NoImplicitFloat", ORE, L); 7706 Hints.emitRemarkWithHints(); 7707 return false; 7708 } 7709 7710 // Check if the target supports potentially unsafe FP vectorization. 7711 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7712 // for the target we're vectorizing for, to make sure none of the 7713 // additional fp-math flags can help. 7714 if (Hints.isPotentiallyUnsafe() && 7715 TTI->isFPVectorizationPotentiallyUnsafe()) { 7716 reportVectorizationFailure( 7717 "Potentially unsafe FP op prevents vectorization", 7718 "loop not vectorized due to unsafe FP support.", 7719 "UnsafeFP", ORE, L); 7720 Hints.emitRemarkWithHints(); 7721 return false; 7722 } 7723 7724 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7725 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7726 7727 // If an override option has been passed in for interleaved accesses, use it. 7728 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7729 UseInterleaved = EnableInterleavedMemAccesses; 7730 7731 // Analyze interleaved memory accesses. 7732 if (UseInterleaved) { 7733 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7734 } 7735 7736 // Use the cost model. 7737 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7738 F, &Hints, IAI); 7739 CM.collectValuesToIgnore(); 7740 7741 // Use the planner for vectorization. 7742 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); 7743 7744 // Get user vectorization factor. 7745 unsigned UserVF = Hints.getWidth(); 7746 7747 // Plan how to best vectorize, return the best VF and its cost. 7748 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7749 7750 VectorizationFactor VF = VectorizationFactor::Disabled(); 7751 unsigned IC = 1; 7752 unsigned UserIC = Hints.getInterleave(); 7753 7754 if (MaybeVF) { 7755 VF = *MaybeVF; 7756 // Select the interleave count. 7757 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7758 } 7759 7760 // Identify the diagnostic messages that should be produced. 
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7817 ORE->emit([&]() { 7818 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7819 L->getStartLoc(), L->getHeader()) 7820 << VecDiagMsg.second; 7821 }); 7822 ORE->emit([&]() { 7823 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7824 L->getStartLoc(), L->getHeader()) 7825 << IntDiagMsg.second; 7826 }); 7827 return false; 7828 } else if (!VectorizeLoop && InterleaveLoop) { 7829 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7830 ORE->emit([&]() { 7831 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7832 L->getStartLoc(), L->getHeader()) 7833 << VecDiagMsg.second; 7834 }); 7835 } else if (VectorizeLoop && !InterleaveLoop) { 7836 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7837 << ") in " << DebugLocStr << '\n'); 7838 ORE->emit([&]() { 7839 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7840 L->getStartLoc(), L->getHeader()) 7841 << IntDiagMsg.second; 7842 }); 7843 } else if (VectorizeLoop && InterleaveLoop) { 7844 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7845 << ") in " << DebugLocStr << '\n'); 7846 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7847 } 7848 7849 LVP.setBestPlan(VF.Width, IC); 7850 7851 using namespace ore; 7852 bool DisableRuntimeUnroll = false; 7853 MDNode *OrigLoopID = L->getLoopID(); 7854 7855 if (!VectorizeLoop) { 7856 assert(IC > 1 && "interleave count should not be 1 or 0"); 7857 // If we decided that it is not legal to vectorize the loop, then 7858 // interleave it. 7859 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7860 &CM); 7861 LVP.executePlan(Unroller, DT); 7862 7863 ORE->emit([&]() { 7864 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7865 L->getHeader()) 7866 << "interleaved loop (interleaved count: " 7867 << NV("InterleaveCount", IC) << ")"; 7868 }); 7869 } else { 7870 // If we decided that it is *legal* to vectorize the loop, then do it. 7871 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7872 &LVL, &CM); 7873 LVP.executePlan(LB, DT); 7874 ++LoopsVectorized; 7875 7876 // Add metadata to disable runtime unrolling a scalar loop when there are 7877 // no runtime checks about strides and memory. A scalar loop that is 7878 // rarely used is not worth unrolling. 7879 if (!LB.areSafetyChecksAdded()) 7880 DisableRuntimeUnroll = true; 7881 7882 // Report the vectorization decision. 7883 ORE->emit([&]() { 7884 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7885 L->getHeader()) 7886 << "vectorized loop (vectorization width: " 7887 << NV("VectorizationFactor", VF.Width) 7888 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7889 }); 7890 } 7891 7892 Optional<MDNode *> RemainderLoopID = 7893 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7894 LLVMLoopVectorizeFollowupEpilogue}); 7895 if (RemainderLoopID.hasValue()) { 7896 L->setLoopID(RemainderLoopID.getValue()); 7897 } else { 7898 if (DisableRuntimeUnroll) 7899 AddRuntimeUnrollDisableMetaData(L); 7900 7901 // Mark the loop as already vectorized to avoid vectorizing again. 
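    // (setAlreadyVectorized is expected to record this as loop metadata,
    //  e.g. !{!"llvm.loop.isvectorized", i32 1}, which later invocations of
    //  the vectorizer respect.)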
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ?
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 7989 : nullptr; 7990 7991 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 7992 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 7993 [&](Loop &L) -> const LoopAccessInfo & { 7994 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 7995 return LAM.getResult<LoopAccessAnalysis>(L, AR); 7996 }; 7997 const ModuleAnalysisManager &MAM = 7998 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 7999 ProfileSummaryInfo *PSI = 8000 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8001 bool Changed = 8002 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8003 if (!Changed) 8004 return PreservedAnalyses::all(); 8005 PreservedAnalyses PA; 8006 8007 // We currently do not preserve loopinfo/dominator analyses with outer loop 8008 // vectorization. Until this is addressed, mark these analyses as preserved 8009 // only for non-VPlan-native path. 8010 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8011 if (!EnableVPlanNativePath) { 8012 PA.preserve<LoopAnalysis>(); 8013 PA.preserve<DominatorTreeAnalysis>(); 8014 } 8015 PA.preserve<BasicAA>(); 8016 PA.preserve<GlobalsAA>(); 8017 return PA; 8018 } 8019